import queue
import threading
import requests
import re
from lxml import etree
import time
import random
import json
# URLs already crawled
urlList = []
# queue of URLs waiting to be crawled
urlsData = queue.Queue()
# failure count per URL
urlError = {}
# crawler counter
count = 0
# spoofed browser headers
header = {
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Upgrade-Insecure-Requests': '1',
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8',
}
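# Note: since every request sends the same headers, a shared
# requests.Session could also reuse TCP connections across requests.
# A minimal sketch (hypothetical, not part of the original script):
#   session = requests.Session()
#   session.headers.update(header)
#   html = session.get(url)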
class Counter(threading.Thread):
    # @summary: initialize the worker thread.
    # @param lock: lock object.
    # @param threadName: thread name.
    # @param requests: the requests module (or a compatible session).
    # @param url: source URL to crawl.
    def __init__(self, lock, threadName, requests, url):
        print(threadName + ' run..')
        super(Counter, self).__init__(name=threadName)
        self.lock = lock
        self.requests = requests
        self.url = url
    def _data_get(self):
        # start the task
        try:
            # fetch the source page
            html = requests.get(self.url, headers=header)
            rs = etree.HTML(html.content)
            # extract the Baidu Pan share links from the page
            url = re.findall(r'href="(https://pan.baidu.com/s/.*?|http://pan.baidu.com/s/.*?)"', html.content.decode('utf-8'))
            # extract the Baidu Pan password
            password = re.findall(r'密码(:|;|: )(\w{0,4})', html.content.decode('utf-8'))
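            # Example of what the pattern above yields, assuming the page
            # text contains "密码: ab12" with an ASCII colon:
            #   re.findall(r'密码(:|;|: )(\w{0,4})', text)  ->  [(': ', 'ab12')]
            # re.findall returns one tuple per match when the pattern has
            # multiple groups, hence password[0][1] below picks the code itself.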
            name = rs.xpath('//h1/text()')
            # take the first password match, or fall back to an empty string
            try:
                password = password[0][1]
            except IndexError:
                password = ''
            # fetch the Douban movie cover image via a Baidu site search
            try:
                url1 = "http://www.baidu.com/s?"
                html = requests.get(url1, headers=header, params={
                    'wd': "site:movie.douban.com {}".format(name[0])
                })
                select = etree.HTML(html.content)
                a = select.xpath('//h3[@class="t"]/a/@href')
                html = requests.get(a[0], headers=header)
                select = etree.HTML(html.content)
                ase = select.xpath('//img/@src')
                img = ase[0]
            except BaseException as e:
                print(self.url, 'failed to fetch the Douban movie cover')
                img = ''
            # submit the data
            print(name[0])
            rr = requests.post('http://localhost/basic/index.php?r=bian/update', {
                'password': password,
                'url': url[0],
                'img': img,
                'source_url': self.url,
                'name': name[0]
            })
            print(rr.content)
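            # Passing a plain dict as the second positional argument makes
            # requests send it as an application/x-www-form-urlencoded POST
            # body, which is presumably what the local PHP endpoint expects.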
        except BaseException as e:
            # count failures per URL and requeue for up to 3 attempts
            if self.url in urlError:
                urlError[self.url] = urlError[self.url] + 1
            else:
                urlError[self.url] = 1
            if urlError[self.url] < 3:
                urlsData.put(self.url)
            print('failed to parse the Baidu Pan address', self.url, 'failure count', urlError[self.url], e)
            print('tasks remaining', urlsData.qsize())
    def run(self):
        self.lock.acquire()
        try:
            self._data_get()
        finally:
            # always free the lock and the semaphore slot, even on failure,
            # so a crashed worker cannot deadlock the pool
            self.lock.release()
            threadmax.release()
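# Note: run() holds self.lock for the whole of _data_get(), so the worker
# threads effectively execute one at a time; the BoundedSemaphore in
# __main__ only caps how many thread objects exist at once. Locking only
# the shared urlError dict instead would let the HTTP requests overlap.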
if __name__ == '__main__':
    threadmax = threading.BoundedSemaphore(100)
    lock = threading.Lock()
    i = 0
    try:
        # single pass over the listing pages to collect every task URL
        for index1 in range(20):
            index = 1038 - index1
            html = requests.get('http://www.xiexingeini.com/page/{}'.format(index), headers=header)
            html = etree.HTML(html.content)
            # all task links on this listing page
            urls = html.xpath('//header/h2[@class="entry-title"]/a/@href')
            for url in urls:
                urlsData.put(url)
            print('URLs collected', urlsData.qsize())
        print('total tasks:', urlsData.qsize())
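        # queue.Queue is already thread-safe, so the workers can consume
        # from urlsData (and requeue failures) without any extra locking.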
        # consume the queue in a loop
        while True:
            uu = urlsData.get()
            i = i + 1
            threadmax.acquire()
            try:
                Counter(lock, "thread-" + str(i), requests, uu).start()
            except BaseException as e:
                # give back the semaphore slot and requeue the task
                threadmax.release()
                print(e)
                urlsData.put(uu)
                if "can't start new thread" in str(e):
                    print('failed to start thread')
                    time.sleep(180)
                else:
                    print(uu, 'error')
    except BaseException as e:
        print('url error')
        print(e)
# while True:
#     # enqueue data
#     q1.put('a')
#     q1.put('b')
#
#     # show the queue contents
#     print(q1.queue)
#
#     # dequeue: first in, first out
#     print(q1.get())
#     print(q1.queue)
#     print(q1.qsize())
#     print(q1.get())
# when the queue is empty, get() finds no data and blocks
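# A non-blocking variant for reference: Queue.get() accepts a timeout and
# raises queue.Empty when it expires, which would give the consumer loop a
# way to finish instead of hanging forever (a sketch, not in the original):
#   try:
#       uu = urlsData.get(timeout=30)
#   except queue.Empty:
#       break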