Table of Contents
- Scraping with requests only
- Scraping with multithreading
- Scraping with multiprocessing
- Scraping with coroutines
- Scraping with coroutines + multiprocessing
1. Scraping with requests only
Time taken: 139.34340 seconds
Code:
# coding=utf8
import requests
from bs4 import BeautifulSoup
import time


def do_task(domain, pageUrl):
    response = requests.get(pageUrl)
    if response.status_code != 200:
        raise Exception('http error, url:{} code:{}'.format(pageUrl, response.status_code))
    soup = BeautifulSoup(response.content, 'html.parser')
    for h in soup.select('h3>a'):
        url = ''.join([domain, h.get('href')])
        html = requests.get(url)
        print('url:{} title:{}'.format(url, parse_text(html)))


def parse_text(html):
    soup = BeautifulSoup(html.content, 'html.parser')
    return str(soup.select('.shici-title')[0].get_text())


def main():
    domain = 'http://www.shicimingju.com'
    urlTemplate = domain + '/chaxun/zuozhe/9_{0}.html'
    pageNum = 50  # fetch 50 pages of poems for the test
    for num in range(pageNum):
        do_task(domain, urlTemplate.format(num + 1))


if __name__ == '__main__':
    start = time.time()
    main()  # run the crawl
    print('Total time: %.5f seconds' % (time.time() - start))
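An easy improvement to this sequential version (not measured in the timing above): reuse a single requests.Session so HTTP keep-alive skips a new TCP handshake for every request. A minimal sketch, where the extra session parameter is my addition rather than part of the original code:

import requests
from bs4 import BeautifulSoup


def parse_text(html):
    soup = BeautifulSoup(html.content, 'html.parser')
    return str(soup.select('.shici-title')[0].get_text())


def do_task(session, domain, pageUrl):
    # the shared session keeps connections alive across all 50 pages
    response = session.get(pageUrl)
    if response.status_code != 200:
        raise Exception('http error, url:{} code:{}'.format(pageUrl, response.status_code))
    soup = BeautifulSoup(response.content, 'html.parser')
    for h in soup.select('h3>a'):
        url = ''.join([domain, h.get('href')])
        print('url:{} title:{}'.format(url, parse_text(session.get(url))))


def main():
    domain = 'http://www.shicimingju.com'
    urlTemplate = domain + '/chaxun/zuozhe/9_{0}.html'
    with requests.Session() as session:  # one connection pool for the whole run
        for num in range(1, 51):
            do_task(session, domain, urlTemplate.format(num))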
2. Scraping with multithreading
Time taken: 42.83238 seconds
Code:
# coding=utf8
import requests
from bs4 import BeautifulSoup
import time
import threading


def do_task(domain, pageUrl):
    response = requests.get(pageUrl)
    if response.status_code != 200:
        raise Exception('http error, url:{} code:{}'.format(pageUrl, response.status_code))
    soup = BeautifulSoup(response.content, 'html.parser')
    for h in soup.select('h3>a'):
        url = ''.join([domain, h.get('href')])
        html = requests.get(url)
        print('url:{} title:{}'.format(url, parse_text(html)))


def parse_text(html):
    soup = BeautifulSoup(html.content, 'html.parser')
    return str(soup.select('.shici-title')[0].get_text())


def main():
    domain = 'http://www.shicimingju.com'
    urlTemplate = domain + '/chaxun/zuozhe/9_{0}.html'
    pageNum = 50  # fetch 50 pages of poems for the test
    threads = []
    for num in range(1, pageNum + 1):
        # spawn one thread per page, 50 in total
        t = threading.Thread(target=do_task, name='LoopThread' + str(num),
                             args=(domain, urlTemplate.format(num)))
        threads.append(t)
    for t in threads:
        t.start()  # start every thread
    for t in threads:
        t.join()  # wait for all threads to finish


if __name__ == '__main__':
    start = time.time()
    main()  # run the crawl
    print('Total time: %.5f seconds' % (time.time() - start))
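Creating one thread per page is fine for 50 pages but does not scale well. As an alternative, the standard library's concurrent.futures offers a bounded thread pool; the sketch below assumes the do_task defined above, and the pool size of 10 is an arbitrary choice of mine:

from concurrent.futures import ThreadPoolExecutor, as_completed


def main():
    domain = 'http://www.shicimingju.com'
    urlTemplate = domain + '/chaxun/zuozhe/9_{0}.html'
    pageNum = 50
    # a fixed-size pool instead of one thread per page
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(do_task, domain, urlTemplate.format(num))
                   for num in range(1, pageNum + 1)]
        for future in as_completed(futures):
            future.result()  # re-raises any exception from the worker thread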
3. Scraping with multiprocessing
CPU: 8 cores
4 processes, total time: 49.22848 seconds
8 processes, total time: 22.08792 seconds
Code:
from multiprocessing import Pool
import requests
from bs4 import BeautifulSoup
import time


def do_task(domain, pageUrl):
    response = requests.get(pageUrl)
    if response.status_code != 200:
        raise Exception('http error, url:{} code:{}'.format(pageUrl, response.status_code))
    soup = BeautifulSoup(response.content, 'html.parser')
    for h in soup.select('h3>a'):
        url = ''.join([domain, h.get('href')])
        html = requests.get(url)
        print('url:{} title:{}'.format(url, parse_text(html)))


def parse_text(html):
    soup = BeautifulSoup(html.content, 'html.parser')
    return str(soup.select('.shici-title')[0].get_text())


def main():
    p = Pool(8)  # my CPU has 8 cores, so use 8 processes
    domain = 'http://www.shicimingju.com'
    urlTemplate = domain + '/chaxun/zuozhe/9_{0}.html'
    pageNum = 50  # fetch 50 pages of poems for the test
    for num in range(pageNum):
        p.apply_async(do_task, args=(domain, urlTemplate.format(num + 1)))
    p.close()
    p.join()  # wait for all child processes before running anything else


if __name__ == '__main__':
    start = time.time()
    main()  # run the crawl
    print('Total time: %.5f seconds' % (time.time() - start))
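One caveat with apply_async: if do_task raises inside a child process, the exception is silently swallowed unless the returned AsyncResult is checked. A sketch of the same loop with the results collected (my addition, not part of the original measurement):

from multiprocessing import Pool


def main():
    p = Pool(8)
    domain = 'http://www.shicimingju.com'
    urlTemplate = domain + '/chaxun/zuozhe/9_{0}.html'
    results = [p.apply_async(do_task, args=(domain, urlTemplate.format(num)))
               for num in range(1, 51)]
    p.close()
    for r in results:
        r.get()  # blocks until done and re-raises any child-process exception
    p.join()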
4. Scraping with coroutines
Total time: 35.39297 seconds
Code:
from bs4 import BeautifulSoup
import time
import aiohttp
import asyncio


async def do_task(domain, pageUrl):
    async with aiohttp.ClientSession() as session:
        async with session.request('GET', pageUrl) as resp:
            if resp.status != 200:
                raise Exception('http error, url:{} code:{}'.format(pageUrl, resp.status))
            html = await resp.read()  # read() returns the raw bytes directly
        soup = BeautifulSoup(html, 'html.parser')
        for h in soup.select('h3>a'):
            url = ''.join([domain, h.get('href')])
            # reuse the same session instead of opening a new one per request
            async with session.request('GET', url) as resp:
                if resp.status != 200:
                    raise Exception('http error, url:{} code:{}'.format(url, resp.status))
                html = await resp.read()
                print('url:{} title:{}'.format(url, parse_text(html)))


def parse_text(html):
    soup = BeautifulSoup(html, 'html.parser')
    return str(soup.select('.shici-title')[0].get_text())


def main():
    domain = 'http://www.shicimingju.com'
    urlTemplate = domain + '/chaxun/zuozhe/9_{0}.html'
    pageNum = 50  # fetch 50 pages of poems for the test
    loop = asyncio.get_event_loop()  # get the event loop
    tasks = []
    for num in range(pageNum):
        tasks.append(do_task(domain, urlTemplate.format(num + 1)))
    loop.run_until_complete(asyncio.gather(*tasks))  # run all coroutines concurrently
    loop.close()


if __name__ == '__main__':
    start = time.time()
    main()  # run the crawl
    print('Total time: %.5f seconds' % (time.time() - start))
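The version above launches all 50 page coroutines at once; on a bigger crawl that could flood the target site. A common remedy is to cap concurrency with asyncio.Semaphore. A minimal sketch, assuming the async do_task above (the limit of 10 is my own arbitrary choice):

import asyncio


async def crawl_all(domain, urlTemplate, pageNum):
    sem = asyncio.Semaphore(10)  # allow at most 10 pages in flight at once

    async def bounded_task(pageUrl):
        async with sem:  # wait for a free slot before fetching
            await do_task(domain, pageUrl)

    await asyncio.gather(*(bounded_task(urlTemplate.format(num))
                           for num in range(1, pageNum + 1)))

main() would then call loop.run_until_complete(crawl_all(domain, urlTemplate, 50)) instead of building the task list by hand.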
5. Scraping with coroutines + multiprocessing
CPU: 8 cores
4 processes, total time: 27.81880 seconds
8 processes, total time: 26.87100 seconds
Code:
from multiprocessing import Pool
from bs4 import BeautifulSoup
import time
import aiohttp
import asyncio

html_contents = {}


async def do_task(domain, pageUrl):
    async with aiohttp.ClientSession() as session:
        async with session.request('GET', pageUrl) as resp:
            if resp.status != 200:
                raise Exception('http error, url:{} code:{}'.format(pageUrl, resp.status))
            html = await resp.read()  # read() returns the raw bytes directly
        soup = BeautifulSoup(html, 'html.parser')
        for h in soup.select('h3>a'):
            url = ''.join([domain, h.get('href')])
            # reuse the same session instead of opening a new one per request
            async with session.request('GET', url) as resp:
                if resp.status != 200:
                    raise Exception('http error, url:{} code:{}'.format(url, resp.status))
                html = await resp.read()
                html_contents[url] = html


def parse_text(url, html):
    soup = BeautifulSoup(html, 'html.parser')
    title = str(soup.select('.shici-title')[0].get_text())
    print(url, title, flush=True)


def main():
    domain = 'http://www.shicimingju.com'
    urlTemplate = domain + '/chaxun/zuozhe/9_{0}.html'
    pageNum = 50  # fetch 50 pages of poems for the test
    loop = asyncio.get_event_loop()  # get the event loop
    # Fetch the pages with coroutines. Downside: the results have to be stored
    # in memory first. I wanted to stream them with yield, but did not know how
    # to do that inside a coroutine -- leaving it as a TODO to optimize later.
    tasks = []
    for num in range(pageNum):
        tasks.append(do_task(domain, urlTemplate.format(num + 1)))
    loop.run_until_complete(asyncio.gather(*tasks))  # run all coroutines concurrently
    loop.close()
    # parse with multiple processes
    p = Pool(8)  # my CPU has 8 cores, so use 8 processes
    for url, html in html_contents.items():
        p.apply_async(parse_text, args=(url, html))
    p.close()
    p.join()  # wait for all child processes before running anything else


if __name__ == '__main__':
    start = time.time()
    main()  # run the crawl
    print('Total time: %.5f seconds' % (time.time() - start))
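On the TODO in the comment above (streaming results with yield inside a coroutine): since Python 3.6 this is possible with an async generator, which would remove the html_contents dict entirely. A rough sketch under that assumption; asyncio.as_completed keeps the downloads concurrent while still letting us yield pages one by one, in completion order:

import asyncio
import aiohttp


async def fetch_pages(urls):
    async with aiohttp.ClientSession() as session:
        async def fetch(url):
            async with session.get(url) as resp:
                return url, await resp.read()

        # yields each (url, html) pair as soon as its download finishes
        for fut in asyncio.as_completed([fetch(u) for u in urls]):
            yield await fut


async def consume(urls):
    async for url, html in fetch_pages(urls):
        print(url, len(html))  # e.g. hand the page off for parsing here

Each page could then be dispatched to the process pool as it arrives instead of only after the whole crawl completes.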
6. Summary
Plain, unoptimized sequential requests is the slowest; no surprise there.
For an I/O-bound job like web scraping:
Multiprocessing (8 processes) looks the fastest, but it buys that speed with heavy use of machine resources (CPU, memory, and so on). Multithreading comes second and uses far fewer resources, although in Python the GIL restricts threads to a single core. Coroutines run in just one thread, and taken on their own terms they give the best performance of the three.
As a final experiment, I fetched the pages with coroutines and then parsed them with multiple processes. That speed is also decent: coroutines handle the I/O-bound fetching, and since coroutines are no help for CPU-bound work, multiprocessing has to take over the parsing.
That said, parsing accounts for such a small share of the total time here that the gain from this split is not particularly noticeable.
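As a closing illustration of that division of labour, the two halves do not have to run as separate phases: asyncio can hand CPU-bound parsing to a process pool through run_in_executor while the downloads continue. A sketch, assuming the two-argument parse_text from section 5:

import asyncio
import aiohttp
from concurrent.futures import ProcessPoolExecutor


async def fetch_and_parse(loop, pool, session, url):
    # I/O-bound download stays on the event loop
    async with session.get(url) as resp:
        html = await resp.read()
    # CPU-bound parsing runs in a worker process without blocking the loop
    await loop.run_in_executor(pool, parse_text, url, html)


async def run(urls):
    loop = asyncio.get_event_loop()
    with ProcessPoolExecutor(max_workers=8) as pool:
        async with aiohttp.ClientSession() as session:
            await asyncio.gather(*(fetch_and_parse(loop, pool, session, u)
                                   for u in urls))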