用一段代码测试线程、进程、协程的抓取速度:
# -*- coding: utf-8 -*-
import time
import asyncio
import aiohttp
import requests
import threading
import multiprocessing
from multiprocessing import Process
from multiprocessing.dummy import Pool as ThreadPool
# Benchmark switches, one per fetching strategy (0 = disabled, 1 = enabled).
# NOTE(review): "MUTL_PROCESSES_COROUTINE" looks like a misspelling, but the
# key is looked up verbatim by the driver code, so it is kept unchanged.
OPTION = dict.fromkeys(
    (
        "COROUTINE",
        "SINGLE_THREAD",
        "MULTI_THREADS",
        "MULTI_THREADS_COROUTINE",
        "MULTI_PROCESSES",
        "MUTL_PROCESSES_COROUTINE",
    ),
    0,
)

# Module-level list of target URLs that the strategies iterate over.
urls = list()
def getsource(url, timeout=10):
    """Fetch *url* with a blocking ``requests`` GET and discard the result.

    Args:
        url: Address to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole benchmark.

    Returns:
        None. Only the time spent downloading matters to the callers.

    Raises:
        requests.RequestException: On connection errors or when *timeout*
            expires.
    """
    # The context manager releases the underlying connection as soon as the
    # (already downloaded) response is no longer needed.
    with requests.get(url, timeout=timeout):
        pass
async def agetsource(url):
    """Coroutine counterpart of the blocking fetch.

    Issues a one-off aiohttp GET for *url*, awaits the whole body as text,
    throws it away and returns None.
    """
    async with aiohttp.request("GET", url) as reply:
        await reply.text()
def singleThread():
    """Download every entry of ``urls`` one after another in the calling thread."""
    for target in urls:
        getsource(target)
def multithreads(pool_size=4):
    """Download every entry of ``urls`` using a pool of worker threads.

    Args:
        pool_size: Number of worker threads in the pool (defaults to the
            previously hard-coded 4).

    Raises:
        Exception: Whatever ``getsource`` raised in a worker; ``map``
            re-raises it in the calling thread.
    """
    pool = ThreadPool(pool_size)
    try:
        pool.map(getsource, urls)
    finally:
        # Always shut the pool down, even when a download failed, so the
        # worker threads do not outlive this call.
        pool.close()
        pool.join()
def multiprocesses(pool_size=4):
    """Download every entry of ``urls`` using a pool of worker processes.

    Each URL is submitted as its own task so the pool hands them out one by
    one to whichever process is free.

    Args:
        pool_size: Number of worker processes (defaults to the previously
            hard-coded 4).

    Raises:
        Exception: The first error raised by ``getsource`` in a worker.
            Previously the async results were discarded, so failed downloads
            went unnoticed and the benchmark reported a time anyway.
    """
    pool = multiprocessing.Pool(processes=pool_size)
    try:
        pending = [pool.apply_async(getsource, (url,)) for url in urls]
    finally:
        # close() stops new submissions; join() waits for the queued tasks.
        pool.close()
        pool.join()
    # Every task has finished by now; get() only re-raises worker errors.
    for result in pending:
        result.get()
async def amain(index, pool_size):
    """Concurrently fetch this worker's share of ``urls`` with coroutines.

    ``urls`` is split into *pool_size* contiguous slices whose lengths differ
    by at most one; the worker numbered *index* downloads slice *index*.

    Args:
        index: Zero-based number of this worker.
        pool_size: Total number of workers sharing ``urls``.

    Raises:
        BaseException: The first error raised by any download, re-raised
            only after every download in the slice has finished.
    """
    # Spread the remainder over the first `extra` workers. Plain integer
    # division silently dropped the trailing URLs (e.g. 50 URLs over 4
    # workers fetched only 48).
    base, extra = divmod(len(urls), pool_size)
    start_index = index * base + min(index, extra)
    end_index = start_index + base + (1 if index < extra else 0)

    # gather() resumes as soon as the last download completes; polling the
    # task list with a 2 s sleep rounded every measurement up to a multiple
    # of two seconds.
    outcomes = await asyncio.gather(
        *(agetsource(url) for url in urls[start_index:end_index]),
        return_exceptions=True,
    )
    for outcome in outcomes:
        if isinstance(outcome, BaseException):
            raise outcome
def main(index, pool_size):
    """Run ``amain`` to completion on a fresh event loop.

    Intended as a synchronous entry point for the main thread, a worker
    thread or a worker process.

    Args:
        index: Zero-based number of this worker, forwarded to ``amain``.
        pool_size: Total number of workers, forwarded to ``amain``.
    """
    # asyncio.run() creates a new loop, runs the coroutine and always closes
    # the loop afterwards; the hand-made loop used before was never closed.
    asyncio.run(amain(index, pool_size))
def mutithreads_coroutine():
    """Run four threads, each driving its own event loop via ``main``.

    Thread *n* is started with ``main(n, 4)``. All threads are started
    before any of them is joined.
    """
    worker_count = 4
    workers = [
        threading.Thread(target=main, args=(slot, worker_count))
        for slot in range(worker_count)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
def _process_worker(index, pool_size, url_list):
    """Child-process entry point: install *url_list* as ``urls``, then run ``main``."""
    # With the "spawn" start method the child re-imports this module, so its
    # ``urls`` starts out empty; refill it from the list sent by the parent.
    urls[:] = url_list
    main(index, pool_size)


def multiprocesses_coroutine():
    """Run four processes, each driving its own event loop via ``main``.

    Process *n* handles the n-th share of ``urls``. The URL list is passed
    to the children explicitly, because it is only populated under the
    ``__main__`` guard and is therefore not inherited by spawned processes
    (they would otherwise finish instantly without downloading anything).
    """
    worker_count = 4
    snapshot = list(urls)
    processes = [
        Process(target=_process_worker, args=(slot, worker_count, snapshot))
        for slot in range(worker_count)
    ]
    # Start every process before joining any, so they run in parallel.
    for process in processes:
        process.start()
    for process in processes:
        process.join()
if __name__ == "__main__":
    # Benchmark driver: runs each strategy in OPTION order, one at a time,
    # against the same 50 URLs and prints how long it took.

    # Strategy name -> zero-argument callable that runs it.
    runners = {
        "SINGLE_THREAD": singleThread,              # single thread
        "MULTI_THREADS": multithreads,              # multiple threads
        "MULTI_PROCESSES": multiprocesses,          # multiple processes
        "COROUTINE": lambda: main(0, 1),            # single thread + coroutines
        "MULTI_THREADS_COROUTINE": mutithreads_coroutine,      # threads + coroutines
        "MUTL_PROCESSES_COROUTINE": multiprocesses_coroutine,  # processes + coroutines
    }
    # Multiplier applied to the reported time (1 = report raw seconds).
    factor = 1

    for option in OPTION:
        # Keep the flag in OPTION in sync with the strategy being measured.
        OPTION[option] = 1

        # Build the workload before starting the clock so that only the
        # downloads themselves are timed.
        urls.clear()
        for _ in range(50):
            urls.append('http://www.baidu.com/')

        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted.
        start_time = time.perf_counter()
        runners[option]()
        end_time = time.perf_counter()

        print(f"Time consuming for option <{option}> = {factor * (end_time - start_time)}")
        OPTION[option] = 0
我的15款MacBook Pro跑出来的结果,办公室的网一般:
Time consuming for option <COROUTINE> = 8.015891075134277
Time consuming for option <SINGLE_THREAD> = 35.00409913063049
Time consuming for option <MULTI_THREADS> = 10.310127973556519
Time consuming for option <MULTI_THREADS_COROUTINE> = 8.017142057418823
Time consuming for option <MULTI_PROCESSES> = 9.180757999420166
Time consuming for option <MUTL_PROCESSES_COROUTINE> = 8.016705989837646