# 主要使用技术:
# 1. selenium + chrome 无头浏览器
# 2. pyquery 解析网页
# 3. 多进程
#coding=utf-8
import os
from hashlib import md5
from multiprocessing import Pool

import requests
from pyquery import PyQuery as pq
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def _build_chrome_options():
    """Return the Chrome options object used when starting the browser."""
    opts = Options()
    # Command-line switches passed to Chrome: no visible window, GPU disabled.
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    return opts


chrome_options = _build_chrome_options()
def open_url(url):
    """Load ``url`` in Chrome and return the page source.

    Passed to PyQuery as its ``opener`` callback, so the HTML that gets
    parsed is the one the browser holds once the pagination link is present.

    Args:
        url: Address of the page to load.

    Returns:
        The value of ``driver.page_source`` after the wait succeeds.

    Raises:
        selenium.common.exceptions.TimeoutException: If the pagination link
            does not appear within 10 seconds.
    """
    # ``options=`` is the supported keyword; current Selenium releases no
    # longer accept the older ``chrome_options=`` spelling.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # Wait for the link that directly follows the "dots" placeholder in
        # the pagination bar before reading the page source.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "span.page-numbers.dots +a.page-numbers")
            )
        )
        return driver.page_source
    finally:
        # quit() ends the browser session and its driver process; running it
        # in ``finally`` prevents leaked browsers when the wait times out.
        driver.quit()
def download_image(url, timeout=10):
    """Download one image and pass its bytes to ``save_image``.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        None. Request failures are reported on stdout instead of being
        raised.
    """
    print("正在下载")
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("%s下载失败" % url)
        return None
    if response.status_code == 200:
        save_image(response.content)
    else:
        # A non-200 answer is also a failed download; say so instead of
        # dropping it silently.
        print("%s下载失败" % url)
    return None
def save_image(content):
    """Write image bytes to ``<cwd>/picture/<md5-of-content>.jpg``.

    The file name is the MD5 hex digest of ``content``, so identical bytes
    always map to the same path and are written only once.

    Args:
        content: Raw image bytes to store.
    """
    print("正在保存")
    target_dir = os.path.join(os.getcwd(), 'picture')
    # Create the output directory on first use so open() does not fail when
    # it is missing.
    os.makedirs(target_dir, exist_ok=True)
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(target_dir, file_name)
    try:
        # 'x' creates the file exclusively: if another worker already wrote
        # the same image, the open fails instead of overwriting it.
        with open(file_path, 'xb') as fp:
            fp.write(content)
    except FileExistsError:
        # Same content was saved earlier; nothing to do.
        pass
def main(pagenum):
    """Parse one comment page and download every image listed on it.

    Args:
        pagenum: Page number appended to the comment-page URL.
    """
    print("正在解析第%s页数据" % str(pagenum))
    url = 'https://www.mzitu.com/jiepai/comment-page-' + str(pagenum)
    res = pq(url=url, opener=open_url)
    for img in res('p>img').items():
        image_url = img.attr('data-original')
        if not image_url:
            # No ``data-original`` attribute on this <img>: there is nothing
            # to download, so skip it rather than requesting ``None``.
            continue
        print(image_url)
        download_image(image_url)
if __name__ == '__main__':
    # Load the first page only to read the total page count from the link
    # that follows the "dots" placeholder in the pagination bar.
    pagenum = 1
    url = 'https://www.mzitu.com/jiepai/comment-page-' + str(pagenum)
    res = pq(url=url, opener=open_url)
    page_count = res('span.page-numbers.dots +a.page-numbers').text()
    print('page_count:%s' % page_count)
    try:
        total_pages = int(page_count)
    except (TypeError, ValueError):
        # Stop with a readable message instead of a bare traceback when the
        # pagination text is not a number.
        raise SystemExit('cannot parse page count from %r' % (page_count,))
    # Five worker processes, each handling whole pages; the ``with`` block
    # releases the pool once map() has returned.
    with Pool(5) as pool:
        pool.map(main, range(1, total_pages + 1))