前一篇文章介绍了下载全站图片,有同学问,能不能只下载某一个关键词的。
这个要灵活一点,不用框架,直接用urllib。
#coding=utf-8
import multiprocessing
import os
import urllib
import urllib.request
from multiprocessing.pool import Pool

import requests
from bs4 import BeautifulSoup
def getAllUrl(url, m, n):
    """Collect gallery page URLs from paginated tag-listing pages.

    Fetches ``url + str(page)`` for every page in [m, n], parses each with
    BeautifulSoup, and gathers the hrefs of ``<a target="_blank">`` links
    inside the ``div#maincontent`` container. Links whose href is 40+
    characters are skipped (they point elsewhere on the site).

    :param url: base listing URL; the page number is appended directly.
    :param m: first page number, inclusive (the site starts at 2).
    :param n: last page number, inclusive.
    :return: de-duplicated list of page URLs (order not preserved, because
        of the ``set`` round-trip).
    """
    urllist = []
    for page in range(m, n + 1):
        print("get page" + str(page))
        # urllib.urlopen was removed in Python 3; urllib.request.urlopen
        # is the equivalent.
        html_doc = urllib.request.urlopen(url + str(page)).read().decode('utf-8')
        soup = BeautifulSoup(html_doc, "lxml")
        content = soup.find("div", id='maincontent')
        if content is None:
            # Page layout changed or the page is empty — skip instead of
            # crashing on None.find_all.
            continue
        for link in content.find_all("a", target="_blank"):
            href = link.get("href")
            # Guard against anchors with no href (len(None) raises).
            if href and len(href) < 40:
                urllist.append(href)
    return list(set(urllist))
def downlaodimg(urls):
    """Download every image found inside <p> tags of each page in *urls*.

    Each image is saved in the current working directory under the name
    ``<alt-text>1.jpg``. Failures on individual images (missing alt/src,
    network error, bad URL) are skipped so one bad image does not abort
    the whole run.

    NOTE(review): the function name is a misspelling of "download"; it is
    kept because the call site at the bottom of the file uses it.

    :param urls: iterable of page URLs, e.g. the result of getAllUrl().
    """
    for page_url in urls:
        # urllib.urlopen/urlretrieve were removed in Python 3; the
        # urllib.request equivalents are used instead.
        html_doc = urllib.request.urlopen(page_url).read().decode('utf-8')
        soup = BeautifulSoup(html_doc, "lxml")
        for paragraph in soup.find_all('p'):
            for img in paragraph.find_all('img'):
                try:
                    img_src = img.get("src")
                    # alt may be None; the TypeError from None + "1" is
                    # caught below and the image skipped.
                    pic_name = (img.get("alt")) + "1" + '.jpg'
                    urllib.request.urlretrieve(img_src, pic_name)
                    print(img.get("alt"))
                except (TypeError, ValueError, OSError):
                    # TypeError: missing alt/src; ValueError/OSError:
                    # malformed URL or network failure. Best-effort: skip.
                    continue
# Specify the keyword, the start page, and the end page here.
# NOTE(review): the tag keyword, page range 2-4, and host are hard-coded;
# adjust the URL and page numbers to scrape a different keyword.
allpage = getAllUrl("http://www.mm29.com/tag/甜美/",2,4)
print(allpage)
print("get page" +str(len(allpage)))
downlaodimg(allpage)