初次尝试写一个小爬虫,用来抓取网站上的图片。
#!/usr/bin/env python3
# -*-coding:UTF-8-*-
import os
import time
import os.path
import urllib.parse
import urllib.request
from io import StringIO
from lxml import etree
class DownloadMM(object):
    """Download the images linked from the jandan.net ``ooxx`` board.

    The crawler starts at the board's landing page, saves every image linked
    from the comment list into a local directory, and then follows the
    "previous comment page" link until either ``nPage`` pages have been
    processed or no further page link can be found.
    """

    def __init__(self, dir='./ooxx', nPage=10, delay=5):
        """Set up the crawler and make sure the output directory exists.

        Args:
            dir: Directory the images are written to. The name shadows the
                builtin, but it is kept so existing keyword callers still work.
            nPage: Maximum number of listing pages processed by ``run``.
            delay: Seconds to pause after each image request.
        """
        self.url = 'https://jandan.net/ooxx'
        # 0 selects the landing page; -1 is the sentinel that stops ``run``.
        self._page = 0
        # Default request headers: a desktop Chrome User-Agent string.
        self.headers = {'User-Agent': r'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, '
                                      r'like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
        self.nPage = nPage
        self.dir = dir
        self.delay = delay
        # makedirs also creates missing parents and tolerates an existing dir.
        os.makedirs(self.dir, exist_ok=True)
        self._htmlParser = etree.HTMLParser(remove_comments=True, compact=True, encoding='UTF-8')

    def __str__(self):
        """Return a short description containing the crawled URL."""
        return '下载图片的地址:' + self.url

    __repr__ = __str__

    def _downloadOnePage(self, page):
        """Fetch one listing page and return its HTML as text.

        Args:
            page: Page number to fetch; any value <= 0 selects the landing
                page (``self.url``).

        Returns:
            The response body decoded as UTF-8, or ``''`` when the request or
            the decoding fails.
        """
        # Paged URLs look like //jandan.net/ooxx/page-300#comments
        if page <= 0:
            _url = self.url
        else:
            _url = self.url + '/page-' + str(page) + '#comments'
        try:
            _request = urllib.request.Request(_url, headers=self.headers)
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(_request) as _response:
                return _response.read().decode('UTF-8')
        except Exception:
            # Best-effort fetch: report and return an empty page. Catching
            # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
            print('下载页面出错了,地址为:', _url)
            return ''

    def _addHeader(self, key, value):
        """Add or overwrite one request header."""
        self.headers[key] = value

    @staticmethod
    def _parsePageNumber(href):
        """Extract the page number from a ``.../page-N#comments`` link.

        Returns:
            The integer after the last ``-`` of the link (fragment removed),
            or -1 when the link does not end in a number.
        """
        path = href.split('#', 1)[0]
        _, dash, tail = path.rpartition('-')
        if not dash:
            return -1
        try:
            return int(tail)
        except ValueError:
            return -1

    def _normalizeImgUrl(self, imgUrl):
        """Turn an image href into an absolute URL.

        Protocol-relative links (``//host/path``) get the ``http:`` scheme;
        anything else is resolved against ``self.url``, which leaves links
        that are already absolute unchanged.
        """
        if imgUrl.startswith('//'):
            return 'http:' + imgUrl
        return urllib.parse.urljoin(self.url, imgUrl)

    def _resolvePage(self, html):
        """Parse a listing page: record the next page and collect image links.

        Side effect: ``self._page`` is set to the page number found in the
        "previous comment page" link, or to -1 when there is none.

        Args:
            html: Page source as returned by ``_downloadOnePage``.

        Returns:
            The list of image hrefs found in the comment list (possibly empty).
        """
        if not html or not html.strip():
            # A failed download yields ''; the parser cannot handle an empty
            # document and there is no next-page link to follow, so stop.
            self._page = -1
            return []
        root = etree.parse(StringIO(html), self._htmlParser)
        # Link to the next page to crawl.
        nextPageLit = root.xpath('//div[@class="comments"][1]//a[@class="previous-comment-page"]/@href')
        if nextPageLit:
            self._page = self._parsePageNumber(nextPageLit[0])
        else:
            self._page = -1  # -1 makes ``run`` stop
        # Image links on this page.
        onePageImageList = root.xpath('//ol[@class="commentlist"]/li//div[@class="text"]/p/a[1]/@href')
        print(onePageImageList)
        return onePageImageList

    def _downloadImg(self, imgList):
        """Download every image in ``imgList`` into ``self.dir``.

        A failure on one image is reported and skipped so the remaining
        images are still fetched. The method pauses ``self.delay`` seconds
        after each request.

        Args:
            imgList: Iterable of image hrefs as returned by ``_resolvePage``.
        """
        targetDir = os.path.abspath(self.dir)
        for imgUrl in imgList:
            imgUrl = self._normalizeImgUrl(imgUrl)
            # Take the name from the URL path only, so a query string or
            # fragment never ends up in the file name.
            name = urllib.parse.urlsplit(imgUrl).path.rsplit('/', 1)[-1]
            if not name:
                print('无法确定图片文件名,跳过:', imgUrl)
                continue
            fileName = os.path.join(targetDir, name)
            try:
                # Send the same headers as the page request.
                request = urllib.request.Request(imgUrl, headers=self.headers)
                with urllib.request.urlopen(request) as response:
                    imgBytes = response.read()
                with open(fileName, 'wb') as f:
                    f.write(imgBytes)
            except Exception:
                print('下载图片出错了,地址为:', imgUrl)
            # Throttle after every request, successful or not.
            time.sleep(self.delay)

    def _process(self):
        """Handle one listing page: download it, parse it, save its images."""
        html = self._downloadOnePage(self._page)
        onePageImgList = self._resolvePage(html=html)
        self._downloadImg(onePageImgList)

    def run(self):
        """Crawl up to ``self.nPage`` pages, stopping early when no next page exists."""
        i = 1
        while self._page != -1 and i <= self.nPage:
            print('第', i, '页')
            self._process()
            i = i + 1
if __name__ == '__main__':
    # Script entry point: crawl only the first listing page.
    downloader = DownloadMM(nPage=1)
    downloader.run()