Recent summary
What has been driving me to learn scrapy lately is that I may look for Python-related work next. I started with flask: since my background is Java web backends, many of the concepts carry over, so the web side was the easier entry point. I followed a flask tutorial on Bilibili and completed the features (project link). That project gets its data by calling an HTTP API, with Douban as the data source. After finishing it, I wanted to learn about crawlers and try scraping Douban's book data myself. For scrapy I mainly read the official documentation and some blog demos found via Google. As far as I can tell, the core parts of scrapy are the spider and the pipeline: the spider parses the crawled pages, and the pipeline handles things like persisting and filtering the data. What follows is the main content, written as a summary of learning scrapy.
Crawling Douban books with scrapy
Crawl logic
The detail page of a Douban book, for example 《白夜行》, looks like this:
The book-info block contains the main information about a book: author, publisher, and so on. The most important field is the ISBN; every book has a unique ISBN, and the crawler later uses it for de-duplication.
When crawling such a page, first parse the book info, then parse the links in the "readers who like this book also like" section below it; that way the crawl can spread across all of Douban Books.
Implementation
item:
import scrapy


class DouBanBookItem(scrapy.Item):
    author = scrapy.Field()     # response.css('#info > a:nth-child(2)::attr(href)').extract_first()
    binding = scrapy.Field()    # response.xpath(u'//span[./text()="装帧:"]/following::text()[1]').extract_first()
    publisher = scrapy.Field()  # response.xpath(u'//span[contains(./text(), "出版社:")]/following::text()[1]').extract_first()
    price = scrapy.Field()      # response.xpath('//*[@id="info"]/span[8]/following::text()[1]').extract_first()
    pages = scrapy.Field()      # response.xpath('//*[@id="info"]/span[7]/following::text()[1]').extract_first()
    pubdate = scrapy.Field()    # response.xpath('//*[@id="info"]/span[6]/following::text()[1]').extract_first()
    isbn = scrapy.Field()       # response.xpath('//*[@id="info"]/span[11]/following::text()[1]').extract_first()
    summary = scrapy.Field()    # response.css('#link-report > div:nth-child(1) > div > p::text').extract()
    image = scrapy.Field()      # response.css('#mainpic > a > img::attr(src)').extract_first()
    title = scrapy.Field()      # response.css('#wrapper > h1 > span::text').extract_first()
The item simply defines the book-related fields; define whichever fields you actually want to collect.
spider:
# -*- coding: utf-8 -*-
import scrapy

from lear_scrapy.items import DouBanBookItem


class DoubanbookSpider(scrapy.Spider):
    name = 'doubanbook'
    allowed_domains = ['book.douban.com']
    start_urls = ['https://book.douban.com/subject/26944962/']

    def parse(self, response):
        item = DouBanBookItem()
        item['author'] = fix_author(response)
        item['binding'] = fix_field(response.xpath(
            u'//span[./text()="装帧:"]/following::text()[1]').extract_first())
        item['publisher'] = fix_field(response.xpath(
            u'//span[contains(./text(), "出版社:")]/following::text()[1]').extract_first())
        item['price'] = fix_field(response.xpath(
            u'//span[contains(./text(), "定价:")]/following::text()[1]').extract_first())
        item['pages'] = fix_field(response.xpath(
            u'//span[contains(./text(), "页数:")]/following::text()[1]').extract_first())
        item['pubdate'] = fix_field(response.xpath(
            u'//span[contains(./text(), "出版年:")]/following::text()[1]').extract_first())
        item['isbn'] = fix_field(response.xpath(
            u'//span[contains(./text(), "ISBN:")]/following::text()[1]').extract_first())
        item['summary'] = fix_summary(response)
        item['image'] = fix_field(response.css('#mainpic > a > img::attr(src)').extract_first())
        item['title'] = fix_field(response.css('#wrapper > h1 > span::text').extract_first())
        yield item

        # Follow the "readers also liked" recommendations so the crawl keeps spreading
        likes = response.css('#db-rec-section > div > dl')
        for like in likes:
            like_url = like.css('dt > a::attr(href)').extract_first()
            if like_url:
                yield scrapy.Request(url=like_url, callback=self.parse)
def fix_field(field):
    return field.strip() if field else ''


def fix_author(response):
    # The author markup differs between pages
    author = response.css('#info > a:nth-child(2)::text').extract_first()
    if not author:
        author = response.css('#info > span > a::text').extract_first()
    # Some books, e.g. 一千零一夜 (One Thousand and One Nights), have no author
    return author.replace('\n ', '').strip() if author else '无'


def fix_summary(response):
    summary_list = response.css('#link-report > div:nth-child(1) > div > p::text').extract()
    summary = ''
    for s in summary_list:
        summary += s
    return summary
The key part of the code above is the parsing in parse(): locating the data with CSS and XPath selectors took me quite a while to get right.
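These selectors are easiest to work out interactively with scrapy's built-in shell rather than re-running the whole spider each time. A rough session against the start URL looks like this (commands only; the output depends on the live page):

# Run from the project directory:
#   scrapy shell 'https://book.douban.com/subject/26944962/'
# Inside the shell, `response` is already bound to the downloaded page,
# so each selector from parse() can be tried one at a time:
response.css('#wrapper > h1 > span::text').extract_first()                        # title
response.xpath(u'//span[./text()="装帧:"]/following::text()[1]').extract_first()   # binding
view(response)  # opens the downloaded HTML in a local browser to inspect the markup

Once the selectors behave, the spider itself is run with scrapy crawl doubanbook.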
pipeline:
There are three pipelines; in priority order they do the following (the ITEM_PIPELINES registration is sketched right after this list):
- DuplicatePipeline: looks up the book's ISBN in Redis. If it is found, the book has already been crawled and the item is dropped; if not, the ISBN is saved to Redis and processing continues.
- DoubanBookPipeline: writes the book into MySQL.
- DoubanBookImagePipeline: downloads the cover image by subclassing scrapy's built-in ImagesPipeline (this requires IMAGES_STORE = 'D:\\pythonSpace\\douabnmovie_image' in settings.py to specify where images are saved).
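Lower numbers run first, so the duplicate check happens before the MySQL insert and the image download. A sketch of the relevant settings.py entries, assuming the three classes live in lear_scrapy/pipelines.py (the actual module path isn't shown in this post):

# settings.py (sketch; the module path 'lear_scrapy.pipelines' is an assumption)
ITEM_PIPELINES = {
    'lear_scrapy.pipelines.DuplicatePipeline': 100,        # drop already-crawled ISBNs first
    'lear_scrapy.pipelines.DoubanBookPipeline': 200,       # then write to MySQL
    'lear_scrapy.pipelines.DoubanBookImagePipeline': 300,  # then download the cover image
}
IMAGES_STORE = 'D:\\pythonSpace\\douabnmovie_image'  # image save path (from the post)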
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

from lear_scrapy.items import DouBanBookItem
from lear_scrapy.util.redisclient import RedisClient


class DuplicatePipeline(object):
    def __init__(self):
        self.redisclient = RedisClient()

    def process_item(self, item, spider):
        if self.redisclient.is_isbn_exist(item['isbn']):
            raise DropItem('Book already crawled: ' + item['title'])
        else:
            self.redisclient.save_isbn(item['isbn'])
            return item


class DoubanBookPipeline(object):
    def __init__(self):
        self.connection = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='root',
            db='yushu'
        )
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        insert_sql = 'insert into doubanbook(author,binding,publisher,price,pages,pubdate,isbn,summary,image,title)' \
                     ' values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        if isinstance(item, DouBanBookItem):
            values = (item['author'], item['binding'], item['publisher'], item['price'], item['pages'],
                      item['pubdate'], item['isbn'], item['summary'], item['image'], item['title'])
            self.cursor.execute(insert_sql, values)
            self.connection.commit()
        return item


class DoubanBookImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        if item['image']:
            yield scrapy.Request(item['image'])

    # def item_completed(self, results, item, info):
    #     image_paths = [x['path'] for ok, x in results if ok]
    #     if not image_paths:
    #         raise DropItem("Item contains no files")
    #     item['image_paths'] = image_paths
    #     return item
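The doubanbook table that DoubanBookPipeline inserts into isn't shown in this post. A minimal sketch of creating it with pymysql, assuming every field is simply stored as text (column names match the insert statement, but the types and sizes are guesses, not taken from the project):

# One-off helper to create the target table; column types/sizes are assumptions.
import pymysql

connection = pymysql.connect(host='localhost', port=3306, user='root', passwd='root', db='yushu')
try:
    with connection.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS doubanbook (
                id        INT AUTO_INCREMENT PRIMARY KEY,
                author    VARCHAR(255),
                binding   VARCHAR(64),
                publisher VARCHAR(255),
                price     VARCHAR(64),
                pages     VARCHAR(64),
                pubdate   VARCHAR(64),
                isbn      VARCHAR(32),
                summary   TEXT,
                image     VARCHAR(512),
                title     VARCHAR(255)
            ) DEFAULT CHARSET = utf8mb4
        """)
    connection.commit()
finally:
    connection.close()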
URL filter:
Besides filtering out books with duplicate ISBNs, pages that have already been crawled should not be requested again. This is done by subclassing scrapy's RFPDupeFilter, again using Redis to store the crawled URLs.
from scrapy.dupefilters import RFPDupeFilter

from lear_scrapy.util.redisclient import RedisClient


class UrlFilter(RFPDupeFilter):
    redisclient = RedisClient()

    def request_seen(self, request):
        # Return True to tell scrapy to skip this request
        if UrlFilter.redisclient.is_url_crawled(request.url):
            return True
        else:
            UrlFilter.redisclient.add_url(request.url)
            return False
To enable this, DUPEFILTER_CLASS = 'lear_scrapy.filter.urlfilter.UrlFilter' has to be configured in settings.py.
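The RedisClient used by both DuplicatePipeline and UrlFilter (lear_scrapy/util/redisclient.py) isn't shown in this post. A minimal sketch of what it needs to provide, assuming redis-py and two Redis sets (the connection parameters and key names below are made up):

# Minimal RedisClient sketch; connection parameters and key names are assumptions.
import redis


class RedisClient(object):
    def __init__(self, host='localhost', port=6379, db=0):
        self.conn = redis.StrictRedis(host=host, port=port, db=db)

    def is_isbn_exist(self, isbn):
        # True if this ISBN has been crawled before
        return self.conn.sismember('douban:book:isbn', isbn)

    def save_isbn(self, isbn):
        self.conn.sadd('douban:book:isbn', isbn)

    def is_url_crawled(self, url):
        # True if this URL has been requested before
        return self.conn.sismember('douban:book:url', url)

    def add_url(self, url):
        self.conn.sadd('douban:book:url', url)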
That is the main result of these two days of tinkering; the crawler is basically usable.