To install Scrapy on Windows 10, install Anaconda first and then run conda install scrapy; conda resolves all the libraries and dependencies the installation needs, and Scrapy installs cleanly.
The task: scrape the content of http://quotes.toscrape.com/, following the "next page" link so that every page is crawled.
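The spider below assumes the standard project layout, with the project named quotetutorial to match the import in quotes.py; a minimal sketch of creating it with Scrapy's own CLI:

scrapy startproject quotetutorial
cd quotetutorial
scrapy genspider quotes quotes.toscrape.com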
# quotes.py
# -*- coding: utf-8 -*-
import scrapy

from quotetutorial.items import QuoteItem


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.css(".quote")
        for quote in quotes:
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item
        # Grab the next-page link with a CSS selector
        next_page = response.css(".pager .next a::attr(href)").extract_first()
        if next_page is not None:
            # The extracted href is relative; urljoin() builds an absolute URL from it
            url = response.urljoin(next_page)
            # Set parse() as the callback so crawling recurses through every page
            yield scrapy.Request(url=url, callback=self.parse)
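The crawl can be launched from the project root with scrapy crawl quotes (add -o quotes.json to dump the items to a file via Scrapy's feed export). It can also be driven from a plain Python script; a minimal sketch, assuming the default layout puts the spider at quotetutorial/spiders/quotes.py:

# run.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from quotetutorial.spiders.quotes import QuotesSpider

process = CrawlerProcess(get_project_settings())  # picks up settings.py
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl finishes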
Define the Item class that holds the scraped fields.
# items.py
import scrapy


class QuoteItem(scrapy.Item):
    # define the fields for your item here like:
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
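Items support dict-style access, which the spider relies on, and assigning to a field that was never declared with scrapy.Field() raises a KeyError, catching typos early. A quick interactive check with made-up values:

item = QuoteItem()
item['text'] = 'a sample quote'
item['author'] = 'a sample author'
item['tags'] = ['sample', 'tags']
print(dict(item))         # Items convert cleanly to plain dicts
# item['auhtor'] = '...'  # would raise KeyError: 'auhtor' is not a declared field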
Define the pipelines that process the data: TextPipeline trims over-long quote text, and MongoPipeline stores the items in MongoDB.
# pipelines.py
import pymongo
from scrapy.exceptions import DropItem


class TextPipeline(object):
    def __init__(self):
        # Maximum length kept for the quote text
        self.limit = 50

    def process_item(self, item, spider):
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][0:self.limit].rstrip() + '...'
            return item
        else:
            # Discard the item; DropItem must be raised, not returned
            raise DropItem('Missing Text')
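The truncation is easy to sanity-check in isolation; plain dicts stand in for real items here, and the spider argument is unused by this pipeline:

pipeline = TextPipeline()
long_item = pipeline.process_item({'text': 'x' * 80}, None)
print(long_item['text'])   # 50 characters followed by '...'
short_item = pipeline.process_item({'text': 'brief'}, None)
print(short_item['text'])  # 'brief', unchanged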
class MongoPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        # Connection parameters, injected by from_crawler()
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    # Read MONGO_URL and MONGO_DB from the project settings
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    # Runs when the spider opens: connect to MongoDB
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    # Store each item in a collection named after its class (here QuoteItem);
    # insert_one() replaces the deprecated insert()
    def process_item(self, item, spider):
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    # Runs when the spider closes: release the connection
    def close_spider(self, spider):
        self.client.close()
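After a crawl, the stored documents can be inspected directly with pymongo; a sketch assuming MongoDB is running locally with the settings shown below:

import pymongo

client = pymongo.MongoClient('localhost')
db = client['quotestutorial']
# The pipeline names the collection after the item class, i.e. QuoteItem
print(db['QuoteItem'].count_documents({}))
print(db['QuoteItem'].find_one())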
Variables defined in the settings file:
# settings.py
MONGO_URL = 'localhost'
MONGO_DB = 'quotestutorial'
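Neither pipeline runs unless it is also registered in ITEM_PIPELINES in the same settings.py. Lower numbers run first, so TextPipeline trims the text before MongoPipeline stores it; the 300/400 priorities here are a conventional but arbitrary choice:

ITEM_PIPELINES = {
    'quotetutorial.pipelines.TextPipeline': 300,
    'quotetutorial.pipelines.MongoPipeline': 400,
}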