settings.py
# -*- coding: utf-8 -*-
import scrapy_redis
BOT_NAME = 'CrawlWithRedis'
SPIDER_MODULES = ['CrawlWithRedis.spiders']
NEWSPIDER_MODULE = 'CrawlWithRedis.spiders'
ITEM_PIPELINES = {'CrawlWithRedis.pipelines.CrawlWithRedisPipeline':300}
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Redis / scrapy_redis settings
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# MongoDB settings
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'XiaoYunKeji'
MONGODB_DOCNAME = 'daomubiji'
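One optional addition worth noting: scrapy_redis also ships a Redis-backed duplicate filter, so request fingerprints are shared across all crawler instances instead of being kept per process. A hedged suggestion, assuming the scrapy_redis release current at the time of writing:

# Optional: share the request dupefilter through Redis as well
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"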
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy import Item, Field
class CrawlWithRedisItem(Item):
    bookName = Field()
    bookTitle = Field()
    chapterNum = Field()
    chapterName = Field()
    chapterURL = Field()
    text = Field()
pipelines.py
# -*- coding: utf-8 -*-
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
from scrapy.conf import settings
from CrawlWithRedis.items import CrawlWithRedisItem
import pymongo
class CrawlWithRedisPipeline(object):
    def __init__(self):
        # Read the MongoDB connection parameters from the project settings
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DBNAME']
        # Connect, then select the database and the collection
        client = pymongo.MongoClient(host=host, port=port)
        db = client[dbName]
        self.post = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        # Convert the item to a plain dict
        item_info = dict(item)
        # Insert the record into MongoDB
        self.post.insert_one(item_info)
        return item
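scrapy.conf has been deprecated for a while and is one likely source of the runtime error mentioned at the end of this post. Below is a minimal sketch of the same pipeline wired up through from_crawler, the API current Scrapy versions recommend for reading settings; it keeps the same setting names and behaviour, but I have not run it against this project:

import pymongo

class CrawlWithRedisPipeline(object):
    def __init__(self, settings):
        # Same connection logic as above, but the settings are injected by Scrapy
        client = pymongo.MongoClient(host=settings['MONGODB_HOST'],
                                     port=settings['MONGODB_PORT'])
        db = client[settings['MONGODB_DBNAME']]
        self.post = db[settings['MONGODB_DOCNAME']]

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the running crawler, which carries the project settings
        return cls(crawler.settings)

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))
        return item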
spiders.py
# -*- coding: utf-8 -*-
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy_redis.spiders import RedisSpider
from CrawlWithRedis.items import CrawlWithRedisItem
class CrawlWithRedisSpider(RedisSpider):
    name = "CrawlWithRedis"
    # The spider pulls its start URLs from this Redis key (see the note below)
    redis_key = 'CrawlWithRedis:start_urls'
    start_urls = ['http://daomubiji.com/']

    def parse(self, response):
        selector = Selector(response)
        tables = selector.xpath('//table')
        for each_table in tables:
            bookName = each_table.xpath('tr/td[@colspan="3"]/center/h2/text()').extract_first()
            contents = each_table.xpath('tr/td/a/text()').extract()
            urls = each_table.xpath('tr/td/a/@href').extract()
            for i in range(len(urls)):
                item = CrawlWithRedisItem()
                item['bookName'] = bookName
                item['chapterURL'] = urls[i]
                try:
                    item['bookTitle'] = contents[i].split(' ')[0]
                    item['chapterNum'] = contents[i].split(' ')[1]
                except Exception:
                    continue
                try:
                    item['chapterName'] = contents[i].split(' ')[2]
                except Exception:
                    item['chapterName'] = contents[i].split(' ')[1][-3:]
                yield Request(urls[i], callback=self.parse_content, meta={'item': item})

    def parse_content(self, response):
        # Retrieve the item handed down from parse() via meta
        item = response.meta['item']
        selector = Selector(response)
        contents = selector.xpath('//div[@class="content"]/p/text()').extract()
        text = "\n".join(contents)
        item['text'] = text
        yield item
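Note that with RedisSpider the crawl does not start until a seed URL is pushed onto the key named by redis_key; the start_urls attribute above is effectively ignored. A minimal way to seed the queue, assuming the redis-py package and the local Redis instance configured in settings.py:

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
# Push the seed URL onto the key the spider is listening on
r.lpush('CrawlWithRedis:start_urls', 'http://daomubiji.com/')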
With the current versions, Scrapy 1.2 and Python 3.5, running this raises an error (see the Scrapy GitHub repository). I have not found a solution yet; posts online suggest it may be a version-compatibility issue. I will test it on CentOS another day.