Creating a project
scrapy startproject top_250
Source
- in spiders/top_250.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request, Spider
from top_250.items import Top250Item
class MovieSpider(Spider):
    """Spider that walks the Douban Top 250 list and yields one item per movie.

    Starts at ``https://movie.douban.com/top250`` and keeps following the
    paginator's "next" link until a page no longer has one.
    """

    name = 'top_250'
    # Domain names only -- don't add any `http` or `https` scheme here.
    allowed_domains = ['movie.douban.com']
    # Browser-like User-Agent sent with every request this spider issues.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }

    def start_requests(self):
        """Yield the request for the first page of the list."""
        url = 'https://movie.douban.com/top250'
        yield Request(url, headers=self.headers)

    def parse(self, response):
        """Extract every movie on the page, then request the next page.

        Yields:
            A ``Top250Item`` for each ``div.item`` element found, followed by
            a ``Request`` for the next page when the paginator has a link.
        """
        for movie in response.xpath("//div[@class='item']"):
            # Build a fresh item per movie. Reusing one instance across
            # iterations would make every yielded item alias the same object,
            # so later assignments would overwrite earlier results.
            item = Top250Item()
            item['movie_name'] = movie.xpath(".//a/span[@class='title']/text()").extract_first()
            item['movie_url'] = movie.xpath(".//a/@href").extract_first()
            # Note: this stores the text of the `rating_num` span.
            item['movie_rank'] = movie.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract_first()
            yield item

        next_page_url = response.xpath("//div[@class='paginator']/span[@class='next']/a/@href").extract_first()
        if next_page_url:
            # Resolve the href against the current page URL instead of
            # concatenating strings, so relative and absolute hrefs both work.
            yield Request(response.urljoin(next_page_url), headers=self.headers)
- in items.py
import scrapy
class Top250Item(scrapy.Item):
    """Item holding the values scraped for a single movie.

    Every attribute is declared as a ``scrapy.Field``.
    """

    # Movie title text.
    movie_name = scrapy.Field()
    # Link to the movie's page.
    movie_url = scrapy.Field()
    # Value the spider reads from the page's `rating_num` element.
    movie_rank = scrapy.Field()
- in pipelines.py
# -*- coding: utf-8 -*-
import codecs
import json
import os
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class Top250Pipeline(object):
    """Item pipeline that writes every item into ``top_250.json`` as a JSON array.

    The file is created (and overwritten) in the current working directory
    when the pipeline is instantiated. Items are written one per line, and the
    array is closed in ``close_spider``.
    """

    def __init__(self):
        """Open the output file as UTF-8 and write the opening bracket."""
        self.file = codecs.open('top_250.json', 'w', encoding='utf-8')
        self.file.write('[')
        # True until the first item is written; later items get a leading
        # comma so no trailing separator has to be removed at the end.
        self._first_item = True

    def process_item(self, item, spider):
        """Append ``item`` to the output file as one JSON object.

        Args:
            item: The scraped item; converted with ``dict(item)``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        if self._first_item:
            self._first_item = False
        else:
            line = ',' + line
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Write the closing bracket and close the file.

        Because separators are written before items rather than after them,
        an empty crawl produces ``[]`` instead of a truncated, invalid file.
        """
        try:
            self.file.write(']')
        finally:
            self.file.close()
- in settings.py
...
# Obey robots.txt rules
# Set to False here, i.e. robots.txt obedience is switched off for this crawl.
ROBOTSTXT_OBEY = False
...
...
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Maps the pipeline's import path to its order value (300 here).
ITEM_PIPELINES = {
'top_250.pipelines.Top250Pipeline': 300,
}
...
# NOTE(review): presumably this only governs Scrapy's own feed exports;
# Top250Pipeline opens top_250.json with encoding='utf-8' itself -- confirm.
FEED_EXPORT_ENCODING = 'utf-8' # use utf-8 to store Chinese
Run our spider
scrapy crawl top_250
The generated file is written to the directory the crawl was run from: top_250.json