This post uses Scrapy in its simplest form to scrape PM2.5 data and store it in MongoDB or a TXT file.
The code below is very simple. Try writing it yourself first; if you can't, that's a sign you should go back and solidify your fundamentals.
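For reference, the imports below (pm25.items, pm25.pipelines) assume a standard Scrapy project named pm25, as created by "scrapy startproject pm25". The layout would look roughly like this:

pm25/
    scrapy.cfg
    pm25/
        items.py          # Pm25Item
        pipelines.py      # TxtPipeline, MongodbPipeline
        settings.py       # MongoDB connection parameters
        spiders/
            infosp.py     # InfospSpider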
spider code
# -*- coding: utf-8 -*-
import re

import scrapy

from pm25.items import Pm25Item


class InfospSpider(scrapy.Spider):
    name = "infosp"
    allowed_domains = ["pm25.com"]
    start_urls = ['http://www.pm25.com/rank/1day.html']

    # Per-spider pipeline switches: comment a line out to disable that pipeline.
    custom_settings = {
        'ITEM_PIPELINES': {
            'pm25.pipelines.MongodbPipeline': 30,
            # 'pm25.pipelines.TxtPipeline': 50,
        }
    }

    def parse(self, response):
        re_time = re.compile(r"\d+-\d+-\d+")
        # The date appears once per page, so extract it outside the loop.
        date = response.xpath("/html/body/div[4]/div/div/div[2]/span").extract()[0]
        # Narrow the response down to the list of ranking rows.
        rows = response.xpath("/html/body/div[5]/div/div[3]/ul[2]/li")
        # Parse the rows one by one.
        for row in rows:
            try:
                # extract()[0] raises IndexError when an XPath matches nothing.
                rank = row.xpath("span[1]/text()").extract()[0]
                quality = row.xpath("span/em/text()").extract()[0]
                city = row.xpath("a/text()").extract()[0]
                province = row.xpath("span[3]/text()").extract()[0]
                aqi = row.xpath("span[4]/text()").extract()[0]
                pm25 = row.xpath("span[5]/text()").extract()[0]
            except IndexError:
                # Skip incomplete rows instead of yielding half-filled items.
                self.logger.warning("skipping incomplete row: %s", row.extract())
                continue
            item = Pm25Item()
            item['date'] = re_time.findall(date)[0]
            item['rank'] = rank
            item['quality'] = quality
            item['province'] = province
            item['city'] = city
            item['aqi'] = aqi
            item['pm25'] = pm25
            yield item
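The spider is normally started with "scrapy crawl infosp" from the project root. As a minimal sketch, assuming the standard project layout above, the same crawl can also be launched from a plain Python script via Scrapy's CrawlerProcess:

# run.py - minimal sketch for launching the crawl outside the scrapy CLI
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("infosp")  # the spider's name attribute defined above
process.start()          # blocks until the crawl finishes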
items code
# -*- coding: utf-8 -*-
import scrapy


class Pm25Item(scrapy.Item):
    # The most conventional way to declare an item: one Field per column.
    date = scrapy.Field()
    rank = scrapy.Field()
    quality = scrapy.Field()
    province = scrapy.Field()
    city = scrapy.Field()
    aqi = scrapy.Field()
    pm25 = scrapy.Field()
pipelines code
import time

import pymongo


class TxtPipeline(object):
    # Append each item as one comma-separated line to a file named after today's date.
    def process_item(self, item, spider):
        today = time.strftime("%y%m%d", time.localtime())
        fname = today + ".txt"
        with open(fname, "a", encoding="utf-8") as f:
            # The with block closes the file; no explicit f.close() is needed.
            f.write(",".join([item["date"], item["rank"], item["quality"],
                              item["province"], item["city"],
                              item["aqi"], item["pm25"]]) + "\n")
        return item


class MongodbPipeline(object):
    # Write each item into MongoDB.
    # The connection parameters live in settings.py (see the snippet below).
    def __init__(self, server, port, db_name, collection):
        client = pymongo.MongoClient(server, port)
        db = client[db_name]
        self.coll = db[collection]

    @classmethod
    def from_crawler(cls, crawler):
        # scrapy.conf is deprecated; read settings through the crawler instead.
        s = crawler.settings
        return cls(s["MONGODB_SERVER"], s.getint("MONGODB_PORT"),
                   s["MONGODB_DB"], s["MONGODB_COLLECTION"])

    def process_item(self, item, spider):
        # pymongo 3+ removed insert(); insert_one() is the replacement.
        self.coll.insert_one(dict(item))
        return item
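MongodbPipeline reads its connection parameters from the project settings. The setting names come from the pipeline code above; the values here are placeholders to adapt to your own MongoDB deployment:

# settings.py - assumed placeholder values, adjust to your environment
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "pm25"
MONGODB_COLLECTION = "rank_1day"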