Create the project
scrapy startproject youyaoqi
Go into the project folder and generate the spider
scrapy genspider yaoqi u17.com
In yaoqi.py:
# -*- coding: utf-8 -*-
import json

import scrapy

from youyaoqi.items import YouyaoqiItem


class YaoqiSpider(scrapy.Spider):
    name = 'yaoqi'
    allowed_domains = ['u17.com']
    start_urls = ['http://www.u17.com/comic_list/th99_gr99_ca99_ss99_ob0_ac0_as0_wm0_co99_ct99_p1.html?order=2']

    def start_requests(self):
        # Form fields for the site's comic-list AJAX endpoint; 'no' means "no filter".
        data = {
            'data[group_id]': 'no',
            'data[theme_id]': 'no',
            'data[is_vip]': 'no',
            'data[accredit]': 'no',
            'data[color]': 'no',
            'data[comic_type]': 'no',
            'data[series_status]': 'no',
            'data[order]': '2',
            'data[page_num]': '1',
            'data[read_mode]': 'no',
        }
        url = 'http://www.u17.com/comic/ajax.php?mod=comic_list&act=comic_list_new_fun&a=get_comic_list'
        headers = {
            'Referer': url,
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }
        # The list has 409 pages in total; POST one request per page.
        for i in range(409):
            data['data[page_num]'] = '%d' % (i + 1)
            yield scrapy.FormRequest(
                url=url,
                headers=headers,
                method='POST',
                formdata=data,
                callback=self.parse,
            )

    def parse(self, response):
        json_result = json.loads(response.text)
        comic_list = json_result['comic_list']
        for comic in comic_list:
            item = YouyaoqiItem()
            item['comic_id'] = comic['comic_id']
            item['name'] = comic['name']
            item['cover'] = comic['cover']
            item['category'] = comic['line2']  # 'line2' carries the category text
            yield item
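For reference, parse assumes the endpoint returns JSON with a top-level comic_list array. A hypothetical example of that shape (only the keys come from the code above; every value here is a made-up placeholder):
# Hypothetical response shape -- keys match what parse() reads, values are placeholders.
sample = {
    'comic_list': [
        {
            'comic_id': '12345',                      # made-up id
            'name': 'placeholder title',
            'cover': 'http://example.com/cover.jpg',  # placeholder URL
            'line2': 'placeholder category',          # mapped to item['category']
        },
    ],
}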
In items.py:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class YouyaoqiItem(scrapy.Item):
    # define the fields for your item here like:
    comic_id = scrapy.Field()
    name = scrapy.Field()
    cover = scrapy.Field()
    category = scrapy.Field()
In settings.py, change ROBOTSTXT_OBEY from True to False so the crawler does not honor robots.txt:
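ROBOTSTXT_OBEY = False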
Export feeds in UTF-8 so Chinese text is written literally rather than as escape sequences:
FEED_EXPORT_ENCODING = 'utf-8'
Run the crawl and export the results:
scrapy crawl yaoqi -o result.json
This saves the scraped data into result.json.
Saving the data to a database: MySQL
Configure MySQL in settings.py:
MYSQL_HOST = '127.0.0.1'
MYSQL_DATABASE = 'u17'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
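The pipeline below inserts into a yaoqi table in the u17 database, and Scrapy creates neither. A one-off setup sketch (the column names come from the pipeline's INSERT statement; the column types are assumptions):
import pymysql

# One-off setup: create the database and table the pipeline writes to.
# Column types are assumptions; only the column names are fixed by the pipeline.
conn = pymysql.connect(host='127.0.0.1', user='root', password='123456', port=3306, charset='utf8')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS u17 DEFAULT CHARACTER SET utf8')
cursor.execute('''
    CREATE TABLE IF NOT EXISTS u17.yaoqi (
        comic_id VARCHAR(32),
        name     VARCHAR(255),
        cover    VARCHAR(255),
        category VARCHAR(64)
    ) DEFAULT CHARACTER SET utf8
''')
conn.close()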
In pipelines.py:
import pymysql


class U17MysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection parameters from settings.py.
        return cls(
            host=crawler.settings.get("MYSQL_HOST"),
            database=crawler.settings.get("MYSQL_DATABASE"),
            user=crawler.settings.get("MYSQL_USER"),
            password=crawler.settings.get("MYSQL_PASSWORD"),
            port=crawler.settings.get("MYSQL_PORT"),
        )

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # Parameterized query: let the driver escape values instead of
        # interpolating them into the SQL string.
        sql = 'insert into yaoqi (comic_id, name, cover, category) values (%s, %s, %s, %s)'
        self.cursor.execute(sql, (item['comic_id'], item['name'], item['cover'], item['category']))
        self.db.commit()
        return item
Enable the pipeline in settings.py:
ITEM_PIPELINES = {
    'youyaoqi.pipelines.U17MysqlPipeline': 300,
}
Saving images
Configure the image storage path in settings.py (a relative path is resolved from the directory the crawl is started in):
IMAGES_STORE = './images'
In pipelines.py:
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class U17ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Download the cover image of every item.
        yield Request(item['cover'])

    def file_path(self, request, response=None, info=None):
        # Name the file after the last segment of the image URL.
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item
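Scrapy's ImagesPipeline depends on the Pillow library (pip install Pillow). The pipeline also has to be enabled in settings.py alongside the MySQL one; the priorities below match the values used elsewhere in this project:
ITEM_PIPELINES = {
    'youyaoqi.pipelines.U17MysqlPipeline': 300,
    'youyaoqi.pipelines.U17ImagePipeline': 310,
}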
Saving the data to MongoDB
In pipelines.py:
from mongoengine import connect

from youyaoqi.models import Comic  # the mongoengine document sketched below


class YaoqiMongoPipeline(object):
    def __init__(self, database):
        self.db = database

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            database=crawler.settings.get("MONGO_DB"),
        )

    def open_spider(self, spider):
        # mongoengine keeps a global connection; connect once per spider run.
        connect(self.db)

    def close_spider(self, spider):
        pass

    def process_item(self, item, spider):
        comic = Comic()
        comic.comic_id = item['comic_id']
        comic.name = item['name']
        comic.cover = item['cover']
        comic.category = item['category']
        comic.save()
        return item
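The pipeline above relies on mongoengine and on a Comic document class that is not shown in the original. A minimal sketch, assuming the model lives in youyaoqi/models.py and that every field is a string:
# youyaoqi/models.py (hypothetical location)
from mongoengine import Document, StringField


class Comic(Document):
    # Field types are assumptions; the source only shows the field names.
    comic_id = StringField()
    name = StringField()
    cover = StringField()
    category = StringField()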
In settings.py, switch the active pipeline to the Mongo one:
ITEM_PIPELINES = {
    # 'youyaoqi.pipelines.U17MysqlPipeline': 300,
    # 'youyaoqi.pipelines.U17ImagePipeline': 310,
    'youyaoqi.pipelines.YaoqiMongoPipeline': 320,
}
Configure the Mongo database name:
MONGO_DB = 'yaoqi'
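To spot-check the stored data from a Python shell after a crawl (this assumes the hypothetical Comic model sketched earlier):
from mongoengine import connect

from youyaoqi.models import Comic  # hypothetical module path

connect('yaoqi')
print(Comic.objects.count())       # how many comics were saved
print(Comic.objects.first().name)  # inspect one document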