- 创建工程
scrapy startproject tutorial
class InputMysqlItem(scrapy.Item):
    """Container for one scraped quote: its text and its comma-joined tag list."""
    content = scrapy.Field()
    tags = scrapy.Field()
- 编写创建数据库
创建数据库:
CREATE DATABASE db DEFAULT CHARACTER SET utf8
创建需要的表:CREATE TABLE crawl_my( id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, tags VARCHAR(64) NULL, content VARCHAR(255) NULL) ENGINE = InnoDB;
import pymysql.cursors
class MySQLPipeline(object):
    """Item pipeline that persists each scraped item into the MySQL table `crawl_my`.

    Fixes over the original:
    - `close_spider` added so the cursor/connection are released when the
      crawl ends (the original leaked both).
    - a failed INSERT is rolled back and re-raised instead of leaving the
      connection in a dirty state.
    """

    def __init__(self):
        # Open the database connection once per pipeline instance.
        self.connect = pymysql.connect(
            host='127.0.0.1',   # database host
            port=3306,          # database port
            db='db',            # database name
            user='root',        # database user
            passwd='root',      # database password
            charset='utf8',     # connection character set
            use_unicode=True,
        )
        # Cursor used for every statement issued by this pipeline.
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert one item's tags/content; commit on success, roll back on failure.

        Returns the item unchanged so later pipelines still receive it.
        """
        try:
            self.cursor.execute(
                '''
                insert into crawl_my(tags,content)
                VALUES (%s,%s)
                ''',
                (item['tags'], item['content']),
            )
            # Commit per item so a crash mid-crawl keeps earlier rows.
            self.connect.commit()
        except Exception:
            # Undo the failed statement, then let Scrapy log the error.
            self.connect.rollback()
            raise
        return item

    def close_spider(self, spider):
        # Called by Scrapy when the spider finishes: release DB resources.
        self.cursor.close()
        self.connect.close()
# Scrapy setting: maps each enabled pipeline class path to its priority
# (0-1000; lower numbers run earlier). Only the MySQL pipeline is active.
ITEM_PIPELINES = {
# 'tutorial.pipelines.FlowPipline': 300,
# 'tutorial.pipelines.MyImagesPipeline': 1,
'tutorial.pipelines.MySQLPipeline': 1,
# 'scrapy.contrib.pipeline.images.ImagesPipeline':1,
}
import scrapy
from tutorial.items import InputMysqlItem
class CrawlMysqlSpider(scrapy.Spider):
    """Crawl quotes from lab.scrapyd.cn, yielding one InputMysqlItem per quote."""
    name = 'crawl_mysql'
    allowed_domains = ['lab.scrapyd.cn']
    start_urls = ['http://lab.scrapyd.cn/']

    def parse(self, response):
        """Yield an item for every quote on the page, then follow pagination."""
        for sel in response.css('div.quote'):
            # BUG FIX: create a fresh item per quote. The original built one
            # instance before the loop and mutated it on every iteration, so
            # any consumer holding the yielded references saw only the last
            # quote's data repeated.
            item = InputMysqlItem()
            item['content'] = sel.css('.text::text').extract_first()
            tags = sel.css('.tags .tag::text').extract()
            item['tags'] = ','.join(tags)
            yield item
        # Follow the next-page link, if any.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)