Crawling approach
1. Analyze the pages and define the fields to crawl
2. Inspect the pages, work out the API URLs, and parse the target fields with XPath and JSON
3. Write the storage logic in pipelines.py
4. Start crawling
5. GitHub repo: https://github.com/HAOyanWEI24/Crawler/tree/master/jingdongspider (forks and stars welcome)
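For orientation, the Scrapy project layout assumed in the rest of this post looks roughly like the sketch below. The file names under spiders/ are illustrative (name them whatever you like); the other files follow the imports used later.

jingdongspider/
    items.py              # JingdongspiderItem and commentItem field definitions
    pipelines.py          # MySQLPipeline, writes items to MySQL
    settings.py           # MySQL connection settings and ITEM_PIPELINES
    spiders/
        jingdong_spider.py   # product detail spider (name = 'jingdong')
        comment_spider.py    # comment spider (name = 'comment')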
1. Analyze the pages and define the fields
Looking at the pages, I split the fields into two groups: one for product details (price, name, number of comments, and so on), and one for the comments on each product together with the reviewer information. They are defined as follows (a full items.py sketch follows the field list):
1. Product details:
- link = scrapy.Field()  # product URL
- project_id = scrapy.Field()  # product ID
- name = scrapy.Field()  # product name
- comment_num = scrapy.Field()  # number of comments
- shop_name = scrapy.Field()  # shop name
- price = scrapy.Field()  # price
- GoodCountStr = scrapy.Field()  # positive reviews
- AfterCount = scrapy.Field()  # neutral reviews
- PoorCount = scrapy.Field()  # negative reviews
2. Comment details:
- user_name = scrapy.Field()  # reviewer's name
- user_id = scrapy.Field()  # reviewer's ID
- userProvince = scrapy.Field()  # region the reviewer is from
- content = scrapy.Field()  # comment text
- good_id = scrapy.Field()  # ID of the reviewed product
- good_name = scrapy.Field()  # name of the reviewed product
- date = scrapy.Field()  # comment time
- replyCount = scrapy.Field()  # number of replies
- score = scrapy.Field()  # rating
- status = scrapy.Field()  # status
- userLevelId = scrapy.Field()  # user level ID
- productColor = scrapy.Field()  # product color
- productSize = scrapy.Field()  # product size
- userLevelName = scrapy.Field()  # membership level, e.g. silver or diamond member
- userClientShow = scrapy.Field()  # client the comment was posted from, e.g. the JD app
- isMobile = scrapy.Field()  # whether it was posted from a mobile device
- days = scrapy.Field()  # number of days
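Putting the fields above together, items.py looks roughly like the sketch below. The class names JingdongspiderItem and commentItem are the ones the spiders import later; everything else is just the fields listed above.

# -*- coding: utf-8 -*-
import scrapy


class JingdongspiderItem(scrapy.Item):
    """Product detail fields."""
    link = scrapy.Field()          # product URL
    project_id = scrapy.Field()    # product ID
    name = scrapy.Field()          # product name
    comment_num = scrapy.Field()   # number of comments
    shop_name = scrapy.Field()     # shop name
    price = scrapy.Field()         # price
    GoodCountStr = scrapy.Field()  # positive reviews
    AfterCount = scrapy.Field()    # neutral reviews
    PoorCount = scrapy.Field()     # negative reviews


class commentItem(scrapy.Item):
    """Comment fields."""
    user_name = scrapy.Field()      # reviewer's name
    user_id = scrapy.Field()        # reviewer's ID
    userProvince = scrapy.Field()   # region the reviewer is from
    content = scrapy.Field()        # comment text
    good_id = scrapy.Field()        # ID of the reviewed product
    good_name = scrapy.Field()      # name of the reviewed product
    date = scrapy.Field()           # comment time
    replyCount = scrapy.Field()     # number of replies
    score = scrapy.Field()          # rating
    status = scrapy.Field()         # status
    userLevelId = scrapy.Field()    # user level ID
    productColor = scrapy.Field()   # product color
    productSize = scrapy.Field()    # product size
    userLevelName = scrapy.Field()  # membership level, e.g. silver or diamond member
    userClientShow = scrapy.Field() # client the comment was posted from, e.g. the JD app
    isMobile = scrapy.Field()       # whether it was posted from a mobile device
    days = scrapy.Field()           # number of days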
API analysis:
A lot of the data on JD product pages is rendered by JavaScript, so you have to dig through the browser's Network panel for the API routes to find the URLs the data actually comes from. Combining different product IDs with these endpoints yields the content to parse. The endpoints are:
JD price API: https://p.3.cn/prices/mgets?callback=jQuery8876824&skuIds=J_4471753
JD comment-count API: https://club.jd.com/comment/productCommentSummaries.action?referenceIds=4471753
JD comment API: https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv2394&productId=6023682&score=0&sortType=5&page=2&pageSize=10&isShadowSku=0&fold=1
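Before wiring these into the spiders, it is worth checking them with a quick standalone script. This is only a sketch: the sku ids (4471753 and 6023682) are the ones from the example URLs above, and the response shapes are whatever JD returned at the time of writing, so they may have changed since.

import re

import requests

# Price endpoint: returns JSONP, so pull the "p" field out with a regex
price_text = requests.get("https://p.3.cn/prices/mgets?callback=jQuery8876824&skuIds=J_4471753").text
print(re.findall(r'"p":"(.*?)"', price_text))            # the price as a string

# Comment-count endpoint: plain JSON
summary = requests.get("https://club.jd.com/comment/productCommentSummaries.action?referenceIds=4471753").json()
print(summary['CommentsCount'][0]['CommentCountStr'])     # total number of comments

# Comment endpoint: dropping the callback= parameter gives plain JSON
comments = requests.get("https://sclub.jd.com/comment/productPageComments.action?"
                        "productId=6023682&score=0&sortType=5&page=0&pageSize=10").json()
print(len(comments['comments']))                          # comments on this page (up to pageSize)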
The flow: start from the product list page, follow each product into its detail page, and parse the product's attributes there. The logic is simple, so straight to the code.
"""京东商品详情页代码"""
# -*- coding: utf-8 -*-
import requests
from jingdongspider.items import JingdongspiderItem
import scrapy
import re
import json
from scrapy import Request
class JingdongSpider(scrapy.Spider):
name = 'jingdong'
allowed_domains = ['jd.com']
start_urls = ['https://www.jd.com']
def parse(self, response):
"""京东"""
url = "https://list.jd.com/list.html?cat=670,671,672&page=1&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main"
yield Request(url, callback=self.parseMainPage)
def parseMainPage(self, response):
urls = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a')
for url in urls:
item = JingdongspiderItem()
url = url.xpath('@href').extract()
all_url = response.urljoin(url[0])
item['link'] = all_url # 商品链接
for link in url:
url = response.urljoin(link)
yield Request(url, meta={'meta': item}, callback=self.parseDetails)
"""
通过递归原理解析下一页
下一页网页xpath解析地址
"""
next_page = response.xpath('//a[@class="pn-next"]')
for page in next_page:
pages = page.xpath('@href').extract()[0]
page = response.urljoin(pages)
print(">>>>>>>>>>>>>", page)
yield Request(page, callback=self.parseMainPage, dont_filter=True)
def parseDetails(self, response):
item = response.meta['meta']
id= response.xpath('//a[@class="compare J-compare J_contrast"]/@data-sku').extract()[0] # 商品id
item['project_id'] = id
shop_name = response.xpath('//div[@class="name"]/a/text()').extract()[0] # 商店名称
print(">>>>>>",shop_name)
item['shop_name'] = shop_name
item['name'] = response.xpath('//div[@class="sku-name"]/text()').extract()[0].strip() # 名称
"""
获取京东商品价格的url
"""
price_url = "https://p.3.cn/prices/mgets?callback=jQuery8876824&skuIds=" + str(id)
price = requests.get(price_url).text
money = re.findall(r'\"p\"\:\"(.*?)\"}]\)', price)
item['price'] = money[0]
"""
获取京东商品评论数量
"""
comment_num = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + str(id)
yield scrapy.Request(comment_num, meta={'item': item}, callback=self.parse_getCommentnum)
"""
通过正则表达式解析评论人数
"""
# comment_nums = requests.get(comment_num).text
# nums = re.findall(r'\"ShowCountStr\"\:\"(.*?)\"', comment_nums)
# print(">>>>>>>", nums)
# page = urllib.urlopen(comment_num)
# data = page.read()
# print(data)
def parse_getCommentnum(self, response):
item = response.meta['item']
# response.text是一个json格式的
date = json.loads(response.text)
# print(date)
item['comment_num']= date['CommentsCount'][0]['CommentCountStr'] # 评论数量
item['AfterCount'] = date['CommentsCount'][0]['AfterCount'] # 好评
item['GoodCountStr']= date['CommentsCount'][0]['GoodCountStr'] # 中评
item['PoorCount']= date['CommentsCount'][0]['PoorCount'] # 差评
# for field in item.fields:
# try:
# item[field] = eval(field)
# except:
# print('Field is not defined', field)
yield item
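With the items and pipeline in place, the product spider is run from the project directory in the usual Scrapy way:

scrapy crawl jingdong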
"""京东评论详情页代码"""
# -*- coding: utf-8 -*-
import requests
from jingdongspider.items import commentItem
import json
import xlrd
import scrapy
from scrapy import Request
class JingdongCommentSpider(scrapy.Spider):
name = 'comment'
allowed_domains = ['jd.com']
start_urls = ['https://www.jd.com']
def parse(self, response):
"""京东"""
url = "https://list.jd.com/list.html?cat=670,671,672&page=1&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main"
yield Request(url, callback=self.parseMainPage)
def parseMainPage(self, response):
urls = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a')
for url in urls:
url = url.xpath('@href').extract()
for link in url:
url = response.urljoin(link)
yield Request(url, callback=self.parseDetails)
def parseDetails(self, response):
id= response.xpath('//a[@class="compare J-compare J_contrast"]/@data-sku').extract()[0] # 商品id
"""
解析京东商品评论的url
"""
# url = 'https://sclub.jd.com/comment/productPageComments.action?productId=' + str(id) +'&score=0&sortType=5&page=0&pageSize=10'
# yield scrapy.Request(url, callback=self.parse_getCommentnum)
comment_num = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + str(id)
com = requests.get(comment_num).text
date = json.loads(com)
comment_nums = date['CommentsCount'][0]['ShowCount']
print(comment_nums)
comment_total = int(comment_nums)
if comment_total % 10 == 0: # 算出评论的页数,一页10条评论
page = comment_total//10
else:
page = comment_total//10 + 1
for k in range(page):
'''
京东下一页评论接口
'''
com_url = 'https://sclub.jd.com/comment/productPageComments.action?productId=' + str(id) +'&score=0&sortType=5&page='+str(k)+'&pageSize=10'
# print(">>>>>>>>>>", com_url)
yield scrapy.Request(com_url, callback=self.parse_getCommentnum)
# yield scrapy.Request(com_url, callback=self.parseDetails)
def parse_getCommentnum(self, response):
js = json.loads(response.text)
# print(js)
comments = js['comments'] # 该页所有评论
items = []
for comment in comments:
item1 = commentItem()
item1['user_name'] = comment['nickname'] # 用户名
item1['user_id'] = comment['id'] # 用户id
item1['userProvince'] = comment['userProvince'] # 用户评论用户来自的地区
item1['content'] = comment['content'] # 评论
item1['good_id'] = comment['referenceId'] # 评论的商品ID
item1['good_name'] = comment['referenceName'] # 评论的商品名字
item1['date'] = comment['referenceTime'] # 评论时间
item1['replyCount'] = comment['replyCount'] # 回复数
item1['score'] = comment['score'] # 评分
item1['status'] = comment['status'] # 状态
item1['userLevelId'] = comment['userLevelId'] # 用户等级
item1['productColor'] = comment['productColor'] # 商品颜色
item1['productSize'] = comment['productSize'] # 商品大小
item1['userLevelName'] = comment['userLevelName'] # 银牌会员,钻石会员等
item1['isMobile'] = comment['isMobile'] # 是否来自手机
item1['userClientShow'] = comment['userClientShow'] # 是否来自手机
item1['days'] = comment['days'] # 天数
items.append(item1)
return items
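The comment spider is run the same way, with scrapy crawl comment. Note that parse_getCommentnum returns a list of commentItem objects rather than yielding them one by one; Scrapy accepts any iterable of items from a callback, so each item in the list still passes through the item pipeline individually.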
Storing the data in the database
"""
pipelines.py storage logic
"""
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.utils.project import get_project_settings

SETTINGS = get_project_settings()


class MySQLPipeline(object):

    @classmethod
    def from_settings(cls, settings):
        '''1. @classmethod declares a class method, as opposed to an ordinary instance method.
        2. A class method's first parameter is cls (the class itself), whereas an instance
           method's first parameter is self, an instance of the class.
        3. It can be called on the class itself, like C.f(), much like a static method in Java.'''
        dbparams = dict(
            host=settings['MYSQL_HOST'],  # read the connection details from settings.py
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',  # set the charset, otherwise Chinese text may come out garbled
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=False,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)  # ** expands the dict into keyword arguments, i.e. host=xxx, db=yyy, ...
        return cls(dbpool)  # hand the connection pool to the pipeline instance

    def __init__(self, dbpool):
        self.dbpool = dbpool

    # called by Scrapy for every item
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)  # run the insert on the connection pool
        query.addErrback(self._handle_error, item, spider)  # attach the error handler
        return item

    # write the item to the database
    def _conditional_insert(self, tx, item):
        sql = "insert into jingdong(project_id,name,comment_num,shop_name,link,GoodCountStr,AfterCount,PoorCount,price) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (
            item["project_id"], item["name"], item["comment_num"], item["shop_name"], item["link"], item["GoodCountStr"],
            item["AfterCount"], item["PoorCount"], item["price"])
        tx.execute(sql, params)

    # error handler
    def _handle_error(self, failure, item, spider):
        print('--------------database operation exception!!-----------------')
        print(failure)
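The insert statement above assumes the jingdong table already exists. A one-off helper along the lines of the sketch below can create it; the column names match the insert, but the column types are assumptions, so adjust them to your data. (This only covers the product items; the comment items would need their own table and insert.)

# One-off helper to create the table the pipeline inserts into.
# Sketch only: the column types are assumptions, adjust them to your data.
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', user='user', passwd='pwd', db='jingdong', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS jingdong (
        project_id   VARCHAR(20),
        name         VARCHAR(255),
        comment_num  VARCHAR(50),
        shop_name    VARCHAR(255),
        link         VARCHAR(255),
        GoodCountStr VARCHAR(50),
        AfterCount   VARCHAR(50),
        PoorCount    VARCHAR(50),
        price        VARCHAR(50)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()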
settings.py
Database configuration
# MySQL connection settings
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'jingdong'   # database name, change to yours
MYSQL_USER = 'user'         # database user, change to yours
MYSQL_PASSWD = 'pwd'        # database password, change to yours
MYSQL_PORT = 3306           # database port, used in dbhelper
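The pipeline also has to be enabled in settings.py, otherwise Scrapy never calls it. Assuming the project module is named jingdongspider, as in the imports above, the registration looks like this:

ITEM_PIPELINES = {
    'jingdongspider.pipelines.MySQLPipeline': 300,
}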