环境:
Win10, CentOS 7.4
python3.6.1
pycharm2017
retrying=1.3.3
requests=2.22.0
fake_useragent
抓包获取口碑数据接口:
-
车系口碑数据列表
# 口碑数据接口 ss:车系ID, p:页数, s:一页返回数据个数最多50 https://*****.com.cn/autov9.1.0/alibi/seriinos-ss3170-st0-p112-s50-isstruct0.json
-
口碑详细数据接口
# 口碑详细数据接口 eid=3052096 口碑详情页ID https://*****.com.cn/autov9.1.0/alibi/NeEaltionInfo.ashx?eid=
第一步 获取所有车型数据:
def get_model(self, url):
    """Yield the ID of every car series listed at *url*.

    The response body is decoded as GBK and is expected to be a JavaScript
    assignment of the form ``var listCompare$100= [...];``.  After the
    assignment prefix and every semicolon are removed, the remainder is
    parsed as JSON: a list of groups, each holding its series entries
    under the ``List`` key, with the series ID stored under ``I``.

    :param url: address passed to ``self._parse_url``.
    :return: generator of series IDs, in document order.
    """
    raw = self._parse_url(url).content.decode('GBK')
    # Drop the JS wrapper (prefix and all semicolons) so only JSON is left.
    payload = raw.replace('var listCompare$100= ', '').replace(';', '')
    for group in json.loads(payload):
        # One series ID per entry of the group's list.
        yield from (entry['I'] for entry in group['List'])
第二步 获取车型口碑ID列表:
def get_eid(self, url, car):
    """Yield the review ("koubei") IDs listed for one car series.

    Each iteration requests *url* through ``self._parse_url``, reads the
    entries under ``result -> list`` of the JSON response and yields each
    entry's ``Koubeiid``.

    The generator stops when:

    * the request or the JSON decoding fails (best effort, nothing raised);
    * the response has no ``result`` or an empty ``list``;
    * a page returns exactly the same IDs as the previous one.

    :param url: list-endpoint address passed to ``self._parse_url``.
    :param car: series identifier; only used in log messages here.
    :return: generator of review IDs.
    """
    log_init().info(f'车系:{car} 口碑数据获取中...')
    # NOTE(review): `p` counts pages but is never applied to `url`, so every
    # iteration requests the same address.  The caller presumably needs to
    # pass a per-page URL or this method needs the URL template -- confirm.
    p = 1
    previous_eids = None
    while True:
        try:
            response = self._parse_url(url).json()
        except Exception:
            # Best-effort stop on network/JSON errors.  Unlike a bare
            # `except:`, KeyboardInterrupt and SystemExit still propagate.
            return
        # A missing or null `result` is treated like an empty page instead
        # of raising AttributeError.
        koubeis = (response.get('result') or {}).get('list')
        if not koubeis:
            log_init().info(f'车系:{car}车型口碑ID列表获取完成。')
            return
        eids = [item['Koubeiid'] for item in koubeis]
        if eids == previous_eids:
            # Same page again: stop instead of re-yielding it forever.
            log_init().info(f'车系:{car} 第{p}页与上一页数据重复,停止获取。')
            return
        yield from eids
        previous_eids = eids
        p += 1
第三步 解析口碑详情数据:
def get_content(self, cars, eid):
    """Fetch one review's detail record and flatten it into a row.

    The detail URL is ``self.NewEvaluationUrl`` followed by *eid*.  When
    the JSON response has no ``result``, a message is logged and ``None``
    is returned.  Fields absent from the response come back as ``None``;
    this now also holds for the per-aspect scores and the purchase
    purposes, which previously raised when their section was missing.

    :param cars: value stored in the third column of the row (presumably
        the series ID supplied by the caller -- confirm).
    :param eid: review detail ID appended to the detail URL.
    :return: a list holding a single row (list of 23 values), or ``None``
        when the response carries no ``result``.
    """
    url = f'{self.NewEvaluationUrl}{eid}'
    log_init().info(f'{url} 数据获取中...')
    response = self._parse_url(url).json()
    result = response.get('result')
    if not result:
        log_init().info(f'{eid}无数据!')
        return

    def scene_score(key):
        # A missing or null scene section yields None instead of raising.
        return (result.get(key) or {}).get('score')

    specid = result.get('specid')  # model ID
    userId = result.get('userId')  # user ID
    userName = result.get('userName')  # user name
    specname = result.get('specname')  # purchased model
    boughtprovincename = result.get('boughtprovincename')  # purchase province
    dealername = result.get('dealername')  # dealer
    boughtdate = result.get('boughtdate')  # purchase date
    boughtPrice = result.get('boughtPrice')  # bare-car purchase price
    actualOilConsumption = result.get('actualOilConsumption')  # fuel consumption
    drivekilometer = result.get('drivekilometer')  # distance driven so far
    spaceScene = scene_score('spaceScene')  # space
    powerScene = scene_score('powerScene')  # power
    maneuverabilityScene = scene_score('maneuverabilityScene')  # handling
    oilScene = scene_score('oilScene')  # fuel consumption rating
    comfortablenessScene = scene_score('comfortablenessScene')  # comfort
    apperanceScene = scene_score('apperanceScene')  # exterior
    internalScene = scene_score('internalScene')  # interior
    costefficientScene = scene_score('costefficientScene')  # value for money
    # Purchase purposes joined with commas; empty string when none are given.
    purpose = ','.join(i['purposename'] for i in result.get('purpose') or [])
    brandname = result.get('brandname')  # brand name
    seriesname = result.get('seriesname')  # series name
    boughtcityname = result.get('boughtcityname')  # purchase city
    data = [[
        userId, userName, cars, specid, brandname, seriesname, specname,
        boughtprovincename, boughtcityname, dealername,
        boughtdate, boughtPrice, actualOilConsumption, drivekilometer,
        spaceScene, powerScene, maneuverabilityScene, oilScene,
        comfortablenessScene, apperanceScene, internalScene,
        costefficientScene, purpose,
    ]]
    return data
第四步 多线程启动:
@run_time
def main(self, num):
    """Program entry point: schedule one ``self.run`` task per car series.

    Series for which ``self.keep_records(str(car), vali=True)`` is truthy
    are logged and skipped.  Every other series is submitted to a pool of
    size *num*.  The pool is then closed and joined, so the call returns
    only after all submitted tasks have finished.

    NOTE(review): ``get_model`` is called here without arguments, while
    the definition shown in this document takes a ``url`` parameter --
    confirm which signature is current.

    :param num: size passed to ``Pool``.
    """
    workers = Pool(num)
    for car in self.get_model():
        if not self.keep_records(str(car), vali=True):
            # Not fetched yet: hand the series to a worker.
            workers.apply_async(self.run, (car,))
        else:
            log_init().info(f'{car} 已获取跳过!')
    workers.close()
    workers.join()
运行结果:
本文仅供学习交流使用,如侵立删!
企鹅 、WX: 1033383881