这个系列主要用于熟悉pandas库。这篇笔记是学习大鹏老师课程的记录,主要内容是爬取去哪儿网的数据,并分析城市的热门景点。
1、数据爬取
# 导入工具包
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 获取urls(根据url的规律)
def get_urls(ui, n):
    """Build the list of page URLs for a paginated listing.

    Args:
        ui: URL prefix; the page number is appended directly to it.
        n: Number of pages. Pages are numbered from 1 to ``n`` inclusive.

    Returns:
        A list of ``n`` URL strings (empty when ``n`` is less than 1).
    """
    return [ui + str(page) for page in range(1, n + 1)]
# 批量获取数据
def get_data(u):
    """Scrape one attraction-listing page into a list of record dicts.

    Args:
        u: URL of a single listing page.

    Returns:
        A list with one dict per ``<li>`` entry of the
        ``<ul class="list_item clrfix">`` element. Every value is the raw
        string taken from the page; the keys are 'lat', 'lng', '景点名称',
        '攻略提到数量', '点评数量', '景点排名' and '星级'. An empty list is
        returned when the page has no such ``<ul>`` element.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Without a timeout a stalled connection would hang the crawl forever.
    # NOTE(review): 10 seconds is an arbitrary choice - tune as needed.
    ri = requests.get(u, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    ri.raise_for_status()
    soupi = BeautifulSoup(ri.text, 'lxml')

    container = soupi.find('ul', class_="list_item clrfix")
    if container is None:
        # No listing on this page: report "no rows" rather than crashing with
        # an AttributeError on None.
        return []

    datai = []
    for item in container.find_all('li'):
        # The star rating is kept as the text after the first ':' of the
        # inner span's inline style attribute.
        star_style = item.find('span', class_="total_star").find('span')['style']
        datai.append({
            'lat': item['data-lat'],
            'lng': item['data-lng'],
            '景点名称': item.find('span', class_="cn_tit").text,
            '攻略提到数量': item.find('div', class_="strategy_sum").text,
            '点评数量': item.find('div', class_="comment_sum").text,
            '景点排名': item.find('span', class_="ranking_sum").text,
            '星级': star_style.split(':')[1],
        })
    return datai
# Collect the Beijing attraction records from the first 4 listing pages.

# URL prefix of the listing; the page number is appended to it.
bj_u = 'https://travel.qunar.com/p-cs299914-beijing-jingdian-1-'

# Build the URLs of all pages to fetch.
urls = get_urls(bj_u, 4)

# Scrape page by page, printing the running total of collected records.
bj_data = []
for page_url in urls:
    bj_data.extend(get_data(page_url))
    print('成功采集%i条数据' % len(bj_data))

# Put the scraped records into a DataFrame (one row per attraction).
df = pd.DataFrame(bj_data)
df.head()
2、数据清洗
# Field types: use the attraction name as the index and convert the scraped
# strings to numbers. The builtin float/int are used because the np.float and
# np.int aliases were removed in NumPy 1.24.
df.index = df['景点名称']
del df['景点名称']
df['lng'] = df['lng'].astype(float)
df['lat'] = df['lat'].astype(float)
df['点评数量'] = df['点评数量'].astype(int)
df['攻略提到数量'] = df['攻略提到数量'].astype(int)
df.head()

# Star rating: drop the literal '%' sign and convert to float.
df['星级'] = df['星级'].str.replace('%', '', regex=False).astype(float)
df.head()

# Ranking: keep the text after '第'; rows without that marker get 0.
# The result is assigned back because an in-place fillna on a selected column
# is not guaranteed to modify the DataFrame itself.
df['景点排名'] = df['景点排名'].str.split('第').str[1]
df['景点排名'] = df['景点排名'].fillna(value=0)
df.head()
3、数据查看
# Bar chart of the 10 attractions with the highest review count.
dptop10 = df.sort_values('点评数量', ascending=False).head(10)
dptop10['点评数量'].plot.bar(figsize=(10, 5), rot=45, grid=True, color='y')

# Bar chart of the 10 attractions mentioned most often in travel guides.
gltop10 = df.sort_values('攻略提到数量', ascending=False).head(10)
gltop10['攻略提到数量'].plot.bar(figsize=(10, 5), rot=45, grid=True, color='g')
4、景点筛选机制及评价方法
# Satisfaction metric: guide mentions divided by review count.
# A review count of 0 is mapped to NaN first, so the ratio is NaN rather than
# inf; an inf value would distort any later min/max-based scaling.
df['满意度'] = df['攻略提到数量'] / df['点评数量'].replace(0, np.nan)
df.head()
# 构建函数实现字段标准化
def nordata(dfi, *cols):
    """Add min-max normalised copies of the given columns to ``dfi`` in place.

    For every column name ``c`` in ``cols`` a new column ``c + '_nor'`` is
    written, holding ``(value - min) / (max - min)``.

    Args:
        dfi: DataFrame to modify in place.
        *cols: Names of the numeric columns to normalise.

    Returns:
        None. The result is stored in the new ``*_nor`` columns of ``dfi``.
    """
    for col in cols:
        lowest = dfi[col].min()
        span = dfi[col].max() - lowest
        if span == 0:
            # Constant column: avoid 0/0 (which would turn every value into
            # NaN). Multiplying by 0.0 gives 0.0 and keeps existing NaNs.
            dfi[col + '_nor'] = (dfi[col] - lowest) * 0.0
        else:
            dfi[col + '_nor'] = (dfi[col] - lowest) / span
# Normalise the three scoring fields; each gets a matching '<name>_nor' column.
nor_cols = ('满意度', '星级', '点评数量')
nordata(df, *nor_cols)
df.head()