一、项目背景
Adventure Works Cycle是国内一家生产和销售自行车及和相关配件的制造公司。利用每日商品销售及相关客户信息数据,获取商品销售趋势、地域分布情况和用户画像,帮助运营人员自主和实时分析。
二、数据处理
1.数据结构
2.需求确认
- 1.全年自行车整体销售表现(2020年)
- 2.上月自行车地域销售表现(2020年12月)
- 3.上月自行车产品销售表现
- 4.用户行为分析
- 5.上月热品销售分析
3.数据处理
##########1.导包,连接数据库读取数据######
#导包
import pandas as pd
import numpy as np
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
#连接数据库,读取数据
engine=create_engine('mysql://root:******@127.0.0.1:3306/adventure?charset=gbk')
gather_customer_order=pd.read_sql_query('select * from dw_customer_order',con=engine)
#查看时间地区产品聚合表的情况
gather_customer_order.head()
输出为:######2.执行第一个需求,对全年自行车销售整体情况进行数据处理以及分析######
#将create_date按月新增一列
gather_customer_order['year_month']=gather_customer_order['create_date'].apply(lambda x:x.strftime('%Y-%m'))
gather_customer_order.head()
输出为:#查看时间产品地区聚合表的简要信息
gather_customer_order.info()
输出为:#
#查看产品类别有哪几类
gather_customer_order.cplb_zw.unique()
输出为:#筛选产品类别为自行车的信息
gather_customer_order_cycle=gather_customer_order[gather_customer_order['cplb_zw']=='自行车']
gather_customer_order_cycle.head()
输出为:#根据月份查看自行车的销售数量与销售额
pt_overall_sale_performance=gather_customer_order_cycle.groupby('year_month').agg({'order_num':sum,'sum_amount':sum}).reset_index()
#查看整体销售情况表的基本数据
pt_overall_sale_performance
输出为:##添加两列,分别是销售量环比和销售额环比
pt_overall_sale_performance['order_num_diff']=pt_overall_sale_performance.iloc[:,1:2].pct_change()
pt_overall_sale_performance['sum_amount_diff']=pt_overall_sale_performance.iloc[:,2:3].pct_change()
#替换na值为0
pt_overall_sale_performance_1_wt=pt_overall_sale_performance.fillna(0)
pt_overall_sale_performance_1_wt
输出为:#将表存到数据库
engine=create_engine('mysql://root:******@127.0.0.1:3306/adventure?charset=gbk')
pt_overall_sale_performance_1_wt.to_sql('pt_overall_sale_performance_1_wt',con=engine,if_exists='replace',index=False)
######3.执行第二项需求数据处理。上月自行车地域销售表现#####
#筛选出2020年11月和12月的数据,按地域和时间进行分组
cycle_territory_11_12=gather_customer_order_cycle[gather_customer_order_cycle['year_month'].isin(['2020-11','2020-12'])]
cycle_territory_11_12_group=cycle_territory_11_12.groupby(['chinese_territory','year_month']).agg({'order_num':sum,'sum_amount':sum}).reset_index()
cycle_territory_11_12_group
输出为:#计算每个地区的销售量环比和销售额环比
territory=list(cycle_territory_11_12_group['chinese_territory'].unique())
order_num_diff=pd.Series([])
sum_amount_diff=pd.Series([])
for i in territory:
a=cycle_territory_11_12_group.loc[cycle_territory_11_12_group['chinese_territory']==i]['order_num'].pct_change().fillna(0)
b=cycle_territory_11_12_group.loc[cycle_territory_11_12_group['chinese_territory']==i]['sum_amount'].pct_change().fillna(0)
order_num_diff=order_num_diff.append(a)
sum_amount_diff=sum_amount_diff.append(b)
cycle_territory_11_12_group['order_num_diff']=order_num_diff
cycle_territory_11_12_group['sum_amount_diff']=sum_amount_diff
cycle_territory_11_12_group
输出为:###将地域情况表存入数据库
engine=create_engine('mysql://frogdata05:Frogdata!1321@106.15.121.232:3306/datafrog05_adventure?charset=gbk')
cycle_territory_11_12_group.to_sql('pt_bicy_territory_2_wt',con=engine,if_exists='replace',index=False)
##查看2020年12月销量排名前十的城市
city_top_10=gather_customer_order_cycle[gather_customer_order_cycle['year_month']=='2020-12'].groupby('chinese_city').agg({'order_num':sum}).sort_values(by='order_num',ascending=False).reset_index()
city_top_10=city_top_10.iloc[0:10,:]
city_top_10
输出为:#查找销量前十城市11.12月的销售情况
city_top_10_order=cycle_territory_11_12[cycle_territory_11_12['chinese_city'].isin(city_top_10['chinese_city'])]
#按城市、日期分组
city_top_10_order_group=city_top_10_order.groupby(['chinese_city','year_month']).agg({'order_num':sum,'sum_amount':sum}).reset_index()
#计算按成绩计算销售量环比和销售额环比
city=list(city_top_10_order_group['chinese_city'].unique())
order_num_diff=pd.Series([])
sum_amount_diff=pd.Series([])
for i in city:
a=city_top_10_order_group.loc[city_top_10_order_group['chinese_city']==i]['order_num'].pct_change().fillna(0)
b=city_top_10_order_group.loc[city_top_10_order_group['chinese_city']==i]['sum_amount'].pct_change().fillna(0)
order_num_diff=order_num_diff.append(a)
sum_amount_diff=sum_amount_diff.append(b)
city_top_10_order_group['order_num_diff']=order_num_diff
city_top_10_order_group['sum_amount_diff']=sum_amount_diff
city_top_10_order_group
输出为:##将销量城市top10环比表存入数据库
engine=create_engine('mysql://root:******@127.0.0.1:3306/adventure?charset=gbk')
city_top_10_order_group.to_sql('pt_bicy_city_3_wt.to_sql',con=engine,if_exists='replace',index=False)
######4.执行第三项需求数据处理。上月自行车产品销售表现#####
#求每个月自行车累计销量
gather_customer_order_group_month=gather_customer_order_cycle.groupby('year_month').order_num.sum().reset_index()
#将累计销量与自行车销售信息表连接
order_num_proportion=pd.merge(gather_customer_order_cycle,gather_customer_order_group_month,on='year_month')
#计算自行车每月销量占比(每辆自行车当月占比情况,便于可视化输出)
order_num_proportion['order_proportion']=order_num_proportion['order_num_x']/order_num_proportion['order_num_y']
#重命名
order_num_proportion=order_num_proportion.rename(columns={'order_num_y':'sum_month_order'})
输出为:#将每月自行车销售信息存入数据库
engine=create_engine('mysql://root:******@1127.0.01:3306/adventure?charset=gbk')
order_num_proportion.to_sql('pt_bicycle_product_sales_month_4_wt',con=engine,if_exists='replace',index=False)
#####查看不同产品自行车表现,对数据进行整理####
#查看自行车有哪些产品子类
gather_customer_order_cycle['cpzl_zw'].unique()
输出为:####分别查看自行车各子类产品销售表现####
##########公路自行车###########
gather_customer_order_road=gather_customer_order_cycle[gather_customer_order_cycle['cpzl_zw']=='公路自行车']
#查看公路自行车每月各产品销售情况
gather_customer_order_road_month=gather_customer_order_road.groupby(['year_month','product_name']).order_num.sum().reset_index()
gather_customer_order_road_month['cpzl_zw']='公路自行车'
#查看公路自行车每月的总销量
gather_customer_order_road_month_sum=gather_customer_order_road.groupby('year_month').order_num.sum().reset_index()
#将两表合并
gather_customer_order_road_month=pd.merge(gather_customer_order_road_month,gather_customer_order_road_month_sum,on='year_month')
#########山地自行车#########
gather_customer_order_mountain=gather_customer_order_cycle[gather_customer_order_cycle['cpzl_zw']=='山地自行车']
##查看山地自行车每月每种产品销售情况
gather_customer_order_mountain_month=gather_customer_order_mountain.groupby(['year_month','product_name']).order_num.sum().reset_index()
gather_customer_order_mountain_month['cpzl_zw']='山地自行车'
#查看山地自行车每月总销售量
gather_customer_order_mountain_month_sum=gather_customer_order_mountain.groupby('year_month').order_num.sum().reset_index()
#合并两表
gather_customer_order_mountain_month=pd.merge(gather_customer_order_mountain_month,gather_customer_order_mountain_month_sum,on='year_month')
#########旅游自行车##########
gather_customer_order_tour=gather_customer_order_cycle[gather_customer_order_cycle['cpzl_zw']=='旅游自行车']
#查看旅游自行车每月每种产品销售量
gather_customer_order_tour_month=gather_customer_order_tour.groupby(['year_month','product_name']).order_num.sum().reset_index()
#查看旅游自行车每月总销售量
gather_customer_order_tour_month_sum=gather_customer_order_tour.groupby('year_month').order_num.sum().reset_index()
#合并两表
gather_customer_order_tour_month=pd.merge(gather_customer_order_tour_month,gather_customer_order_tour_month_sum,on='year_month')
gather_customer_order_tour_month['cpzl_zw']='旅游自行车'
#将公路、山地、旅游自行车每月销售信息表合并
gather_customer_order_month=pd.concat([gather_customer_order_mountain_month,gather_customer_order_road_month,gather_customer_order_tour_month])
gather_customer_order_month
输出为:gather_customer_order_month['order_num_proportion']=gather_customer_order_month['order_num_x']/gather_customer_order_month['order_num_y']
#重命名
gather_customer_order_month=gather_customer_order_month.rename(columns={'order_num_y':'sum_order_month','order_num_x':'order_month_product'})
gather_customer_order_month.head()
输出为:#存入数据库
engine=create_engine('mysql://root:******@127.0.0.1:3306/adventure?charset=gbk')
gather_customer_order_month.to_sql('pt_bicycle_product_sales_order_month_4_wt',con=engine,if_exists='append',index=False)
##计算当月自行车产品环比
#筛选出11,12月数据
gather_customer_order_month_11_12=gather_customer_order_month[gather_customer_order_month['year_month'].isin(['2020-11','2020-12'])]
#按产品类别和日期排序
gather_customer_order_month_11_12=gather_customer_order_month_11_12.sort_values(by=['product_name','year_month'],ascending=True)
#计算自行车销量环比
product_name_list=list(gather_customer_order_month_11_12.product_name.unique())
order_top_x=pd.Series([])
for i in product_name_list:
a=gather_customer_order_month_11_12.loc[gather_customer_order_month_11_12['product_name']==i]['order_month_product'].pct_change().fillna(0)
order_top_x=order_top_x.append(a)
gather_customer_order_month_11_12['order_num_diff']=order_top_x
#筛选出12月自行车数据
gather_customer_order_month_12=gather_customer_order_month_11_12[gather_customer_order_month_11_12['year_month']=='2020-12']
#计算2020年全年累计销量
gather_customer_order_month_sum=gather_customer_order_month.groupby('product_name').order_month_product.sum().reset_index()
gather_customer_order_month_sum=gather_customer_order_month_sum.rename(columns={'order_month_product':'sum_order_2020'})
gather_customer_order_month_sum
#关联累计销量表与12月自行车环比表
gather_customer_order_month_12=pd.merge(gather_customer_order_month_12,gather_customer_order_month_sum,on='product_name')
gather_customer_order_month_12
输出为:#将表保存到数据库
engine=create_engine('mysql://root:F******@127.0.0.1:3306/adventure?charset=gbk')
gather_customer_order_month_12.to_sql('pt_bicycle_product_sales_order_month_12_5_wt',con=engine,if_exists='append',index=False)
######5.执行第四项需求进行用户行为数据处理######
###读取数据
engine = create_engine('mysql://root:******@127.0.0.1:3306/adventure_ods?charset=gbk')
df_customer=pd.read_sql_query("select customer_key,birth_date,gender,marital_status from ods_customer where create_date<='2020-12-31'",con=engine)
engine = create_engine('mysql://root:******@127.0.0.1:3306/adventure_ods?charset=gbk')
df_sales_orders = pd.read_sql_query("select * from ods_sales_orders where create_date>='2020-12-1' and create_date<='2020-12-31'",con = engine)
#查看数据情况
df_customer.head()
df_sales_orders.head()
输出为:#将两表连接
sales_customer_order=pd.merge(df_customer,df_sales_orders,on='customer_key',how='inner')
sales_customer_order.head()
输出:#提取出生年份
customer_birth_year = sales_customer_order.birth_date.str.split('-',expand=True).rename(columns={0:'birth_year'}).drop(labels=[1,2],axis=1)
#合并
sales_customer_order=pd.concat([sales_customer_order,customer_birth_year],axis=1)
#修改出生年为int数据类型
sales_customer_order['birth_year'] = sales_customer_order['birth_year'].astype('int')
#计算用户年龄
sales_customer_order['customer_age'] = 2021 - sales_customer_order['birth_year']
#用户年龄分层
#年龄分层1
age_level=pd.cut(sales_customer_order['customer_age'],[30,35,40,45,50,55,60,65],labels=['30-34','35-39','40-44','45-49','50-54','55-59','60-64'],right=False)
#新增'age_level'分层区间列
sales_customer_order['age_level'] =age_level
sales_customer_order.head()
输出为:#筛选销售订单为自行车的订单信息
df_customer_order_bycle = sales_customer_order.loc[sales_customer_order['cplb_zw'] == '自行车']
# 计算年龄比率(可用于各类占比计算)
df_customer_order_bycle['age_level_rate']=1/df_customer_order_bycle.customer_age.count()
#将年龄分为3个层次
df_customer_order_bycle.loc[(df_customer_order_bycle['customer_age'] <= 29),'age_level2'] = '<=29'
df_customer_order_bycle.loc[(df_customer_order_bycle['customer_age'] >= 30) & (df_customer_order_bycle['customer_age'] < 40),'age_level2'] = '30-39'
df_customer_order_bycle.loc[(df_customer_order_bycle['customer_age'] >= 40),'age_level2'] = '>=40'
df_customer_order_bycle.head()
输出为:
# 求每个年龄段人数
age_level2_count = df_customer_order_bycle.groupby(by = 'age_level2').sales_order_key.count().reset_index()
age_level2_count
输出为:#用户性别占比
gender_count = df_customer_order_bycle.groupby(by = 'gender').cplb_zw.count().reset_index()
gender_count
#合并年龄段表,计算占比
df_customer_order_bycle = pd.merge(df_customer_order_bycle,age_level2_count,on = 'age_level2').rename(columns = {'sales_order_key_y':'age_level2_count'})
df_customer_order_bycle['age_level2_rate'] = 1/df_customer_order_bycle['age_level2_count']
#合并性别表,计算占比
df_customer_order_bycle = pd.merge(df_customer_order_bycle,gender_count,on = 'gender').rename(columns = {'cplb_zw_y':'gender_count'})
df_customer_order_bycle['gender_rate'] = 1/df_customer_order_bycle['gender_count']
df_customer_order_bycle.head()
输出为:#存入数据库
engine = create_engine('mysql://root:******@127.0.0.1:3306/adventure?charset=gbk')
df_customer_order_bycle.to_sql('pt_user_behavior_20_wt',con = engine,if_exists='append', index=False)
######6.执行第五项需求,热品销售分析######
#筛选12月数据
gather_customer_order_12 = gather_customer_order_cycle.loc[gather_customer_order_cycle['year_month'] == '2020-12']
gather_customer_order_12.head()
#计算产品销售数量
#按照销量降序,取TOP10产品
customer_order_12_top10 = gather_customer_order_12.groupby(by = 'product_name').order_num.count().reset_index().\
sort_values(by = 'order_num',ascending = False).head(10)
customer_order_12_top10
输出为:#筛选top10产品的销量与环比
#销量top10
customer_order_month_11_12 = gather_customer_order_month_11_12[['year_month','product_name','order_month_product','cpzl_zw','order_num_diff']]
customer_order_month_11_12 = customer_order_month_11_12[customer_order_month_11_12['product_name'].\
isin(list(customer_order_12_top10['product_name']))]
customer_order_month_11_12['category'] = '本月TOP10销量'
customer_order_month_12 = gather_customer_order_month_11_12.loc[gather_customer_order_month_11_12['year_month'] == '2020-12'].\
sort_values(by = 'order_num_diff',ascending = False).head(10)
customer_order_month_11_12 = gather_customer_order_month_11_12[['year_month','product_name','order_month_product','cpzl_zw','order_num_diff']]
#增速top10
customer_order_month_12_top10_seep = customer_order_month_11_12.loc[customer_order_month_11_12['product_name'].\
isin(list(customer_order_month_12['product_name']))]
customer_order_month_12_top10_seep['category'] = '本月TOP10增速'
#合并两表
hot_products_12 = pd.concat([customer_order_month_11_12,customer_order_month_12_top10_seep],axis = 0)
hot_products_12
输出:
#存入数据库
engine = create_engine('mysql://frogdata05:Frogdata!1321@106.15.121.232:3306/datafrog05_adventure?charset=gbk')
datafrog=engine
hot_products_12.to_sql('pt_hot_products_wt',con = datafrog,if_exists='append', index=False)