需求:求出每个User截止当月总的流量
user date traffic
熊猫tv 2018-01-02 5
快手 2018-01-02 3
YY 2018-01-02 2
抖音 2018-01-02 15
熊猫tv 2018-01-03 5
快手 2018-01-03 3
YY 2018-01-03 2
抖音 2018-01-03 15
熊猫tv 2018-02-02 5
快手 2018-02-02 3
YY 2018-02-02 2
抖音 2018-02-02 15
熊猫tv 2018-02-03 5
快手 2018-02-03 3
YY 2018-02-03 2
抖音 2018-02-03 15
熊猫tv 2018-03-02 5
快手 2018-03-02 3
YY 2018-03-02 2
抖音 2018-03-02 15
熊猫tv 2018-03-03 5
快手 2018-03-03 3
YY 2018-03-03 2
抖音 2018-03-03 15
hive创建表:
create table user_traffic(user string,date string,traffic bigint) row format delimited fields terminated by '\t';
load data local inpath '/home/hadoop/data/user_traffic.txt' overwrite into table user_traffic;
先按user,月份分组求出每个月的traffic
select temp.user,temp.yearmonth,sum(temp.traffic) from(select
user,concat(split(date,'-')[0],split(date,'-')[1]) as yearmonth,traffic
from user_traffic) temp group by temp.user,temp.yearmonth;
表自连接
select t1.*,t2.* from (select temp.user,temp.yearmonth,sum(temp.traffic) from
(select user,concat(split(date,'-')[0],split(date,'-')[1]) as yearmonth,traffic from user_traffic)
temp group by temp.user,temp.yearmonth) t1,(select temp.user,temp.yearmonth,sum(temp.traffic) from
(select user,concat(split(date,'-')[0],split(date,'-')[1]) as yearmonth,traffic from user_traffic) temp group by
temp.user,temp.yearmonth) t2 where t1.yearmonth=t2.yearmonth and t1.user = t2.user and t1.yearmonth >= t2.yearmonth;
select t.user,t.yearmonth,sum(t.total) from (select t1.user,t1.yearmonth,t1.total from (select temp.user,temp.yearmonth,sum(temp.traffic) as total from (select user,concat(split(date,'-')[0],split(date,'-')[1]) as yearmonth,traffic from user_traffic) temp group by temp.user,temp.yearmonth) t1,(select temp.user,temp.yearmonth,sum(temp.traffic) as total from (select user,concat(split(date,'-')[0],split(date,'-')[1]) as yearmonth,traffic from user_traffic) temp group by temp.user,temp.yearmonth) t2 where t1.user = t2.user and t1.yearmonth >= t2.yearmonth) t group by t.user,t.yearmonth;