本章内容
pandas中的分组聚合
一、分组
import pandas as pd
import numpy as np
df=pd.DataFrame({
'name':['BOSS','Lilei','Lilei','Han','BOSS','BOSS','Han','BOSS'],
'Year':[2016,2016,2016,2016,2017,2017,2017,2017],
'Salary':[999999,20000,25000,3000,9999999,999999,3500,999999],
'Bonus':[100000,20000,20000,5000,200000,300000,3000,400000]
})
print(df)
# 根据name这一列进行分组
group_by_name=df.groupby('name')
print(type(group_by_name))
# 查看分组
print(group_by_name.groups)
# 分组后的数量
print(group_by_name.count())
# 查看分组的情况
for name,group in group_by_name:
print(name) # 组的名字
print(group)# 组的数据
# 按照某一列分组,将name这一列作为分组的键,对year进行分组
group_by_name = df['Year'].groupby(df['name'])
print(group_by_name.count())
# 按照多列进行分组
group_by_name_year=df.groupby(['name','Year'])
for name,group in group_by_name_year:
print(name)
print(group)
# 可以选择分组
print(group_by_name_year.get_group(('BOSS',2016)))
'''
name Year Salary Bonus
0 BOSS 2016 999999 100000
1 Lilei 2016 20000 20000
2 Lilei 2016 25000 20000
3 Han 2016 3000 5000
4 BOSS 2017 9999999 200000
5 BOSS 2017 999999 300000
6 Han 2017 3500 3000
7 BOSS 2017 999999 400000
<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
{'BOSS': Int64Index([0, 4, 5, 7], dtype='int64'), 'Han': Int64Index([3, 6], dtype='int64'), 'Lilei': Int64Index([1, 2], dtype='int64')}
Year Salary Bonus
name
BOSS 4 4 4
Han 2 2 2
Lilei 2 2 2
BOSS
name Year Salary Bonus
0 BOSS 2016 999999 100000
4 BOSS 2017 9999999 200000
5 BOSS 2017 999999 300000
7 BOSS 2017 999999 400000
Han
name Year Salary Bonus
3 Han 2016 3000 5000
6 Han 2017 3500 3000
Lilei
name Year Salary Bonus
1 Lilei 2016 20000 20000
2 Lilei 2016 25000 20000
name
BOSS 4
Han 2
Lilei 2
Name: Year, dtype: int64
('BOSS', 2016)
name Year Salary Bonus
0 BOSS 2016 999999 100000
('BOSS', 2017)
name Year Salary Bonus
4 BOSS 2017 9999999 200000
5 BOSS 2017 999999 300000
7 BOSS 2017 999999 400000
('Han', 2016)
name Year Salary Bonus
3 Han 2016 3000 5000
('Han', 2017)
name Year Salary Bonus
6 Han 2017 3500 3000
('Lilei', 2016)
name Year Salary Bonus
1 Lilei 2016 20000 20000
2 Lilei 2016 25000 20000
name Year Salary Bonus
0 BOSS 2016 999999 100000
'''
二、聚合
聚合的函数
mean 计算分组平均值
count 分组中非NA值的数量
sum 非NA值的和
median 非NA值的算术中位数
std 标准差
var 方差
min 非NA值的最小值
max 非NA值的最大值
prod 非NA值的积
first 第一个非NA值
last 最后一个非NA值
mad 平均绝对偏差
mode 模
abs 绝对值
sem 平均值的标准误差
skew 样品偏斜度(三阶矩)
kurt 样品峰度(四阶矩)
quantile 样本分位数(百分位上的值)
cumsum 累积总和
cumprod 累积乘积
cummax 累积最大值
cum 累积最小值
示例:
df1=pd.DataFrame({'Data1':np.random.randint(0,10,5),
'Data2':np.random.randint(10,20,5),
'key1':list('aabba'),
'key2':list('xyyxy')})
print(df1)
'''
Data1 Data2 key1 key2
0 5 16 a x
1 5 11 a y
2 9 13 b y
3 4 13 b x
4 3 16 a y
'''
# 按key1分组,进行聚合计算
# 注意:当分组后进行数值计算时,不是数值类的列(即麻烦列)会被清除
print(df1.groupby('key1').sum())
'''
Data1 Data2
key1
a 9 47
b 6 30
'''
# 只算data1
print(df1['Data1'].groupby(df1['key1']).sum())
'''
key1
a 13
b 5
Name: Data1, dtype: int32
'''
print(df1.groupby('key1')['Data1'].sum())
'''
key1
a 14
b 12
Name: Data1, dtype: int32
'''
print(df1.groupby('key1')['Data1'].mean())
'''
key1
a 3.0
b 3.5
Name: Data1, dtype: float64
'''
# 使用agg()函数做聚合运算
print(df1.groupby('key1').agg('sum'))
# 可以同时做多个聚合运算
print(df1.groupby('key1').agg(['sum','mean','std']))
'''
Data1 Data2
key1
a 9 48
b 7 30
Data1 Data2
sum mean std sum mean std
key1
a 9 3.0 4.358899 48 16 3.000000
b 7 3.5 4.949747 30 15 4.242641
'''
# 可自定义函数,传入agg方法中 grouped.agg(func)
def peak_range(df):
"""
返回数值范围
"""
return df.max() - df.min()
print(df1.groupby('key1').agg(peak_range))
'''
Data1 Data2
key1
a 8 6
b 7 6
此次:df为
Data1 Data2 key1 key2
0 8 13 a x
1 0 19 a y
2 7 12 b y
3 0 18 b x
4 1 16 a y
'''
#同时应眵个聚合函数
print(df1.groupby('key1').agg(['mean','std','count',peak range])) #默认列名为函数名
print(df1.groupby('key1').agg(['mean','std','count',('range', peak_range)])) #通过元组提供新的列名
拓展apply()函数
df1=pd.DataFrame({'sex':list('FFMFMMF'),'smoker':list('YNYYNYY'),'age':[21,30,17,37,40,18,26],'weight':[120,100,132,140,94,89,123]})
print(df1)
'''
sex smoker age weight
0 F Y 21 120
1 F N 30 100
2 M Y 17 132
3 F Y 37 140
4 M N 40 94
5 M Y 18 89
6 F Y 26 123
'''
def bin_age(age):
if age >=18:
return 1
else:
return 0
# 抽烟的年龄大于等18的
print(df1['age'].apply(bin_age))
'''
0 1
1 1
2 0
3 1
4 1
5 1
6 1
Name: age, dtype: int64
'''
df1['age'] = df1['age'].apply(bin_age)
print(df1)
'''
sex smoker age weight
0 F Y 1 120
1 F N 1 100
2 M Y 0 132
3 F Y 1 140
4 M N 1 94
5 M Y 1 89
6 F Y 1 123
'''
# 取出抽烟和不抽烟的体重前二
def top(smoker,col,n=5):
return smoker.sort_values(by=col)[-n:]
df1.groupby('smoker').apply(top,col='weight',n=2)
'''
sex smoker age weight
smoker
N 4 M N 1 94
1 F N 1 100
Y 2 M Y 0 132
3 F Y 1 140
'''