十九、数据整理(5)
作者:Chris Albon
译者:飞龙
协议:CC BY-NC-SA 4.0
规范化一列
# 导入所需模块
import pandas as pd
from sklearn import preprocessing
# 设置图表为内联
%matplotlib inline
# 创建示例数据帧,带有未规范化的一列
data = {'score': [234,24,14,27,-74,46,73,-18,59,160]}
df = pd.DataFrame(data)
df
|
score |
0 |
234 |
1 |
24 |
2 |
14 |
3 |
27 |
4 |
-74 |
5 |
46 |
6 |
73 |
7 |
-18 |
8 |
59 |
9 |
160 |
# 查看为未规范化的数据
df['score'].plot(kind='bar')
# <matplotlib.axes._subplots.AxesSubplot at 0x11b9c88d0>
# 创建 x,其中 x 的得分列的值为浮点数
x = df[['score']].values.astype(float)
# 创建 minmax 处理器对象
min_max_scaler = preprocessing.MinMaxScaler()
# 创建一个对象,转换数据,拟合 minmax 处理器
x_scaled = min_max_scaler.fit_transform(x)
# 在数据帧上运行规范化器
df_normalized = pd.DataFrame(x_scaled)
# 查看数据帧
df_normalized
|
0 |
0 |
1.000000 |
1 |
0.318182 |
2 |
0.285714 |
3 |
0.327922 |
4 |
0.000000 |
5 |
0.389610 |
6 |
0.477273 |
7 |
0.181818 |
8 |
0.431818 |
9 |
0.759740 |
# 绘制数据帧
df_normalized.plot(kind='bar')
# <matplotlib.axes._subplots.AxesSubplot at 0x11ba31c50>
Pandas 中的级联表
# 导入模块
import pandas as pd
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
'TestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3]}
df = pd.DataFrame(raw_data, columns = ['regiment', 'company', 'TestScore'])
df
|
regiment |
company |
TestScore |
0 |
Nighthawks |
1st |
4 |
1 |
Nighthawks |
1st |
24 |
2 |
Nighthawks |
2nd |
31 |
3 |
Nighthawks |
2nd |
2 |
4 |
Dragoons |
1st |
3 |
5 |
Dragoons |
1st |
4 |
6 |
Dragoons |
2nd |
24 |
7 |
Dragoons |
2nd |
31 |
8 |
Scouts |
1st |
2 |
9 |
Scouts |
1st |
3 |
10 |
Scouts |
2nd |
2 |
11 |
Scouts |
2nd |
3 |
# 按公司和团队创建分组均值的透视表
pd.pivot_table(df, index=['regiment','company'], aggfunc='mean')
|
|
TestScore |
regiment |
company |
|
Dragoons |
1st |
3.5 |
|
2nd |
27.5 |
Nighthawks |
1st |
14.0 |
|
2nd |
16.5 |
Scouts |
1st |
2.5 |
|
2nd |
2.5 |
# 按公司和团队创建分组计数的透视表
df.pivot_table(index=['regiment','company'], aggfunc='count')
|
|
TestScore |
regiment |
company |
|
Dragoons |
1st |
2 |
|
2nd |
2 |
Nighthawks |
1st |
2 |
|
2nd |
2 |
Scouts |
1st |
2 |
|
2nd |
2 |
在 Pandas 中快速修改字符串列
我经常需要或想要改变一串字符串中所有项目的大小写(例如BRAZIL
到Brazil
等)。 有很多方法可以实现这一目标,但我已经确定这是最容易和最快的方法。
# 导入 pandas
import pandas as pd
# 创建名称的列表
first_names = pd.Series(['Steve Murrey', 'Jane Fonda', 'Sara McGully', 'Mary Jane'])
# 打印列
first_names
'''
0 Steve Murrey
1 Jane Fonda
2 Sara McGully
3 Mary Jane
dtype: object
'''
# 打印列的小写
first_names.str.lower()
'''
0 steve murrey
1 jane fonda
2 sara mcgully
3 mary jane
dtype: object
'''
# 打印列的大写
first_names.str.upper()
'''
0 STEVE MURREY
1 JANE FONDA
2 SARA MCGULLY
3 MARY JANE
dtype: object
'''
# 打印列的标题大小写
first_names.str.title()
'''
0 Steve Murrey
1 Jane Fonda
2 Sara Mcgully
3 Mary Jane
dtype: object
'''
# 打印以空格分割的列
first_names.str.split(" ")
'''
0 [Steve, Murrey]
1 [Jane, Fonda]
2 [Sara, McGully]
3 [Mary, Jane]
dtype: object
'''
# 打印首字母大写的列
first_names.str.capitalize()
'''
0 Steve murrey
1 Jane fonda
2 Sara mcgully
3 Mary jane
dtype: object
'''
明白了吧。更多字符串方法在这里。
随机抽样数据帧
# 导入模块
import pandas as pd
import numpy as np
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
'age': [42, 52, 36, 24, 73],
'preTestScore': [4, 24, 31, 2, 3],
'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
|
first_name |
last_name |
age |
preTestScore |
postTestScore |
0 |
Jason |
Miller |
42 |
4 |
25 |
1 |
Molly |
Jacobson |
52 |
24 |
94 |
2 |
Tina |
Ali |
36 |
31 |
57 |
3 |
Jake |
Milner |
24 |
2 |
62 |
4 |
Amy |
Cooze |
73 |
3 |
70 |
# 不放回选择大小为 2 的随机子集
df.take(np.random.permutation(len(df))[:2])
|
first_name |
last_name |
age |
preTestScore |
postTestScore |
1 |
Molly |
Jacobson |
52 |
24 |
94 |
4 |
Amy |
Cooze |
73 |
3 |
70 |
对数据帧的行排名
# 导入模块
import pandas as pd
# 创建数据帧
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'year': [2012, 2012, 2013, 2014, 2014],
'reports': [4, 24, 31, 2, 3],
'coverage': [25, 94, 57, 62, 70]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
|
coverage |
name |
reports |
year |
Cochice |
25 |
Jason |
4 |
2012 |
Pima |
94 |
Molly |
24 |
2012 |
Santa Cruz |
57 |
Tina |
31 |
2013 |
Maricopa |
62 |
Jake |
2 |
2014 |
Yuma |
70 |
Amy |
3 |
2014 |
5 rows × 4 columns
# 创建一个新列,该列是 coverage 值的升序排名
df['coverageRanked'] = df['coverage'].rank(ascending=1)
df
|
coverage |
name |
reports |
year |
coverageRanked |
Cochice |
25 |
Jason |
4 |
2012 |
1 |
Pima |
94 |
Molly |
24 |
2012 |
5 |
Santa Cruz |
57 |
Tina |
31 |
2013 |
2 |
Maricopa |
62 |
Jake |
2 |
2014 |
3 |
Yuma |
70 |
Amy |
3 |
2014 |
4 |
5 rows × 5 columns
正则表达式基础
# 导入正则包
import re
import sys
text = 'The quick brown fox jumped over the lazy black bear.'
three_letter_word = '\w{3}'
pattern_re = re.compile(three_letter_word); pattern_re
re.compile(r'\w{3}', re.UNICODE)
re_search = re.search('..own', text)
if re_search:
# 打印搜索结果
print(re_search.group())
# brown
re.match
re.match()
仅用于匹配字符串的开头或整个字符串。对于其他任何内容,请使用re.search
。
Match all three letter words in text
# 在文本中匹配所有三个字母的单词
re_match = re.match('..own', text)
if re_match:
# 打印所有匹配
print(re_match.group())
else:
# 打印这个
print('No matches')
# No matches
re.split
# 使用 'e' 作为分隔符拆分字符串。
re_split = re.split('e', text); re_split
# ['Th', ' quick brown fox jump', 'd ov', 'r th', ' lazy black b', 'ar.']
re.sub
用其他东西替换正则表达式模式串。3
表示要进行的最大替换次数。
# 用 'E' 替换前三个 'e' 实例,然后打印出来
re_sub = re.sub('e', 'E', text, 3); print(re_sub)
# ThE quick brown fox jumpEd ovEr the lazy black bear.
正则表达式示例
# 导入 regex
import re
# 创建一些数据
text = 'A flock of 120 quick brown foxes jumped over 30 lazy brown, bears.'
re.findall('^A', text)
# ['A']
re.findall('bears.$', text)
# ['bears.']
re.findall('f..es', text)
# ['foxes']
# 寻找所有元音
re.findall('[aeiou]', text)
# ['o', 'o', 'u', 'i', 'o', 'o', 'e', 'u', 'e', 'o', 'e', 'a', 'o', 'e', 'a']
# 查找不是小写元音的所有字符
re.findall('[^aeiou]', text)
'''
['A',
' ',
'f',
'l',
'c',
'k',
' ',
'f',
' ',
'1',
'2',
'0',
' ',
'q',
'c',
'k',
' ',
'b',
'r',
'w',
'n',
' ',
'f',
'x',
's',
' ',
'j',
'm',
'p',
'd',
' ',
'v',
'r',
' ',
'3',
'0',
' ',
'l',
'z',
'y',
' ',
'b',
'r',
'w',
'n',
',',
' ',
'b',
'r',
's',
'.']
'''
re.findall('a|A', text)
# ['A', 'a', 'a']
# 寻找任何 'fox' 的实例
re.findall('(foxes)', text)
# ['foxes']
# 寻找所有五个字母的单词
re.findall('\w\w\w\w\w', text)
# ['flock', 'quick', 'brown', 'foxes', 'jumpe', 'brown', 'bears']
re.findall('\W\W', text)
# [', ']
re.findall('\s', text)
# [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
re.findall('\S\S', text)
'''
['fl',
'oc',
'of',
'12',
'qu',
'ic',
'br',
'ow',
'fo',
'xe',
'ju',
'mp',
'ed',
'ov',
'er',
'30',
'la',
'zy',
'br',
'ow',
'n,',
'be',
'ar',
's.']
'''
re.findall('\d\d\d', text)
# ['120']
re.findall('\D\D\D\D\D', text)
'''
['A flo',
'ck of',
' quic',
'k bro',
'wn fo',
'xes j',
'umped',
' over',
' lazy',
' brow',
'n, be']
'''
re.findall('\AA', text)
# ['A']
re.findall('bears.\Z', text)
# ['bears.']
re.findall('\b[foxes]', text)
# []
re.findall('\n', text)
# []
re.findall('[Ff]oxes', 'foxes Foxes Doxes')
# ['foxes', 'Foxes']
re.findall('[Ff]oxes', 'foxes Foxes Doxes')
# ['foxes', 'Foxes']
re.findall('[a-z]', 'foxes Foxes')
# ['f', 'o', 'x', 'e', 's', 'o', 'x', 'e', 's']
re.findall('[A-Z]', 'foxes Foxes')
# ['F']
re.findall('[a-zA-Z0-9]', 'foxes Foxes')
# ['f', 'o', 'x', 'e', 's', 'F', 'o', 'x', 'e', 's']
re.findall('[^aeiou]', 'foxes Foxes')
# ['f', 'x', 's', ' ', 'F', 'x', 's']
re.findall('[^0-9]', 'foxes Foxes')
# ['f', 'o', 'x', 'e', 's', ' ', 'F', 'o', 'x', 'e', 's']
re.findall('foxes?', 'foxes Foxes')
# ['foxes']
re.findall('ox*', 'foxes Foxes')
# ['ox', 'ox']
re.findall('ox+', 'foxes Foxes')
# ['ox', 'ox']
re.findall('\d{3}', text)
# ['120']
re.findall('\d{2,}', text)
# ['120', '30']
re.findall('\d{2,3}', text)
# ['120', '30']
re.findall('^A', text)
# ['A']
re.findall('bears.$', text)
# ['bears.']
re.findall('\AA', text)
# ['A']
re.findall('bears.\Z', text)
# ['bears.']
re.findall('bears(?=.)', text)
# ['bears']
re.findall('foxes(?!!)', 'foxes foxes!')
# ['foxes']
re.findall('foxes|foxes!', 'foxes foxes!')
# ['foxes', 'foxes']
re.findall('fox(es!)', 'foxes foxes!')
# ['es!']
re.findall('foxes(!)', 'foxes foxes!')
# ['!']
重索引序列和数据帧
# 导入模块
import pandas as pd
import numpy as np
# 创建亚利桑那州南部的火灾风险序列
brushFireRisk = pd.Series([34, 23, 12, 23], index = ['Bisbee', 'Douglas', 'Sierra Vista', 'Tombstone'])
brushFireRisk
'''
Bisbee 34
Douglas 23
Sierra Vista 12
Tombstone 23
dtype: int64
'''
# 重索引这个序列并创建一个新的序列变量
brushFireRiskReindexed = brushFireRisk.reindex(['Tombstone', 'Douglas', 'Bisbee', 'Sierra Vista', 'Barley', 'Tucson'])
brushFireRiskReindexed
'''
Tombstone 23.0
Douglas 23.0
Bisbee 34.0
Sierra Vista 12.0
Barley NaN
Tucson NaN
dtype: float64
'''
# 重索引序列并在任何缺失的索引处填入 0
brushFireRiskReindexed = brushFireRisk.reindex(['Tombstone', 'Douglas', 'Bisbee', 'Sierra Vista', 'Barley', 'Tucson'], fill_value = 0)
brushFireRiskReindexed
'''
Tombstone 23
Douglas 23
Bisbee 34
Sierra Vista 12
Barley 0
Tucson 0
dtype: int64
'''
# 创建数据帧
data = {'county': ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'],
'year': [2012, 2012, 2013, 2014, 2014],
'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data)
df
|
county |
reports |
year |
0 |
Cochice |
4 |
2012 |
1 |
Pima |
24 |
2012 |
2 |
Santa Cruz |
31 |
2013 |
3 |
Maricopa |
2 |
2014 |
4 |
Yuma |
3 |
2014 |
# 更改行的顺序(索引)
df.reindex([4, 3, 2, 1, 0])
|
county |
reports |
year |
4 |
Yuma |
3 |
2014 |
3 |
Maricopa |
2 |
2014 |
2 |
Santa Cruz |
31 |
2013 |
1 |
Pima |
24 |
2012 |
0 |
Cochice |
4 |
2012 |
# 更改列的顺序(索引)
columnsTitles = ['year', 'reports', 'county']
df.reindex(columns=columnsTitles)
|
year |
reports |
county |
0 |
2012 |
4 |
Cochice |
1 |
2012 |
24 |
Pima |
2 |
2013 |
31 |
Santa Cruz |
3 |
2014 |
2 |
Maricopa |
4 |
2014 |
3 |
Yuma |
重命名列标题
来自 StackOverflow 上的 rgalbo。
# 导入所需模块
import pandas as pd
# 创建列表的字典,作为值
raw_data = {'0': ['first_name', 'Molly', 'Tina', 'Jake', 'Amy'],
'1': ['last_name', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
'2': ['age', 52, 36, 24, 73],
'3': ['preTestScore', 24, 31, 2, 3]}
# 创建数据帧
df = pd.DataFrame(raw_data)
# 查看数据帧
df
|
0 |
1 |
2 |
3 |
0 |
first_name |
last_name |
age |
preTestScore |
1 |
Molly |
Jacobson |
52 |
24 |
2 |
Tina |
Ali |
36 |
31 |
3 |
Jake |
Milner |
24 |
2 |
4 |
Amy |
Cooze |
73 |
3 |
# 从数据集的第一行创建一个名为 header 的新变量
header = df.iloc[0]
'''
0 first_name
1 last_name
2 age
3 preTestScore
Name: 0, dtype: object
'''
# 将数据帧替换为不包含第一行的新数据帧
df = df[1:]
# 使用标题变量重命名数据帧的列值
df.rename(columns = header)
|
first_name |
last_name |
age |
preTestScore |
1 |
Molly |
Jacobson |
52 |
24 |
--- |
--- |
--- |
--- |
--- |
2 |
Tina |
Ali |
36 |
31 |
--- |
--- |
--- |
--- |
--- |
3 |
Jake |
Milner |
24 |
2 |
--- |
--- |
--- |
--- |
--- |
4 |
Amy |
Cooze |
73 |
3 |
--- |
--- |
--- |
--- |
--- |
重命名多个数据帧的列名
# 导入模块
import pandas as pd
# 设置 ipython 的最大行显示
pd.set_option('display.max_row', 1000)
# 设置 ipython 的最大列宽
pd.set_option('display.max_columns', 50)
# 创建示例数据帧
data = {'Commander': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'Date': ['2012, 02, 08', '2012, 02, 08', '2012, 02, 08', '2012, 02, 08', '2012, 02, 08'],
'Score': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
|
Commander |
Date |
Score |
Cochice |
Jason |
2012, 02, 08 |
4 |
Pima |
Molly |
2012, 02, 08 |
24 |
Santa Cruz |
Tina |
2012, 02, 08 |
31 |
Maricopa |
Jake |
2012, 02, 08 |
2 |
Yuma |
Amy |
2012, 02, 08 |
3 |
# 重命名列名
df.columns = ['Leader', 'Time', 'Score']
df
|
Leader |
Time |
Score |
Cochice |
Jason |
2012, 02, 08 |
4 |
Pima |
Molly |
2012, 02, 08 |
24 |
Santa Cruz |
Tina |
2012, 02, 08 |
31 |
Maricopa |
Jake |
2012, 02, 08 |
2 |
Yuma |
Amy |
2012, 02, 08 |
3 |
df.rename(columns={'Leader': 'Commander'}, inplace=True)
df
|
Commander |
Time |
Score |
Cochice |
Jason |
2012, 02, 08 |
4 |
Pima |
Molly |
2012, 02, 08 |
24 |
Santa Cruz |
Tina |
2012, 02, 08 |
31 |
Maricopa |
Jake |
2012, 02, 08 |
2 |
Yuma |
Amy |
2012, 02, 08 |
3 |
替换值
# 导入模块
import pandas as pd
import numpy as np
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
'age': [42, 52, 36, 24, 73],
'preTestScore': [-999, -999, -999, 2, 1],
'postTestScore': [2, 2, -999, 2, -999]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
|
first_name |
last_name |
age |
preTestScore |
postTestScore |
0 |
Jason |
Miller |
42 |
-999 |
2 |
1 |
Molly |
Jacobson |
52 |
-999 |
2 |
2 |
Tina |
Ali |
36 |
-999 |
-999 |
3 |
Jake |
Milner |
24 |
2 |
2 |
4 |
Amy |
Cooze |
73 |
1 |
-999 |
# 将所有 -999 替换为 NAN
df.replace(-999, np.nan)
|
first_name |
last_name |
age |
preTestScore |
postTestScore |
0 |
Jason |
Miller |
42 |
NaN |
2.0 |
1 |
Molly |
Jacobson |
52 |
NaN |
2.0 |
2 |
Tina |
Ali |
36 |
NaN |
NaN |
3 |
Jake |
Milner |
24 |
2.0 |
2.0 |
4 |
Amy |
Cooze |
73 |
1.0 |
NaN |
将数据帧保存为 CSV
# 导入模块
import pandas as pd
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
'age': [42, 52, 36, 24, 73],
'preTestScore': [4, 24, 31, 2, 3],
'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
|
first_name |
last_name |
age |
preTestScore |
postTestScore |
0 |
Jason |
Miller |
42 |
4 |
25 |
1 |
Molly |
Jacobson |
52 |
24 |
94 |
2 |
Tina |
Ali |
36 |
31 |
57 |
3 |
Jake |
Milner |
24 |
2 |
62 |
4 |
Amy |
Cooze |
73 |
3 |
70 |
将名为df
的数据帧保存为 csv。
df.to_csv('example.csv')