数据科学和人工智能技术笔记十九、数据整理（5）

十九、数据整理（5）

作者：Chris Albon

译者：飞龙

协议：CC BY-NC-SA 4.0

规范化一列

# 导入所需模块
import pandas as pd
from sklearn import preprocessing

# 设置图表为内联
%matplotlib inline

# 创建示例数据帧，带有未规范化的一列
data = {'score': [234,24,14,27,-74,46,73,-18,59,160]}
df = pd.DataFrame(data)
df

	score
0	234
1	24
2	14
3	27
4	-74
5	46
6	73
7	-18
8	59
9	160

# 查看为未规范化的数据
df['score'].plot(kind='bar')

# <matplotlib.axes._subplots.AxesSubplot at 0x11b9c88d0>

png

# 创建 x，其中 x 的得分列的值为浮点数
x = df[['score']].values.astype(float)

# 创建 minmax 处理器对象
min_max_scaler = preprocessing.MinMaxScaler()

# 创建一个对象，转换数据，拟合 minmax 处理器
x_scaled = min_max_scaler.fit_transform(x)

# 在数据帧上运行规范化器
df_normalized = pd.DataFrame(x_scaled)

# 查看数据帧
df_normalized

	0
0	1.000000
1	0.318182
2	0.285714
3	0.327922
4	0.000000
5	0.389610
6	0.477273
7	0.181818
8	0.431818
9	0.759740

# 绘制数据帧
df_normalized.plot(kind='bar')

# <matplotlib.axes._subplots.AxesSubplot at 0x11ba31c50>

png

Pandas 中的级联表

# 导入模块
import pandas as pd

raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'], 
        'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'], 
        'TestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3]}
df = pd.DataFrame(raw_data, columns = ['regiment', 'company', 'TestScore'])
df

	regiment	company	TestScore
0	Nighthawks	1st	4
1	Nighthawks	1st	24
2	Nighthawks	2nd	31
3	Nighthawks	2nd	2
4	Dragoons	1st	3
5	Dragoons	1st	4
6	Dragoons	2nd	24
7	Dragoons	2nd	31
8	Scouts	1st	2
9	Scouts	1st	3
10	Scouts	2nd	2
11	Scouts	2nd	3

# 按公司和团队创建分组均值的透视表
pd.pivot_table(df, index=['regiment','company'], aggfunc='mean')

		TestScore
regiment	company
Dragoons	1st	3.5
	2nd	27.5
Nighthawks	1st	14.0
	2nd	16.5
Scouts	1st	2.5
	2nd	2.5

# 按公司和团队创建分组计数的透视表
df.pivot_table(index=['regiment','company'], aggfunc='count')

		TestScore
regiment	company
Dragoons	1st	2
	2nd	2
Nighthawks	1st	2
	2nd	2
Scouts	1st	2
	2nd	2

在 Pandas 中快速修改字符串列

我经常需要或想要改变一串字符串中所有项目的大小写（例如BRAZIL到Brazil等）。有很多方法可以实现这一目标，但我已经确定这是最容易和最快的方法。

# 导入 pandas
import pandas as pd

# 创建名称的列表
first_names = pd.Series(['Steve Murrey', 'Jane Fonda', 'Sara McGully', 'Mary Jane'])

# 打印列
first_names

'''
0    Steve Murrey
1      Jane Fonda
2    Sara McGully
3       Mary Jane
dtype: object 
'''

# 打印列的小写
first_names.str.lower()

'''
0    steve murrey
1      jane fonda
2    sara mcgully
3       mary jane
dtype: object 
'''

# 打印列的大写
first_names.str.upper()

'''
0    STEVE MURREY
1      JANE FONDA
2    SARA MCGULLY
3       MARY JANE
dtype: object 
'''

# 打印列的标题大小写
first_names.str.title()

'''
0    Steve Murrey
1      Jane Fonda
2    Sara Mcgully
3       Mary Jane
dtype: object 
'''

# 打印以空格分割的列
first_names.str.split(" ")

'''
0    [Steve, Murrey]
1      [Jane, Fonda]
2    [Sara, McGully]
3       [Mary, Jane]
dtype: object 
'''

# 打印首字母大写的列
first_names.str.capitalize()

'''
0    Steve murrey
1      Jane fonda
2    Sara mcgully
3       Mary jane
dtype: object 
'''

明白了吧。更多字符串方法在这里。

随机抽样数据帧

# 导入模块
import pandas as pd
import numpy as np

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], 
        'age': [42, 52, 36, 24, 73], 
        'preTestScore': [4, 24, 31, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df

	first_name	last_name	age	preTestScore	postTestScore
0	Jason	Miller	42	4	25
1	Molly	Jacobson	52	24	94
2	Tina	Ali	36	31	57
3	Jake	Milner	24	2	62
4	Amy	Cooze	73	3	70

# 不放回选择大小为 2 的随机子集
df.take(np.random.permutation(len(df))[:2])

	first_name	last_name	age	preTestScore	postTestScore
1	Molly	Jacobson	52	24	94
4	Amy	Cooze	73	3	70

对数据帧的行排名

# 导入模块
import pandas as pd

# 创建数据帧
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'year': [2012, 2012, 2013, 2014, 2014], 
        'reports': [4, 24, 31, 2, 3],
        'coverage': [25, 94, 57, 62, 70]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df

	coverage	name	reports	year
Cochice	25	Jason	4	2012
Pima	94	Molly	24	2012
Santa Cruz	57	Tina	31	2013
Maricopa	62	Jake	2	2014
Yuma	70	Amy	3	2014

5 rows × 4 columns

# 创建一个新列，该列是 coverage 值的升序排名
df['coverageRanked'] = df['coverage'].rank(ascending=1)
df

	coverage	name	reports	year	coverageRanked
Cochice	25	Jason	4	2012	1
Pima	94	Molly	24	2012	5
Santa Cruz	57	Tina	31	2013	2
Maricopa	62	Jake	2	2014	3
Yuma	70	Amy	3	2014	4

5 rows × 5 columns

正则表达式基础

# 导入正则包
import re

import sys

text = 'The quick brown fox jumped over the lazy black bear.'

three_letter_word = '\w{3}'

pattern_re = re.compile(three_letter_word); pattern_re

re.compile(r'\w{3}', re.UNICODE) 

re_search = re.search('..own', text)

if re_search:
    # 打印搜索结果
    print(re_search.group())

# brown

re.match

re.match()仅用于匹配字符串的开头或整个字符串。对于其他任何内容，请使用re.search。

Match all three letter words in text

# 在文本中匹配所有三个字母的单词
re_match = re.match('..own', text)

if re_match:
    # 打印所有匹配
    print(re_match.group())
else:
    # 打印这个
    print('No matches')

# No matches

re.split

# 使用 'e' 作为分隔符拆分字符串。
re_split = re.split('e', text); re_split

# ['Th', ' quick brown fox jump', 'd ov', 'r th', ' lazy black b', 'ar.']

re.sub

用其他东西替换正则表达式模式串。3表示要进行的最大替换次数。

# 用 'E' 替换前三个 'e' 实例，然后打印出来
re_sub = re.sub('e', 'E', text, 3); print(re_sub)

# ThE quick brown fox jumpEd ovEr the lazy black bear.

正则表达式示例

# 导入 regex
import re

# 创建一些数据
text = 'A flock of 120 quick brown foxes jumped over 30 lazy brown, bears.'

re.findall('^A', text)

# ['A'] 

re.findall('bears.$', text)

# ['bears.'] 

re.findall('f..es', text)

# ['foxes'] 

# 寻找所有元音
re.findall('[aeiou]', text)

# ['o', 'o', 'u', 'i', 'o', 'o', 'e', 'u', 'e', 'o', 'e', 'a', 'o', 'e', 'a'] 

# 查找不是小写元音的所有字符
re.findall('[^aeiou]', text)

'''
['A',
 ' ',
 'f',
 'l',
 'c',
 'k',
 ' ',
 'f',
 ' ',
 '1',
 '2',
 '0',
 ' ',
 'q',
 'c',
 'k',
 ' ',
 'b',
 'r',
 'w',
 'n',
 ' ',
 'f',
 'x',
 's',
 ' ',
 'j',
 'm',
 'p',
 'd',
 ' ',
 'v',
 'r',
 ' ',
 '3',
 '0',
 ' ',
 'l',
 'z',
 'y',
 ' ',
 'b',
 'r',
 'w',
 'n',
 ',',
 ' ',
 'b',
 'r',
 's',
 '.'] 
'''

re.findall('a|A', text)

# ['A', 'a', 'a'] 

# 寻找任何 'fox' 的实例
re.findall('(foxes)', text)

# ['foxes'] 

# 寻找所有五个字母的单词
re.findall('\w\w\w\w\w', text)

# ['flock', 'quick', 'brown', 'foxes', 'jumpe', 'brown', 'bears'] 

re.findall('\W\W', text)

# [', '] 

re.findall('\s', text)

# [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '] 

re.findall('\S\S', text)

'''
['fl',
 'oc',
 'of',
 '12',
 'qu',
 'ic',
 'br',
 'ow',
 'fo',
 'xe',
 'ju',
 'mp',
 'ed',
 'ov',
 'er',
 '30',
 'la',
 'zy',
 'br',
 'ow',
 'n,',
 'be',
 'ar',
 's.'] 
'''

re.findall('\d\d\d', text)

# ['120'] 

re.findall('\D\D\D\D\D', text)

'''
['A flo',
 'ck of',
 ' quic',
 'k bro',
 'wn fo',
 'xes j',
 'umped',
 ' over',
 ' lazy',
 ' brow',
 'n, be'] 
'''

re.findall('\AA', text)

# ['A'] 

re.findall('bears.\Z', text)

# ['bears.'] 

re.findall('\b[foxes]', text)

# [] 

re.findall('\n', text)

# [] 

re.findall('[Ff]oxes', 'foxes Foxes Doxes')

# ['foxes', 'Foxes'] 

re.findall('[Ff]oxes', 'foxes Foxes Doxes')

# ['foxes', 'Foxes'] 

re.findall('[a-z]', 'foxes Foxes')

# ['f', 'o', 'x', 'e', 's', 'o', 'x', 'e', 's'] 

re.findall('[A-Z]', 'foxes Foxes')

# ['F'] 

re.findall('[a-zA-Z0-9]', 'foxes Foxes')

# ['f', 'o', 'x', 'e', 's', 'F', 'o', 'x', 'e', 's'] 

re.findall('[^aeiou]', 'foxes Foxes')

# ['f', 'x', 's', ' ', 'F', 'x', 's'] 

re.findall('[^0-9]', 'foxes Foxes')

# ['f', 'o', 'x', 'e', 's', ' ', 'F', 'o', 'x', 'e', 's'] 

re.findall('foxes?', 'foxes Foxes')

# ['foxes'] 

re.findall('ox*', 'foxes Foxes')

# ['ox', 'ox'] 

re.findall('ox+', 'foxes Foxes')

# ['ox', 'ox'] 

re.findall('\d{3}', text)

# ['120'] 

re.findall('\d{2,}', text)

# ['120', '30'] 

re.findall('\d{2,3}', text)

# ['120', '30'] 

re.findall('^A', text)

# ['A'] 

re.findall('bears.$', text)

# ['bears.'] 

re.findall('\AA', text)

# ['A'] 

re.findall('bears.\Z', text)

# ['bears.'] 

re.findall('bears(?=.)', text)

# ['bears'] 

re.findall('foxes(?!!)', 'foxes foxes!')

# ['foxes'] 

re.findall('foxes|foxes!', 'foxes foxes!')

# ['foxes', 'foxes'] 

re.findall('fox(es!)', 'foxes foxes!')

# ['es!'] 

re.findall('foxes(!)', 'foxes foxes!')

# ['!']

重索引序列和数据帧

# 导入模块
import pandas as pd
import numpy as np

# 创建亚利桑那州南部的火灾风险序列
brushFireRisk = pd.Series([34, 23, 12, 23], index = ['Bisbee', 'Douglas', 'Sierra Vista', 'Tombstone'])
brushFireRisk

'''
Bisbee          34
Douglas         23
Sierra Vista    12
Tombstone       23
dtype: int64 
'''

# 重索引这个序列并创建一个新的序列变量
brushFireRiskReindexed = brushFireRisk.reindex(['Tombstone', 'Douglas', 'Bisbee', 'Sierra Vista', 'Barley', 'Tucson'])
brushFireRiskReindexed

'''
Tombstone       23.0
Douglas         23.0
Bisbee          34.0
Sierra Vista    12.0
Barley           NaN
Tucson           NaN
dtype: float64 
'''

# 重索引序列并在任何缺失的索引处填入 0
brushFireRiskReindexed = brushFireRisk.reindex(['Tombstone', 'Douglas', 'Bisbee', 'Sierra Vista', 'Barley', 'Tucson'], fill_value = 0)
brushFireRiskReindexed

'''
Tombstone       23
Douglas         23
Bisbee          34
Sierra Vista    12
Barley           0
Tucson           0
dtype: int64 
'''

# 创建数据帧
data = {'county': ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'], 
        'year': [2012, 2012, 2013, 2014, 2014], 
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data)
df

	county	reports	year
0	Cochice	4	2012
1	Pima	24	2012
2	Santa Cruz	31	2013
3	Maricopa	2	2014
4	Yuma	3	2014

# 更改行的顺序（索引）
df.reindex([4, 3, 2, 1, 0])

	county	reports	year
4	Yuma	3	2014
3	Maricopa	2	2014
2	Santa Cruz	31	2013
1	Pima	24	2012
0	Cochice	4	2012

# 更改列的顺序（索引）
columnsTitles = ['year', 'reports', 'county']
df.reindex(columns=columnsTitles)

	year	reports	county
0	2012	4	Cochice
1	2012	24	Pima
2	2013	31	Santa Cruz
3	2014	2	Maricopa
4	2014	3	Yuma

重命名列标题

来自 StackOverflow 上的 rgalbo。

# 导入所需模块
import pandas as pd

# 创建列表的字典，作为值
raw_data = {'0': ['first_name', 'Molly', 'Tina', 'Jake', 'Amy'], 
        '1': ['last_name', 'Jacobson', 'Ali', 'Milner', 'Cooze'], 
        '2': ['age', 52, 36, 24, 73], 
        '3': ['preTestScore', 24, 31, 2, 3]}

# 创建数据帧
df = pd.DataFrame(raw_data)

# 查看数据帧
df

	0	1	2	3
0	first_name	last_name	age	preTestScore
1	Molly	Jacobson	52	24
2	Tina	Ali	36	31
3	Jake	Milner	24	2
4	Amy	Cooze	73	3

# 从数据集的第一行创建一个名为 header 的新变量
header = df.iloc[0]

'''
0      first_name
1       last_name
2             age
3    preTestScore
Name: 0, dtype: object 
'''

# 将数据帧替换为不包含第一行的新数据帧
df = df[1:]

# 使用标题变量重命名数据帧的列值
df.rename(columns = header)

	first_name	last_name	age	preTestScore
1	Molly	Jacobson	52	24
---	---	---	---	---
2	Tina	Ali	36	31
---	---	---	---	---
3	Jake	Milner	24	2
---	---	---	---	---
4	Amy	Cooze	73	3
---	---	---	---	---

重命名多个数据帧的列名

# 导入模块
import pandas as pd

# 设置 ipython 的最大行显示
pd.set_option('display.max_row', 1000)

# 设置 ipython 的最大列宽
pd.set_option('display.max_columns', 50)

# 创建示例数据帧
data = {'Commander': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'Date': ['2012, 02, 08', '2012, 02, 08', '2012, 02, 08', '2012, 02, 08', '2012, 02, 08'], 
        'Score': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df

	Commander	Date	Score
Cochice	Jason	2012, 02, 08	4
Pima	Molly	2012, 02, 08	24
Santa Cruz	Tina	2012, 02, 08	31
Maricopa	Jake	2012, 02, 08	2
Yuma	Amy	2012, 02, 08	3

# 重命名列名
df.columns = ['Leader', 'Time', 'Score']

df

	Leader	Time	Score
Cochice	Jason	2012, 02, 08	4
Pima	Molly	2012, 02, 08	24
Santa Cruz	Tina	2012, 02, 08	31
Maricopa	Jake	2012, 02, 08	2
Yuma	Amy	2012, 02, 08	3

df.rename(columns={'Leader': 'Commander'}, inplace=True)

df

	Commander	Time	Score
Cochice	Jason	2012, 02, 08	4
Pima	Molly	2012, 02, 08	24
Santa Cruz	Tina	2012, 02, 08	31
Maricopa	Jake	2012, 02, 08	2
Yuma	Amy	2012, 02, 08	3

替换值

# 导入模块
import pandas as pd
import numpy as np

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], 
        'age': [42, 52, 36, 24, 73], 
        'preTestScore': [-999, -999, -999, 2, 1],
        'postTestScore': [2, 2, -999, 2, -999]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df

	first_name	last_name	age	preTestScore	postTestScore
0	Jason	Miller	42	-999	2
1	Molly	Jacobson	52	-999	2
2	Tina	Ali	36	-999	-999
3	Jake	Milner	24	2	2
4	Amy	Cooze	73	1	-999

# 将所有 -999 替换为 NAN
df.replace(-999, np.nan)

	first_name	last_name	age	preTestScore	postTestScore
0	Jason	Miller	42	NaN	2.0
1	Molly	Jacobson	52	NaN	2.0
2	Tina	Ali	36	NaN	NaN
3	Jake	Milner	24	2.0	2.0
4	Amy	Cooze	73	1.0	NaN

将数据帧保存为 CSV

# 导入模块
import pandas as pd

raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], 
        'age': [42, 52, 36, 24, 73], 
        'preTestScore': [4, 24, 31, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df

	first_name	last_name	age	preTestScore	postTestScore
0	Jason	Miller	42	4	25
1	Molly	Jacobson	52	24	94
2	Tina	Ali	36	31	57
3	Jake	Milner	24	2	62
4	Amy	Cooze	73	3	70

将名为df的数据帧保存为 csv。

df.to_csv('example.csv')

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 206,482评论 6赞 481
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 88,377评论 2赞 382
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 152,762评论 0赞 342
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 55,273评论 1赞 279
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 64,289评论 5赞 373
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 49,046评论 1赞 285
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 38,351评论 3赞 400
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 36,988评论 0赞 259
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 43,476评论 1赞 300
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 35,948评论 2赞 324
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 38,064评论 1赞 333
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 33,712评论 4赞 323
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 39,261评论 3赞 307
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 30,264评论 0赞 19
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 31,486评论 1赞 262
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 45,511评论 2赞 354
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 42,802评论 2赞 345

数据科学和人工智能技术笔记 十九、数据整理（5）