1、数据读取
import pandas
# Load nba_2013.csv into a DataFrame and inspect its basic properties.
nba_2013 = pandas.read_csv('nba_2013.csv')
print(type(nba_2013))    # read_csv hands back a pandas DataFrame
print(nba_2013.dtypes)   # dtype of every column
print(nba_2013.head(5))  # first five rows (five is also the default)
print(nba_2013.tail(5))  # last five rows (five is also the default)
print(nba_2013.columns)  # column labels
print(nba_2013.shape)    # (number of rows, number of columns)
print(type(nba_2013))的输出:
<class 'pandas.core.frame.DataFrame'>
2、索引与计算
import pandas
nba_2013 = pandas.read_csv('nba_2013.csv')
print(nba_2013.loc[0])    # the row whose index label is 0
print(nba_2013.loc[3:6])  # .loc slices by label, both ends included: rows 3, 4, 5 and 6
print(nba_2013['player'])           # a single column
print(nba_2013[['player', 'age']])  # several columns: pass a list of labels

# Column labels as a plain Python list.
col_names = nba_2013.columns.tolist()
print(col_names)

# Keep only the columns whose label ends with a dot.
end_point_columns = [name for name in col_names if name.endswith('.')]
end_point = nba_2013[end_point_columns]
print(end_point)
添加变量:
# Build a new Series holding every age multiplied by ten.
a = 10 * nba_2013['age']
print(a)
print(nba_2013.shape)  # shape before the new column is attached
# Assigning to a column label that does not exist yet adds it as a new column.
nba_2013['age*10'] = a
print(nba_2013)
3、数据排序
import pandas
nba_2013 = pandas.read_csv('nba_2013.csv')
# Sort the rows in place by age, youngest first.
nba_2013.sort_values(by='age', ascending=True, inplace=True)
print(nba_2013['age'])
# Sort in place again, this time from oldest to youngest.
nba_2013.sort_values(by='age', ascending=False, inplace=True)
print(nba_2013['age'])
输出:
16 19
168 19
355 20
115 20
422 20
129 20
186 20
......
156 37
8 38
226 38
148 39
325 39
Name: age, Length: 481, dtype: int64
325 39
148 39
226 38
8 38
156 37
......
249 20
355 20
115 20
168 19
16 19
Name: age, Length: 481, dtype: int64
4、泰坦尼克数据简单处理
(1)首先导入数据,查看所有特征
import pandas
import numpy
# Print every column of a DataFrame instead of truncating wide frames.
pandas.options.display.max_columns = None
# To print every row as well: pandas.set_option('display.max_rows', None)
titanic = pandas.read_csv('titanic.csv')
print(titanic.columns)
输出:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
(2)查找Age的缺失值,并计数
age = titanic['Age']
print(age.loc[0:10])  # labels 0 through 10, both ends included

# Boolean mask: True wherever Age is missing.
age_is_null = age.isnull()
print(age_is_null)

# Select only the missing entries and count them.
age_is_NAN = age[age_is_null]
print(age_is_NAN)
print(len(age_is_NAN))  # number of missing ages (177 in the recorded output)
输出:
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
5 NaN
6 54.0
7 2.0
8 27.0
9 14.0
10 4.0
Name: Age, dtype: float64
0 False
1 False
2 False
3 False
4 False
5 True
6 False
......
884 False
885 False
886 False
887 False
888 True
889 False
890 False
Name: Age, Length: 891, dtype: bool
5 NaN
17 NaN
19 NaN
26 NaN
28 NaN
......
863 NaN
868 NaN
878 NaN
888 NaN
Name: Age, Length: 177, dtype: float64
177
(3)含有缺失值时无法直接计算均值等,要进行处理
# Naive mean: the missing values make the sum NaN, so the result is nan.
mean_age = sum(titanic['Age']) / len(titanic['Age'])
print(mean_age)

# Correct mean: keep only the non-missing ages before averaging.
good_age = age[~age_is_null]
correct_mean_age = sum(good_age) / len(good_age)
print(correct_mean_age)
输出:
nan
29.69911764705882
(4)透视表
# Approach 1: mean fare for each passenger class.
# The aggregation is given as the string 'mean'; passing numpy.mean makes
# recent pandas versions emit a FutureWarning.
fare = titanic.pivot_table(index='Pclass', values='Fare', aggfunc='mean')
print(fare)
# Mean of the Survived column per class, i.e. the survival rate when
# Survived is coded as 0/1.
passenager_survival = titanic.pivot_table(index='Pclass', values='Survived', aggfunc='mean')
print(passenager_survival)
# Mean age for each passenger class.
passenager_age = titanic.pivot_table(index='Pclass', values='Age', aggfunc='mean')
print(passenager_age)
输出:
Fare
Pclass
1 84.154687
2 20.662183
3 13.675550
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
Age
Pclass
1 38.233441
2 29.877630
3 25.140620
(5)去掉缺失值
# dropna removes rows by default; axis='columns' removes every column that
# contains a missing value instead.
drop_na_columns = titanic.dropna(axis='columns')
# Remove only the rows whose Age or Sex is missing.
new_titanic = titanic.dropna(axis='index', subset=['Age', 'Sex'])
# Both calls return new frames, so titanic itself still holds the missing
# Age at index label 5.
row_index_5_age = titanic.loc[5, 'Age']
print(row_index_5_age)
输出:
nan
(6)排序
# Order the passengers from oldest to youngest.
new_titanic = titanic.sort_values(by='Age', ascending=False)
print(new_titanic.head(10))  # the original index labels travel with the rows
# drop=True throws the old index labels away and renumbers from 0.
titanic_reindex = new_titanic.reset_index(drop=True)
print('----------')
print(titanic_reindex.loc[0:10])  # label slice, both ends included: 11 rows
输出:
PassengerId Survived Pclass Name \
630 631 1 1 Barkworth, Mr. Algernon Henry Wilson
851 852 0 3 Svensson, Mr. Johan
493 494 0 1 Artagaveytia, Mr. Ramon
96 97 0 1 Goldschmidt, Mr. George B
116 117 0 3 Connors, Mr. Patrick
672 673 0 2 Mitchell, Mr. Henry Michael
745 746 0 1 Crosby, Capt. Edward Gifford
33 34 0 2 Wheadon, Mr. Edward H
54 55 0 1 Ostby, Mr. Engelhart Cornelius
280 281 0 3 Duane, Mr. Frank
Sex Age SibSp Parch Ticket Fare Cabin Embarked
630 male 80.0 0 0 27042 30.0000 A23 S
851 male 74.0 0 0 347060 7.7750 NaN S
493 male 71.0 0 0 PC 17609 49.5042 NaN C
96 male 71.0 0 0 PC 17754 34.6542 A5 C
116 male 70.5 0 0 370369 7.7500 NaN Q
672 male 70.0 0 0 C.A. 24580 10.5000 NaN S
745 male 70.0 1 1 WE/P 5735 71.0000 B22 S
33 male 66.0 0 0 C.A. 24579 10.5000 NaN S
54 male 65.0 0 1 113509 61.9792 B30 C
280 male 65.0 0 0 336439 7.7500 NaN Q
----------
PassengerId Survived Pclass Name Sex \
0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male
1 852 0 3 Svensson, Mr. Johan male
2 494 0 1 Artagaveytia, Mr. Ramon male
3 97 0 1 Goldschmidt, Mr. George B male
4 117 0 3 Connors, Mr. Patrick male
5 673 0 2 Mitchell, Mr. Henry Michael male
6 746 0 1 Crosby, Capt. Edward Gifford male
7 34 0 2 Wheadon, Mr. Edward H male
8 55 0 1 Ostby, Mr. Engelhart Cornelius male
9 281 0 3 Duane, Mr. Frank male
10 457 0 1 Millet, Mr. Francis Davis male
Age SibSp Parch Ticket Fare Cabin Embarked
0 80.0 0 0 27042 30.0000 A23 S
1 74.0 0 0 347060 7.7750 NaN S
2 71.0 0 0 PC 17609 49.5042 NaN C
3 71.0 0 0 PC 17754 34.6542 A5 C
4 70.5 0 0 370369 7.7500 NaN Q
5 70.0 0 0 C.A. 24580 10.5000 NaN S
6 70.0 1 1 WE/P 5735 71.0000 B22 S
7 66.0 0 0 C.A. 24579 10.5000 NaN S
8 65.0 0 1 113509 61.9792 B30 C
9 65.0 0 0 336439 7.7500 NaN Q
10 65.0 0 0 13509 26.5500 E38 S
Process finished with exit code 0
(7)自定义函数
A、第一百个人的数据
def hundredth_row(data):
    """Return the entry of *data* stored under index label 99.

    With a default 0-based index this is the hundredth entry. When the
    function is passed to ``DataFrame.apply`` it receives one column at a
    time, so the combined result is the hundredth row of the frame.

    Raises:
        KeyError: if *data* has no index label 99.
    """
    return data.loc[99]
# apply() calls the function once per column; every column contributes its
# value at index label 99. The result gets its own name so that the
# hundredth_row function is not overwritten by the returned Series.
hundredth_passenger = titanic.apply(hundredth_row)
print(hundredth_passenger)
输出:
PassengerId 100
Survived 0
Pclass 2
Name Kantor, Mr. Sinai
Sex male
Age 34
SibSp 1
Parch 0
Ticket 244367
Fare 26
Cabin NaN
Embarked S
dtype: object
B、计算各特征的缺失值
def not_null_count(data):
    """Return how many entries of *data* are missing (NaN / None).

    Note that, despite its name, the function counts the null entries,
    not the non-null ones; the name is kept so existing callers still work.

    Args:
        data: a pandas Series (for example one DataFrame column handed over
            by ``DataFrame.apply``).

    Returns:
        The number of missing entries as a plain ``int``.
    """
    # Summing a boolean mask counts its True values.
    return int(pandas.isnull(data).sum())
# Run the counter column by column (axis=0 is the default direction) to get
# the number of missing values of every feature.
data_null_count = titanic.apply(not_null_count, axis=0)
print(data_null_count)
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
C、把数值型变量改成定性变量,并由此做透视表
def age_to_lei(data):
    """Map the ``'Age'`` entry of one record to a categorical label.

    Args:
        data: a mapping-like record with an ``'Age'`` key, for example one
            DataFrame row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns:
        ``'unknow'`` when the age is missing, ``'adult'`` when it is
        greater than 18, and ``'child'`` otherwise (so exactly 18 is
        still ``'child'``).
    """
    age = data['Age']
    # Missing values must be checked first: comparing NaN with 18 is always
    # False and would silently classify them as 'child'.
    if pandas.isnull(age):
        return 'unknow'
    if age > 18:
        return 'adult'
    return 'child'
# Label every passenger: axis=1 hands the function one row at a time.
new_age = titanic.apply(age_to_lei, axis=1)
print(new_age)
# Survival rate per age group: store the labels as a column, then average
# Survived within each label (mean is pivot_table's default aggregation).
titanic['new_age'] = new_age
newage_survived = titanic.pivot_table(index='new_age', values='Survived')
print(newage_survived)
输出:
0 adult
1 adult
2 adult
3 adult
4 adult
......
885 adult
886 adult
887 adult
888 unknow
889 adult
890 adult
Length: 891, dtype: object
new_age Survived
adult 0.382609
child 0.503597
unknow 0.293785
5、series
import pandas as pd
import numpy as np
T10yr = pd.read_csv('T10yr.csv')
print(T10yr.columns)
# Selecting a single column of a DataFrame yields a Series.
serise_Date = T10yr['Date']
print(serise_Date.iloc[0:5])  # first five entries, by position
print(type(serise_Date))
serise_High = T10yr['High']
print(serise_High.iloc[0:5])
输出:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'], dtype='object')
0 2000-01-03
1 2000-01-04
2 2000-01-05
3 2000-01-06
4 2000-01-07
Name: Date, dtype: object
<class 'pandas.core.series.Series'>
0 6.603
1 6.548
2 6.599
3 6.585
4 6.595
Name: High, dtype: float64
对两个series进行处理,一个作为索引,一个作为值
from pandas import Series
# .values exposes the data of a Series as a numpy array.
serise_Date_value = serise_Date.values
print(type(serise_Date_value))
serise_High_value = serise_High.values
# Build a new Series: the dates become the index, the highs the values.
serise_High_Date = Series(data=serise_High_value, index=serise_Date_value)
print(serise_High_Date.loc[['2000-01-03', '2000-01-05']])  # pick individual labels
print(serise_High_Date.loc['2000-01-03':'2000-01-07'])     # label slice, both ends included
print('-------------------')
print(serise_High_Date.iloc[0:5])                          # positional slice still works
输出:
<class 'numpy.ndarray'>
2000-01-03 6.603
2000-01-05 6.599
dtype: float64
2000-01-03 6.603
2000-01-04 6.548
2000-01-05 6.599
2000-01-06 6.585
2000-01-07 6.595
dtype: float64
2000-01-03 6.603
2000-01-04 6.548
2000-01-05 6.599
2000-01-06 6.585
2000-01-07 6.595
dtype: float64
.apply 可以对每一行(或每一列)调用指定的函数
high_low = T10yr[['Low', 'High']]
# Standard deviation of Low/High for every row (axis=1 passes one row at a
# time). The lambda has to CALL np.std; returning the bare function object
# computes nothing. np.std uses ddof=0 (population standard deviation).
# apply() returns a new Series and leaves high_low untouched, so the result
# is kept in its own variable.
row_std = high_low.apply(lambda row: np.std(row.to_numpy()), axis=1)
print(high_low)
输出:
Low High
0 6.498 6.603
1 6.485 6.548
2 6.508 6.599
3 6.540 6.585
......
4163 1.549 1.587
4164 1.511 1.570
4165 1.493 1.535
4166 1.458 1.530
[4167 rows x 2 columns]