原创文章:如有下载及转载请注明来源链接,否则视为侵权
(第一)------------------数据读取相关方法--------------
import numpy as np
import pandas as pd
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
loandData=pd.read_csv("")
Pandas将多个Sheet写入到本地同一Excel文件中
import pandas as pd #读取两个表格data1=pd.read_excel('文件路径')
data2=pd.read_excel(r'C:\Users\xn084037\Desktop\副本三代核心系统入账金额异常结果数据.xlsx') # raw string r'': a plain '\U' in a Windows path is an illegal unicode escape in Python 3
data1=pd.read_excel(r'C:\Users\xn084037\Desktop\副本三代核心系统入账金额.xlsx')
writer=pd.ExcelWriter('D:新表.xlsx')
data1.to_excel(writer,sheet_name='sheet1')
data2.to_excel(writer,sheet_name='sheet2') #必须运行
writer.save()#不然不能输出到本地writer.save()
------Pandas的read_csv读入数据并且自己给列名命名--------
3. 使用Pandas的read_csv、read_fwf、read_table读取数据
import pandas as pd#数据框dataframe
csv_data=pd.read_csv('D:\Python数据分析与数据化运营\python_book\chapter2\csv_data.csv',names=['aa','bb','cc','dd','hh'])#读入数据并且自己给列名命名
print(csv_data)
--------Pandas的pd.read_fwf读入数据并且自己给列名命名----------
import pandas as pd # 导入Pandas库数据框dataframe
fwf_data = pd.read_fwf('D:\Python数据分析与数据化运营\python_book\chapter2\fwf_data', widths=[5, 5, 5, 5], names=['col1', 'col2', 'col3', 'col4']) # 读取csv数据
print (fwf_data) # 打印输出数据
----------Pandas的pd.read_table(txt)读入数据并且自己给列名命名---------
import pandas as pd#数据框dataframe
table_data=pd.read_table('D:\Python数据分析与数据化运营\python_book\chapter2\table_data.txt',sep=';',
names=['aa','ab','ac','ad','ah'])
print(table_data)
------------numpy的读入(txt)---------------------
import numpy as np#array数据组形式
file_name='D:\Python数据分析与数据化运营\python_book\chapter2\numpy_data.txt'
data=np.loadtxt(file_name,dtype='float32',delimiter=' ')
print(data)
------------numpy的读入(npy)----------------
import numpy as np#读入npy格式
write_data=np.array([[1,2,3,4],[5,6,7,8],[9,10,11,12]])
np.save('D:\Python数据分析与数据化运营\python_book\chapter2\load_data1',write_data)
read_data=np.load('D:\Python数据分析与数据化运营\python_book\chapter2\load_data1.npy')
1. 使用read、readline、readlines读取数据
通过read方式读入text.txt数据
file_name='D:\Python数据分析与数据化运营\python_book\chapter2\text.txt'
file_object=open(file_name)
read_data=file_object.read()
print(read_data)
通过readline方式读入text.txt数据
file_object=open(file_name)
readline_data=file_object.readline()
print(readline_data)
fn = open('D:\Python数据分析与数据化运营\python_book\chapter2\text.txt') # 获得文件对象
print (fn.tell()) # 输出指针位置
line1 = fn.readline() # 获得文件第一行数据
print (line1) # 输出第一行数据
print (fn.tell()) # 输出指针位置
line2 = fn.readline() # 获得文件第二行数据
print (line2) # 输出第二行数据
print (fn.tell()) # 输出指针位置
fn.close() # 关闭文件对象
(第二)------------数据预览相关方法---------------
loandData.head()#查看前5行记录
loandData.info()#查看每个变量结构基本情况
loandData.shape#查看总样本数及总字段数量 (shape is an attribute, not a method — no parentheses)
loandData.count()#查看每个变量对应的样本数
loandData.describe()#查看变量最基本指标情况(包括最大值,最小值,方差,第一分位数,中位数数,第三分位数,均值,计数)
df.head(5) #查看DataFrame对象的前n行
df.tail(5) #查看DataFrame对象的最后n行
df.shape #查看数据的行列数各是多少
df.info() #查看索引、数据类型和内存信息
df.describe(percentiles=[.05, .25, .75, .95]) #查看数值型列的汇总统计,返回计数、均值、标准差、最小最大值、25%50%75%分位数,percentiles0.05,0.95分位数
df['col'].unique() #快速查看某一列有哪些分类内容 (unique is a Series method; DataFrame has no .unique())
s.value_counts(dropna=False) #查看Series对象的唯一值和计数
df.apply(pd.Series.value_counts) #查看DataFrame对象中每一列的唯一值和计数
df.sum() #返回所有列的求和值
df.mean() #返回所有列的均值
df.corr() #返回列与列之间的相关系数
df.count() #返回每一列中的非空值的个数
df.max() #返回每一列的最大值
df.min() #返回每一列的最小值
df.median() #返回每一列的中位数
df.std() #返回每一列的标准差
df.loc[:3,['incall']]#基于incall列,选择特定行的数据,返回DataFrame格式
df[['incall']]#选择基于incall列,返回DataFrame格式
print(df['incall'])
import pandas as pd
data=pd.read_csv(r'/Users/huangjunwen/Desktop/fandango_score_comparison.csv')
这是我的路径大家使用的时候记得改,记得改,记得改!
print(data.head(10))#查看前10行数据,数据大的时候可以通过该方式快速看到df的数据格式
loanData=loanData[loanData['Status']!='Cancelled']#间接删除某个字段中某一类数据
--增加一列(CreditScore)变量=((最高信用+最低信用)/2).round(0)
loanData['CreditScore']=((loanData.CreditScoreRangeUpper+loanData.CreditScoreRangeLower)/2).round(0)#.round(0)表示取四舍五入
1:----------------------批量数据转换-------------------------------------
def loan_status(s):
    """Collapse a raw LoanStatus value into one of four buckets.

    'Chargedoff' and 'Defaulted' both mean the investor lost money, so they
    map to 'Defaulted'; 'Cancelled' and 'Current' pass through unchanged;
    every other status (completed, final payment, late repayment, ...)
    counts as 'Completed' (no investor loss).
    """
    bucket_by_status = {
        'Chargedoff': 'Defaulted',   # charged off -> investor loss
        'Defaulted': 'Defaulted',    # bad debt -> investor loss
        'Cancelled': 'Cancelled',
        'Current': 'Current',
    }
    # Anything not listed above is treated as a normal completion.
    return bucket_by_status.get(s, 'Completed')
loanData['Status']=loanData['LoanStatus'].apply(loan_status)#将数据进行转换,增加一列为Status变量:LoanStatus数据字段转换为Status(来自loan_status函数转换了贷款状态)
------------------------------过滤筛选-----------------------------
DefaultedRatio=loanData[loanData['Status']=='Defaulted']['DebtToIncomeRatio']
CompletedRatio=loanData[loanData['Status']=='Completed']['DebtToIncomeRatio']
2:---------------------批量数据转换--先求四分位数及中位数,再转换-----------------
loanData['BankcardUtilization']的第一四分位数(0.25分位)是0.31
oneFourth=loanData['BankcardUtilization'].quantile(0.25)
loanData['BankcardUtilization']的中位数是0.6
twoForth=loanData['BankcardUtilization'].quantile(0.5)
def bank_card_use(s, oneForth=0.31, twoForth=0.6):
    """Bucket a bank-card utilization ratio into a usage label.

    Thresholds default to the first quartile (0.31) and median (0.6) chosen
    from business experience.  A value that matches no comparison (e.g. NaN,
    for which every comparison is False) falls through to 'No Use'.
    """
    if s <= oneForth:
        return 'Mild Use'
    if oneForth < s <= twoForth:
        return 'Medium Use'
    if twoForth < s <= 1:
        return 'Heavy Use'
    if s > 1:
        return 'Super Use'
    # NaN (or anything incomparable) lands here.
    return 'No Use'
loanData['BankCardUse']=loanData['BankcardUtilization'].apply(bank_card_use)
3:---------------------------缺失值处理相关---------------------------
填充缺失值
x['age'].fillna(x['age'].mean(), inplace=True)
2.通常情况下删除行,使用参数axis = 0,删除列的参数axis = 1,通常不会这么做,那样会删除一个变量。
print('\ndrop row')
print(df.dropna(axis = 0))
删除后结果:
缺失值处理
import pandas as pd # 导入pandas库
import numpy as np # 导入numpy库
from sklearn.preprocessing import Imputer # NOTE: Imputer was removed in scikit-learn 0.22; on new versions use sklearn.impute.SimpleImputer
生成缺失数据
df = pd.DataFrame(np.random.randn(6, 4), columns=['col1', 'col2', 'col3', 'col4']) # 生成一份数据
df.iloc[1:2, 1] = np.nan # 增加缺失值
df.iloc[4, 3] = np.nan # 增加缺失值
print (df)
查看哪些值缺失
nan_all = df.isnull() # 获得所有数据框中的N值
print (nan_all) # 打印输出
查看哪些列缺失
nan_col1 = df.isnull().any() # 获得含有NA的列
nan_col2 = df.isnull().all() # 获得全部为NA的列
print (nan_col1) # 打印输出
print (nan_col2) # 打印输出
丢弃缺失值
df2 = df.dropna() # 直接丢弃含有NA的行记录
print (df2) # 打印输出
-------每个字段是否缺失并且展示对应的行数----
missing=pd.concat([loanData.isnull().any(),loanData.count()],axis=1)
--------用中位数------------补全缺失值
loanData['CreditScore']=loanData['CreditScore'].fillna(loanData['CreditScore'].median())
使用sklearn将缺失值替换为特定值
nan_model = Imputer(missing_values='NaN', strategy='mean', axis=0) # 建立替换规则:将值为Nan的缺失值以均值做替换 — NOTE: on sklearn>=0.22 use SimpleImputer(missing_values=np.nan, strategy='mean')
nan_result = nan_model.fit_transform(df) # 应用模型规则
print (nan_result) # 打印输出
使用pandas将缺失值替换为特定值
nan_result_pd1 = df.fillna(method='backfill') # 用后面的值替换缺失值
nan_result_pd2 = df.fillna(method='bfill', limit=1) # 用后面的值替代缺失值,限制每列只能替代一个缺失值
nan_result_pd3 = df.fillna(method='pad') # 用前面的值替换缺失值
nan_result_pd4 = df.fillna(0) # 用0替换缺失值
nan_result_pd5 = df.fillna({'col2': 1.1, 'col4': 1.2}) # 用指定值替换不同列的缺失值
nan_result_pd6 = df.fillna(df.mean()['col2':'col4']) # 用平均数代替,选择各自列的均值替换缺失值
打印输出
print (nan_result_pd1) # 打印输出
print (nan_result_pd2) # 打印输出
print (nan_result_pd3) # 打印输出
print (nan_result_pd4) # 打印输出
print (nan_result_pd5) # 打印输出
print (nan_result_pd6) # 打印输出
主要变量的缺失值,只显示存在缺失的变量,获取其缺失数量,以及缺失率,代码实现如下:
missing=pd.concat([loanData.isnull().any(),loanData.count()],axis=1) #查出每个变量是否缺失?每个变量总的观测值
column=['是否缺失','数量']#定义两列
missing1=pd.DataFrame(list(missing.values),index=list(missing.index),columns=column)#修正新列命 ‘是否缺失','数量'
max=missing1['数量'].max()#每个变量最大观测值
missing1['缺失数量']=max-missing1['数量']#每个变量最大观测值-每个变量实际观测值=每个变量缺失数量
missing1['缺失率']=missing1['缺失数量']/max#求出缺失值
miss=missing1[missing1['数量']<max] #取出有缺失的样本(包括字段名称,是否有缺失,缺失数量,缺失率)
4.1:######用中位数替换CreditScore的缺失值############
loanData['CreditScore']=loanData['CreditScore'].fillna(loanData['CreditScore'].median())
用“NOTA” 替换BorrowerState的缺失值
loanData['BorrowerState']=loanData['BorrowerState'].fillna('NOTA')
4.2:############DebtToIncomeRatio 添加随机数缺失值处理#########
DebtToIncomeRatio缺失值添加随机数
def rand_missing(s):
    """Return s unchanged when it is non-negative.

    Anything for which ``s >= 0`` is False — a negative value or NaN
    (NaN comparisons are always False) — is replaced by a random draw
    from the uniform(0.1, 0.5) distribution.
    """
    return s if s >= 0 else random.uniform(0.1, 0.5)
DebtToIncomeRatio的缺失值添加0.1~0.5的随机变量
loanData['DebtToIncomeRatio']=loanData['DebtToIncomeRatio'].apply(rand_missing)
4.3:##############将DelinquenciesLast7Years的缺失值赋值为1################
loanData['DelinquenciesLast7Years'] = loanData['DelinquenciesLast7Years'].fillna(1)
4.4###########2009之后,选出ProsperRating (Alpha)为空的行,然后对行进行删除#####
missIndex=loanData[(loanData['ProsperRating (Alpha)'].isnull()) & (loanData['DatePhase']=='After Jul.2009')]
loanData=loanData.drop(missIndex.index,axis=0)
4.5#字符串变量转换成数字变量
数据中存在字符串变量,将其用数字变量进行替换。实现的函数如下:
定性变量的赋值
def harmonize_data(df):
    """Recode categorical string/bool columns of df into integer codes, in place.

    Each column gets its own codebook; values not listed are left untouched
    (note the IncomeRange codebook is intentionally sparse — only the values
    present in the original mapping are recoded).  Returns the mutated df.
    """
    codebooks = {
        'Status': {'Completed': 1, 'Defaulted': 0, 'Current': 2},
        'IsBorrowerHomeowner': {False: 0, True: 1},
        'CreditGrade': {'NC': 0, 'HR': 1, 'E': 2, 'D': 3,
                        'C': 4, 'B': 5, 'A': 6, 'AA': 7},
        'ProsperRating (Alpha)': {'HR': 1, 'E': 2, 'D': 3,
                                  'C': 4, 'B': 5, 'A': 6, 'AA': 7},
        'IncomeRange': {'Not displayed': 0, 'Not employed': 1,
                        '1-24,999': 3, '50,000-74,999': 5, '100,000+': 7},
        'BankCardUse': {'No Use': 0, 'Mild Use': 1, 'Medium Use': 2,
                        'Heavy Use': 3, 'Super Use': 4},
        'CustomerClarify': {'New Borrower': 0, 'Previous Borrower': 1},
    }
    # Same sequence of .loc equality assignments as writing them out by hand
    # (dicts preserve insertion order), so behavior is unchanged.
    for column, codes in codebooks.items():
        for raw_value, code in codes.items():
            df.loc[df[column] == raw_value, column] = code
    return df
#字符串替换成整数
loanData=harmonize_data(loanData)
loc和iloc的区别
pandas以类似字典的方式来获取某一列的值,比如df[‘A’],这会得到df的A列。如果我们对某一行感兴趣呢?这个时候有两种方法,一种是iloc方法,另一种方法是loc方法。loc是指location的意思,iloc中的i是指integer。这两者的区别如下:
loc:works on labels in the index.
iloc:works on the positions in the index (so it only takes integers).
也就是说loc是根据index来索引,比如下边的df定义了一个index,那么loc就根据这个index来索引对应的行。iloc并不是根据index来索引,而是根据行号来索引,行号从0开始,逐次加1。##
4.6#######################通过循环计算准确率
result=rfr.predict(X_test)
Y_test
对预测的准确率进行计算:
def accuracy_statistics(rd, prd):
    """Return the fraction of positions where prediction prd matches reference rd.

    Result is rounded to 4 decimal places.  rd and prd are parallel sequences
    of the same length.

    Fixes vs the original:
    - the loop used range(1, count), silently skipping index 0, so the first
      element never counted toward accuracy while still inflating the divisor;
    - local name ``sum`` shadowed the builtin (renamed to ``matches``);
    - empty input now returns 0.0 instead of raising ZeroDivisionError.
    """
    count = len(prd)
    if count == 0:
        return 0.0  # nothing to compare
    matches = 0
    for i in range(count):  # start at 0: compare every position
        if rd[i] == prd[i]:
            matches += 1
    return round(matches / count, 4)
pecent=accuracy_statistics(list(Y_test.values),list(result))
该模型预测结果的准确率为(1498+4520)/8385=71.77%。
5##########如何安装fbprophet库
https://www.jianshu.com/p/0c06ad7bccaa
1:查看安装了那些包 pip list
2: 查看那些包需要更新版本pip list --outdated
3:创建一个新的python环境:conda create -n fbprophet_python3.7 python=3.7
4:激活已创建新的python环境:conda activate fbprophet_python3.7
如何安装fbprophet库
1、创建一个新的环境按照官网的做法,创建一个新的python环境
conda create -n fbprophet_python3.7 python=3.7
2.激活已创建新的python环境:conda activate fbprophet_python3.7
3.安装C++的编译工具
4.conda install libpython m2w64-toolchain -c msys2
5、安装依赖库
conda install numpy cython -c conda-forge
conda install matplotlib scipy pandas -c conda-forge
6.安装pystan
conda install pystan -c conda-forge
7.安装fbprophet
conda install fbprophet -c conda-forge
8.还是有点问题的。
这样吧在GitHub上,下载一个压缩包,用压缩包安装
https://github.com/facebook/prophet
conda activate fbprophet_python3.7
我又做了一次挣扎,fbprophet也用conda安装
conda install -c conda-forge fbprophet
(第六)##############
python——修改Dataframe列名的两种方法
首先新建一个Dataframe
import pandas as pd
df = pd.DataFrame({'a':[1,2,3],'b':[1,2,3]})
如下:
a b
0 1 1
1 2 2
2 3 3
1、修改列名a,b为A、B。
df.columns = ['A','B']
2、只修改列名a为A
df.rename(columns={'a':'A'})
Python中dataframe\ array\ list相互转化
1、list 转化成array矩阵
np.array(result).T
2、array转化成dataframe
pd.DataFrame(result)
3、把Pandas中的dataframe转成numpy中的array
使用 df=df.values,
print(len(X))--查询数组个数
type(X)--查询数据类型
python 切片无循序取其中某列
rr111=rr11[['kelian_x','qingjiatianshu_x','meancall_x','meanlong_x','meanxiapi_x','聚类类别']]
--------python 切片有循序取其中某列
rr111=rr11.iloc[:,0:6]
---python left join
rr11=pd.merge(left=rr1, right=r0, how='left', left_on=rr1.index, right_on=r0.index)# 关联取出类别数目
------------------------------安装分词库jieba-------------------------------------------------------------------------------------------------
1:打开Anaconda Prompt
2: pip install jieba
----------------------------------------本地读入list格式数据-------------------------------------------------------------------------------------
import jieba
def stopwordslist(filepath):
    """Load a stop-word list from a UTF-8 text file, one word per line.

    Returns a list of whitespace-stripped lines.  Uses a context manager so
    the file handle is always closed — the original opened the file inline
    and leaked the handle.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
对句子进行分词
stopwords = stopwordslist('D:\2019年python代码\python_book\chapter4\stopwords.txt') # 这里加载停用词的路径
---------------------------------------在python中如何将两个list合并成一个list----------------------------------------------------------------
1:用list的extend方法,L1.extend(L2),该方法将参数L2的全部元素添加到L1的尾部,例如:
2:用切片(slice)操作,L1[len(L1):len(L1)] = L2和上面的方法等价,例如:
但切片方法用起来更灵活,可以插入到头部,或其他任意部位,例如:
加到开头:
3:加到中间:
-----------------------------------------python数组读入数据-----------------------------------------------------------------------------------
data=np.loadtxt('.txt')
--------------------------------pd.set_option('display.max_rows', 4)#限定只显示四行--------------------------------------------------------------------------
pd.set_option('display.max_rows', 4)#限定只显示四行
-------------------------------------------删除NA缺失数据---------------------------------------------------------------------------------------------------
data=data.dropna()#删除带有NA的所有行与列
print(data.shape)#查看总样本数及总字段数
print(list(data.columns))#查看所有列
查看数据内容:
----------------------------------------------查前面五行数据或者最后五行数据数据------------------------------------------------------------------------------
data1.head(5)#前面五行数据
data1.tail(5)#最后五行数据
--------------------------------------------------------去重unique---------------------------------------------------------------------------------------------
data['education'].unique()#去重
----------------------------------------把data数据中的y列的'yes'值改为1,'no'值改为0----------------------------------------------------------------------------
把y变为数值型,并进行简单的统计。
data.loc[data['y']=='yes','y']=1#把data数据中的y列的'yes'值改为1
data.loc[data['y']=='no','y']=0#把data数据中的y列的'no'值改为0
data['y'].value_counts()#分类对应的个数
----------------------------------------------------画直方图------------------------------------------------------------------------------------------------------
sns.countplot(x='y',data=data,palette='hls')#画直方图
--------------------------------------------------------groupby----------------------------------------------------------------------------------------------------
data.groupby('y').mean()
-----------------------------------------------------取两表差集合(相当去重)-------------------------------------------------------------------
--方法1取两表差集合(相当去重)
df1 = pd.DataFrame({'id':[1,2,3],
'col1':['a','b','c']})
df2 = pd.DataFrame({'id':[4,3],
'col2':['d','e']})
#### 1. 数据框内连接,类似inner join
ddd=df1.merge(df2,how='left',left_on='id',right_on='id')#用mergy方法找出两表
ddd1=ddd[ddd.isnull().T.any()][['id','col1']]#取两表差集(相当去重)
--方法2取两表差集合(相当去重)
d1=df1['id'].values.tolist()
d2=df2['id'].values.tolist()
data=[]
for i in d1:
if i in d2:
data=data
else:
data.append(i)
df3=df1[df1['id'].isin(data)]
----------------------------------------------dataframe求均值-------------------------------------------------------------
dataframe按列求均值
df_mean=df[['col1','col2']].mean(axis=0)#数据框dataframe求均值
print(df_mean)
dataframe按行求均值
df_mean=df[['col1','col2']].mean(axis=1)#数据框dataframe求均值
print(df_mean)
-------------------------------------------------dataframe求svd------------------------------------------------------------
dataframe按列求std
df_std=df[['col1','col2']].std(axis=0)#数据框dataframe求均值
print(df_std)
dataframe按行求std
df_std=df[['col1','col2']].std(axis=1)#数据框dataframe求均值
print(df_std)
---------------------------------------------#dataframe求var(标准差)-------------------------------------------------------
dataframe按列求var(标准差)
df_var=df[['col1','col2']].var(axis=0)#数据框dataframe求均值
print(df_var)
dataframe按行求var(标准差)
df_var=df[['col1','col2']].var(axis=1)#数据框dataframe求均值
print(df_var)
--------------------------------------------#dataframe按列求max-----------------------------------------------------------
dataframe按列求max
df_max=df[['col1','col2']].max(axis=0)#数据框dataframe求均值
print(df_max)
dataframe按行求max
df_max=df[['col1','col2']].max(axis=1)#数据框dataframe求均值
print(df_max)
----------------------------------------------#dataframe求min-------------------------------------------------------------
dataframe按列求min
df_min=df[['col1','col2']].min(axis=0)#数据框dataframe求均值
print(df_min)
dataframe按行求min
df_min=df[['col1','col2']].min(axis=1)#数据框dataframe求均值
print(df_min)
-------------------------------------------------#dataframe求sum---------------------------------------------------------
dataframe按列求sum
df_sum=df[['col1','col2']].sum(axis=0)#数据框dataframe求均值
print(df_sum)
dataframe按行求sum
df_sum=df[['col1','col2']].sum(axis=1)#数据框dataframe求均值
print(df_sum)
---------------------------------------------------#dataframe求count------------------------------------------------------------
dataframe按列求count
df_count=df[['col1','col2']].count(axis=0)#数据框dataframe求均值
print(df_count)
dataframe按行求count
df_count=df[['col1','col2']].count(axis=1)#数据框dataframe求均值
print(df_count)
------------------------------------------------#dataframe求中位数--------------------------------------------------------------
df_median=df[['col1','col2']].median(axis=0)#数据框dataframe求均值
print(df_median)
dataframe按行求median
df_median=df[['col1','col2']].median(axis=1)#数据框dataframe求均值
print(df_median)
-------------------------------------------------dataframe按describe求常规统计量,不能用axis=1或者axis=2------------------------------
df_describe=df[['col1','col2']].describe()#数据框dataframe求均值
print(df_describe)
------------------------------------------------#dataframe按corr()求相关系数,不能用axis=1或者axis=2-----------------------------------
df_corr=df.corr()#数据框dataframe求均值
print(df_corr)
----------------------------------------------------dataframe判断重复数据------------------------------------------------------------
重复值处理
import pandas as pd # 导入pandas库
生成重复数据
data1 = ['a', 3]
data2 = ['b', 2]
data3 = ['a', 3]
data4 = ['c', 2]
df = pd.DataFrame([data1, data2, data3, data4], columns=['col1', 'col2'])
print (df)
判断重复数据
isDuplicated = df.duplicated() # 判断重复数据记录
print (isDuplicated) # 打印输出
-------------------------------------------------------dataframe删除重复值------------------------------------------------------------
删除重复值
new_df1 = df.drop_duplicates() # 删除数据记录中所有列值相同的记录
new_df2 = df.drop_duplicates(['col1']) # 删除数据记录中col1值相同的记录
new_df3 = df.drop_duplicates(['col2']) # 删除数据记录中col2值相同的记录
new_df4 = df.drop_duplicates(['col1', 'col2']) # 删除数据记录中指定列(col1/col2)值相同的记录
print (new_df1) # 打印输出
print (new_df2) # 打印输出
print (new_df3) # 打印输出
print (new_df4) # 打印输出
--------------------------------------------------------转义字符--------------------------------------------------------------------
- 转义字符 '\'
转义字符\可以转义很多字符,比如\n表示换行,\t表示制表符;字符\本身也要转义,所以\\表示的就是一个\
print ('It \'s a dog!')
print ("hello world!\nhello Python!")
print ('\\t\\n')
原样输出引号内字符串可以在引号前加r(原始字符串;注意原始字符串不能以单个反斜杠结尾)
print (r'\t\n')
-------------------------------------------------------日期天数相减------------------------------------------------------------------
import datetime
someDay = datetime.date(1999,2,10)
anotherDay = datetime.date(1999,2,15)
deltaDay = anotherDay - someDay
deltaDay.days
----------------------------------------------------------列表(list)简单操作---------------------------------------------------------
列表(list)用来存储一连串元素的容器,列表用[]来表示,其中元素的类型可不相同。
students = ["ming", "hua", "li", "juan", "yun", 3]
print (students)
type(students)
列表索引和切片
索引从0开始,含左不含右
print ('[4]=', students[4])
print ('[-4]=', students[-4])
print ('[0:4]=', students[0:4])
print( '[4:]=', students[4:])
print ('[0:4:2]=', students[0:4:2])
print ('[-5:-1:]=', students[-5:-1:])
print ('[-2::-1]=', students[-2::-1])
修改列表
students[3] = "小月"
print (students[3])
students[5]="小楠"
print (students[5])
students[5]=19978
print (students[5])
插入元素
students.append('han') # 只能添加一个添加到尾部
students.extend(['long', 'wan'])#可以添加多个到尾部
print (students)
scores = [90, 80, 75, 66]
students.insert(6, scores) # 添加到指定左边第一位置,不会去重
students
删除元素
print (students.pop(1)) # 该函数返回被弹出的元素,不传入参数则删除最后一个元素
print (students)
判断元素是否在列表中等
print( 'wan' in students)
print ('han' not in students)
students.count('wan')#计算个数
students.index('wan')#向量在哪个位置
range函数生成整数列表
print (range(10))
print (range(-5, 5))
print (range(-10, 10, 2))
print (range(16, 10, -1))
---------------------------------------------元组(tuple)简单操作--------------------------------------------------------------------
### 3.2.2 元组(tuple)
元组类似列表,元组里面的元素也是进行索引计算。列表里面的元素的值可以修改,而元组里面的元素的值不能修改,只能读取。元组的符号是()。
studentsTuple = ("ming", "jun", "qiang", "wu", scores)
studentsTuple
studentsTuple=['aaa','dddd']
studentsTuple1=('aaa','dddd')
print(studentsTuple)
type(studentsTuple1)
判断异常
try:
studentsTuple[1] = 'fu'
except TypeError:
print ('TypeError')
scores[1]= 100
studentsTuple
'ming' in studentsTuple
studentsTuple[0:4]
studentsTuple.count('ming')
studentsTuple.index('jun')
len(studentsTuple)
-------------------------------------------集合(set)简单操作-------------------------------------------------------------------------
### 3.2.3 集合(set),去重功能
Python中集合主要有两个功能,一个功能是进行集合操作,另一个功能是消除重复元素。 集合的格式是:set(),其中()内可以是列表、字典或字符串,因为字符串是以列表的形式存储的
students = ["ming", "hua", "li", "juan", "yun", 3]
studentsSet = set(students)
print (studentsSet)
studentsSet.add('xu')#集合增加一个字符
print (studentsSet)
studentsSet.remove('xu')#集合减掉一个字符
print (studentsSet)
a = set("abcnmaaaaggsng")
print ('a=', a)
b = set("cdfm")
print ('b=', b)
交集
x = a & b
print( 'x=', x)
并集
y = a | b
print ('y=', y)
差集
z = a - b
print( 'z=', z)
去除重复元素
new = set(a)
print( z)
------------------------------------------------------------字典(dict)简单操作--------------------------------------
### 3.2.4字典(dict)
Python中的字典dict也叫做关联数组,用大括号{}括起来,在其他语言中也称为map,使用键-值(key-value)存储,具有极快的查找速度,其中key不能重复。
k = {"name":"weiwei", "home":"guilin"}
print (k["home"])
print( k.keys())#键
print( k.values())#值
a={"success":True,"reason_code":"200","reason_desc":"获取成功",
"rules":[{"rule_id":"1062274","score":7,"conditions":[{"address_a_value":
"南通市","address_a":"mobile_address","address_b":"true_ip_address","address_b_value":"南京市","type":"match_address"}]}]}
print(a["success"])
添加、修改字典里面的项目
k["like"] = "music"
k['name'] = 'guangzhou'
print (k)
判断key是否存在
print ('name' in k)
has_key方法在python2中是可以使用的,在python3中删除了。
print (k.has_key('name'))
改为:
if 'name' in k:
print("Yes")
k.get('edu', -1) # 通过dict提供的get方法,如果key不存在,可以返回None,或者自己指定的value
删除key-value元素
k.pop('like')
print (k)
--------------------------------------------------------if语句-----------------------------------------------------------------------
### 3.3.2 分支结构:Python中if语句是用来判断选择执行哪个语句块的
=============================================================================
if <True or Flase表达式>:
执行语句块
elif <True or Flase表达式>:
执行语句块
else: # 都不满足
执行语句块
=============================================================================
elif子句可以有多条,elif和else部分可省略
salary = 30000
if salary > 10000:
print ("Wow!!!!!!!")
elif salary > 5000:
print ("That's OK.")
elif salary > 3000:
print ("5555555555")
else:
print ("..........")
-----------------------------------------------for 循环------------------------------------------------------------------------------
- for 循环
=============================================================================
for (条件变量) in (集合):
执行语句块
=============================================================================
“集合”并不单指set,而是“形似”集合的列表、元组、字典、数组都可以进行循环
条件变量可以有多个
heights = {'Yao':226, 'Sharq':216, 'AI':183}
for i in heights:
print (i, heights[i])
for key, value in heights.items():-Python3 不能使用dict.iteritems(),改为dict.items()
for key, value in heights.items():
print(key, value)
total = 0
for i in range(1, 101):
total += i
print (total)
-------------------------------------------------------while 循环简单操作-------------------------------------------------------------
### 3.3.3 循环结构
while 循环
=============================================================================
while <True or Flase表达式>:
循环执行语句块
else: # 不满足条件
执行语句块
=============================================================================
else部分可以省略
a = 1
while a < 10:
if a <= 5:
print (a)
else:
print ("Hello")
a = a + 1
else:
print ("Done")
---------------------------------------------- break、continue和pass-----------------------------------------------------------------
*** 练习:使用循环和分支结构输出20以内的奇数
### 3.3.4 break、continue和pass
break:跳出循环
continue:跳出当前循环
pass:占位符,什么也不做
for i in range(1, 5):
if i == 3:
break
print (i)
for i in range(1, 5):
if i == 3:
continue
print (i)
for i in range(1, 5):
if i == 3:
pass
print (i)
---------------------------------------------------------排序相关--------------------------------------------------------------------
元组排序
myList = [-1, 2, -3, 4, -5, 6, 7]
print(sorted(myList))#升序排序
print(sorted(myList,reverse=True))#降序排序
### 5.1.5 数据框排序
#### 1. 排序
sample=pd.DataFrame({'name':['Bob','Lindy','Mark','Miki','Sully','Rose'],
'score':[98,78,87,77,77,np.nan],
'group':[1,1,1,2,1,2],})
sample
按'score'这个字段降序排序
sample.sort_values('score',ascending=False,na_position='last')
按'score'这个字段升序排序
sample.sort_values('score',ascending=True,na_position='last')
sample.sort_values(['group','score'])
sample.sort_values(['group','score'])
-------------------------------------------------------数据框分组汇总--------------------------------------
### 5.1.6 数据框分组汇总
sample = pd.read_csv(r'D:\《Python数据科学:技术详解与商业实践》源代码文件\Python_book\5Preprocessing\sample.csv', encoding='gbk')
sample.head()
sample.groupby('class')[['math']].min()
sample.groupby('class') [['math']].max()
sample.groupby('class')[['math']].count()
sample.groupby('class')[['math']].mean()
sample.groupby('class')[['math']].sum()
sample.groupby(['grade','class'])[['math']].mean()#两个字段分组
sample.groupby(['grade'])['math','chinese'].mean()
sample.groupby('class')['math'].agg(['mean','min','max'])
df = sample.groupby(['grade','class'])['math','chinese'].agg(['min','max'])
df
-------------------------------------------------------拆分、堆叠列---------------------------------------------
### 5.1.7 拆分、堆叠列
table = pd.DataFrame({'cust_id':[10001,10001,10002,10002,10003],
'type':['Normal','Special_offer',
'Normal','Special_offer','Special_offer'],
'Monetary':[3608,420,1894,3503,4567]})
pd.pivot_table(table,index='cust_id',columns='type',values='Monetary')
pd.pivot_table(table,index='cust_id',columns='type',values='Monetary')
pd.pivot_table(table,index='cust_id',columns='type',values='Monetary',
fill_value=0,aggfunc='sum')
table1 = pd.pivot_table(table,index='cust_id',
columns='type',
values='Monetary',
fill_value=0,
aggfunc=np.sum).reset_index()
table1
pd.melt(table1,
id_vars='cust_id',
value_vars=['Normal','Special_offer'],
value_name='Monetary',
var_name='TYPE')
--------------------------------------------------数据框横向连接/纵向连接-------------------------------------------
### 5.1.3 横向连接
df1 = pd.DataFrame({'id':[1,2,3],
'col1':['a','b','c']})
df2 = pd.DataFrame({'id':[4,3],
'col2':['d','e']})
#### 1. 数据框内连接,类似inner join
df1.merge(df2,how='inner',left_on='id',right_on='id')
#### 2. 左连接,类似left join
df1.merge(df2,how='left',left_on='id',right_on='id')
df1.merge(df2,how='left',on='id')
#### 3. 右连接,类似right join
df1.merge(df2,how='right',on='id')
#### 3. 行索引连接
df1 = pd.DataFrame({'id1':[1,2,3],
'col1':['a','b','c']},
index = [1,2,3])
df2 = pd.DataFrame({'id2':[1,2,3],
'col2':['aa','bb','cc']},
index = [1,3,2])
pd.concat([df1,df2],axis=1)
df1.join(df2)
### 5.1.4 纵向合并
df1 = pd.DataFrame({'id':[1,1,1,2,3,4,6],
'col':['a','a','b','c','v','e','q']})
df2 = pd.DataFrame({'id':[1,2,3,3,5],
'col':['x','y','z','v','w']})
pd.concat([df1,df2],ignore_index=True,axis=0)
pd.concat([df1,df2],ignore_index=True).drop_duplicates()#纵向合并并且去重
df3 = df1.rename(columns = {'col':'new_col'})#数据框某列改名
pd.concat([df1,df3],ignore_index=True).drop_duplicates()
----------------------------------------------------------------数据整合--------------------------------------
5.1 数据整合
### 5.1.1 行列操作
#### 1. 单列
import pandas as pd
import numpy as np
sample = pd.DataFrame(np.random.randn(4, 5),
columns=['a','b','c','d','e'])
print(sample)
sample['a']#数据框选择单列
sample[['a','b']]#数据框随意选择两列的
sample[0:2]#数据框选择前两行
sample.iloc[1:2,0:2]#数据框iloc选择行与列的写法
sample.ix[:,'a'] # NOTE: .ix was removed in pandas 1.0; use sample.loc[:, 'a'] instead
sample[['a']]
#### 2. 选择多行和多列
sample.ix[0:2, 0:2]
#### 3. 创建、删除列
sample['new_col1'] = sample['a'] - sample['b']
sample
数据框增加两列
sample_new=sample.assign(new_col2 = sample['a'] - sample['b'],
new_col3 = sample['a'] + sample['b'])
sample_new
sample.drop('a',axis=1)#数据框删除某一列
### 5.1.2 条件查询
sample =pd.DataFrame({'name':['Bob','Lindy','Mark',
'Miki','Sully','Rose'],
'score':[98,78,87,77,65,67],
'group':[1,1,1,2,1,2],})
sample
#### 1. 单条件
sample.score > 70
sample[sample.score > 70]
sample[sample.score>70]
#### 2. 多条件
sample[(sample.score > 70) & (sample.group ==1)]
#### 3. 使用query
sample.query('score > 90')
sample.query('(group ==2) |(group == 1)')
#### 4. 其他数据框sample['score'].后面跟聚合函数sum,mean,count,between
sample[sample['score'].between(70,80,inclusive=True)]
sample[sample['name'].isin(['Bob','Lindy'])]
sample[sample['name'].str.contains('[M]+')]#数据框字符过滤,有点像live
### 5.1.8 赋值与条件赋值
#### 1. 赋值
sample = pd.DataFrame({'name':['Bob','Lindy','Mark',
'Miki','Sully','Rose'],
'score':[99,78,999,77,77,np.nan],
'group':[1,1,1,2,1,2],})
sample.score.replace(999,np.nan)
sample.replace({'score':{999:np.nan},
'name':{'Bob':np.nan}})
#### 2. 条件赋值
def transform(row):
    """Map a row's group code to its class label.

    Group 1 -> 'class1', group 2 -> 'class2'; any other group value yields
    None (matching the original's implicit fall-through).
    """
    labels = {1: 'class1', 2: 'class2'}
    return labels.get(row['group'])
sample.apply(transform,axis=1)
sample.assign(class_n = sample.apply(transform,axis=1))
sample = sample.copy()
sample.loc[sample.group==1,'class_n']='class1'
sample.loc[sample.group==2,'class_n']='class2'