Python Data Anlysis Notebook

Python Data Anlysis Notebook

SublimeText File

Data Frame

read file

pd.read_csv(path,index_col=0,chunksize=1000)
np.shape()#读取矩阵维度
pd.DataFrame()
df[['var','var']] #-->dataframe
df[1:4]
df.head() #无数据默认前五条
#how to refer col and row
df.loc[label]
df.loc[[row],[col]] #-->dataframe
df.iloc[index]

iterating over data

result=[]
total=0
for chunck in pd.read_csv(path,chunksize=1000):
    result.append(sum(chunck['xcolumn']))
    #另一种方法
    total+=sum(chunck['x'])
total=sum(result)

NumPy

np.logical_and(x,y)
np.logical_or()
np.logical_not()

Loop

enumerate(iterable) #--> index of the value
for index,value in enumerate(iterable,start=10):
    print(index,value)
loop over a dictionary:
dic={}
for key,value in dic.items():
    print(key,value)
import numpy as np
array1=np.array()
array2=np.array()
array_total=np.array([array1,array2])
for val in np.nditer(array_total):
print(val)

import pandas as pd
df=pd.read.csv()
for label,row in df.iterrows():
    print(label)
    print (row)

add a new column in to DataFrame
for lab,row in df.iterrows():
    df.loc[lab,'name_length']=len(df['country'])
print(df)
#更简单的方法是使用apply function
df['name_length']=df['country'].apply(len)
apply(str.upper)

pseudo-random
import numpy as np
np.random.seed(123)
coin=np.random.randint(0,2)

'''
函数将一个数据集合（链表，元组等）中的所有数据进行下列操作：
用传给 reduce 中的函数 function（有两个参数）先对集合中的第 1、2 个元素进行操作，
得到的结果再与第三个数据用 function 函数运算，最后得到一个结果。
'''
from functools import reduce
reduce(function,iterable)
'''
map() 会根据提供的函数对指定序列做映射。
第一个参数 function 以参数序列中的每一个元素调用 function 函数，
返回包含每次 function 函数返回值的新列表。
'''
map(function,iterable,...)
'''
filter() 函数用于过滤序列，过滤掉不符合条件的元素，返回由符合条件元素组成的新列表。
该接收两个参数，第一个为函数，第二个为序列，
序列的每个元素作为参数传递给函数进行判，然后返回 True 或 False，最后将返回 True 的元素放到新列表中。
'''
filter(function,iterable)

def func():
try:
    return function
except [TypeError]:
    return print()

# 抛出异常
raise ValueError

Iterators

 #iterating over iterables
 word='DA'
 it=iter(word)
 next(it)
 print(*it) #->once per time

 zip() #-> return an iterator of tuples
 list(zip(lst1,lst2))
 print(*zipvarable)

List comprehensions

lst=[1,2,3,4]
lst_add_1=[num+1 for num in lst]
lst=[num for num in range(11)]

conditionals in comprehensions
[num**2 for num in range(10) if num%2==0]

dictionary comprehensions
{num:-num for num in range(10)}

generator->analogous generator object||iterator
(num**2 for num in range(10) if num%2==0)

generator function->yield
def num_sequence(n):
    i=0
    while i<n:
        yield i
        entryi+=1

Importing Data in python

 file_name=open('file.txt',mode='r') #only to read
 #'w' is to write
 text=file_name.read()
 file_name.close()
 with open('datacamp.csv','r') as datacamp #context manager
 file_objeect.readline()
 ! ls #will display the contents of your current directory.

flat files (contrain record,row of fields or attributes, column is feature or attributes)

#can have a header
#numpy:storing numerical data
import numpy as np
filename='MNIST.txt'
data=np.loadtxt(filename,delimiter=',',skiprows=1,usecols=[0,2],dtype=str) #跳过第一行 #只要第一和三列数据

#import with different type of data
np.genfromtext(filename,delimiter=',',names=True,dtype=None)

import pandas as pd
filename='winequality-red.csv'
data=pd.read_csv(filename,nrows=5,header=None)#first 5 rows of the file,there is no header in this file

#convert dataframe to a numpy array
data_array=data.values

Other files

data=pd.ExcelFile(filename)
print(data.sheet_names)#figure sheet name out
df1=data.parse('sheetname')#sheet name
df2=data.parse(0,skiprows=[0],parse_cols[0],names=['Country'])#sheet index

#sas file
import pandas as pd
from sas7bdat import SAS7DBAT
with sas7bdat('file.sas7bdat') as file:
    df_sas=file.to_data_drame()

#stata file_name
import pandas as pd
data=pd.read_stata('file.dta')

#HDF5 file
import h5py
file_name='file.hdf5'
data=h5py.File(file_name,'r')
for key in data.keys():
    print(key)

#matlab file
import scipy.io
scipy.io.loadmat() #- read.mat files
scipy.io.savemet() #- write.mat files

Relational Database

# Open engine in context manager
# Perform query and save results to DataFrame: df

engine = create_engine('sqlite:///Chinook.sqlite')
with engine.connect() as con:
    rs = con.execute("SELECT LastName, Title FROM Employee")
    df = pd.DataFrame(rs.fetchmany(size=3))
    df.columns = rs.keys()
# Print the length of the DataFrame df
print(len(df))
# Print the head of the DataFrame df
print(df.head())

# Import packages
from sqlalchemy import create_engine
import pandas as pd
# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')
# Execute query and store records in DataFrame: df
df = pd.read_sql_query("SELECT * From Album", engine)
# Print head of DataFrame
print(df.head())

enumerate #返回数据和其位置

 test=[1,2,3,4,5]
 for i, num in enumerate(test):
 print (i,m)

Clean Data

1. Sublime笔记整理

    # Print the head of df
    print(df.head())
    # Print the tail of df
    print(df.tail())
    # Print the shape of df
    print(df.shape)
    # Print the columns of df
    print(df.columns)
    # Rename the columns
    gapminder_melt.columns = ['country', 'year', 'life_expectancy']
    # Print the info of df,provides important information about a DataFrame
    print(df.info())
    # Print the value_counts for 'State'
    print(df.State.value_counts(dropna=False))
    # Print the value counts for 'Site Fill'
    print(df['Site Fill'].value_counts(dropna=False))

Transpose

# Melt airquality: airquality_melt，The id_vars represent the columns of the data you do not want to melt (i.e., keep it in its current shape),
# while the value_vars represent the columns you do wish to melt into rows.
airquality_melt = pd.melt(frame=df,id_vars=['Month','Day'], value_vars=['Ozone','Solar.R','Wind','Temp']，var_name='measurement', value_name='reading')

Pivot_Table

#（非重复数据）
# Pivot airquality_melt: airquality_pivot
airquality_pivot = airquality_melt.pivot_table(index=['Month','Day'], columns='measurement', values='reading')
#（重复数据）
# Pivot table the airquality_dup: airquality_pivot
airquality_pivot = airquality_dup.pivot_table(index=['Month', 'Day'], columns='measurement', values='reading', aggfunc=np.mean)

层次化索引（PIVOT之后）

# Print the index of airquality_pivot
print(airquality_pivot.index)
# Reset the index of airquality_pivot: airquality_pivot_reset
airquality_pivot_reset = airquality_pivot.reset_index()

Melting and Parsing

# Melt tb: tb_melt
tb_melt = pd.melt(frame=tb,id_vars=['country', 'year'])
# Create the 'gender' column
tb_melt['gender'] = tb_melt.variable.str[0]
# Melt ebola: ebola_melt
ebola_melt = pd.melt(ebola, id_vars=['Date', 'Day'], var_name='type_country', value_name='counts')
# Create the 'str_split' column
ebola_melt['str_split'] = ebola_melt.type_country.str.split('_')
# Create the 'type' column
ebola_melt['type'] = ebola_melt['str_split'].str.get(0)
# Create the 'country' column
ebola_melt['country'] = ebola_melt['str_split'].str.get(1)

Concatenation Data

# Concatenate uber1, uber2, and uber3: row_concat
row_concat = pd.concat([uber1,uber2,uber3])
# Concatenate ebola_melt and status_country column-wise: ebola_tidy
ebola_tidy = pd.concat([ebola_melt,status_country],axis=1)

#Tons of files to concat
# Import necessary modules
import glob
import pandas as pd
# Write the pattern: pattern
pattern = '*.csv'
# Save all file matches: csv_files
csv_files = glob.glob(pattern)
# Print the file names
print(csv_files)
# Load the second file into a DataFrame: csv2
csv2 = pd.read_csv(csv_files[1])
# Print the head of csv2
print(csv2.head())

2. 函数

df.sample() 用法参考

 df.sample(frac=0.5,replace=True,random_state=123) #按比例抽取，是否有序放回,设置随机种子

df.isnull()

 df.isnull().values.any() #.values返回array，.any()返回True False
 df.isnull().sum() #sum null

df.groupby()

    #groupby后面聚类算法__猜测只能为num__
    df.groupby('col_name').sum()

Pandas Foundations

1. Inspectiong Data

  import pandas as pd
  type(df)
  type(df.columns)
  type(df.index)
  df.describe()
  df.shape
  df.columns
  df.index
  df.iloc[:5,:]
    df.corr() #only numbers
  # broadcasting, assigning scala value to col slice broadcasts value to each rows
  df.iloc[::3,-1]=np.nan
  #Series
  low=df['Low']
  type(low)
  low.head()
  lows=low.values
  type(lows)
  #View the first few and last few rows of a DataFrame
  df.head()
  df.tail()

count values

 df['col_name'].value_counts()
 df.col_name.value_counts()

show unique value

 df['col_name'].unique()
 df.col_name.unique()

2. Numpy and Pandas Together

  import numpy as np
  #.values to represent a DataFrame df as a NumPy array.
  np_vals = df.values
  # np.log10() method to compute the base 10 logarithm
  np_vals_log10 = np.log10(np_vals)
  df_log10 = np.log10(df)
  [print(x, 'has type', type(eval(x))) for x in ['np_vals', 'np_vals_log10', 'df', 'df_log10']]

3. Zip list to build a df

  #Zip the 2 lists together into one list of (key,value) tuples: zipped
  zipped = list(zip(list_keys,list_values))
  print(zipped)
  data = dict(zipped)
  df = pd.DataFrame(data)
  print(df)

Labeling Data

  list_labels = ['year', 'artist', 'song', 'chart weeks']
  # Assign the list of labels to the columns attribute: df.columns
  df.columns=list_labels

4. Reading & Saving

Reading a flat file

  df=pd.read_csv(file_name,header=None,names=col_name,na_values={'column_name':[' -1']},parse_date=[[0,1,2]])
  df.index=df['date_time']
  df.index.name='key_date'
  #trimming redundant columns
  cols=['sunspots','definite']
  df=df[cols]

  # header=0 to rename the column labels
  new_labels = ['year','population']
  df2 = pd.read_csv(data_file, header=0, names=new_label

Delimiters, headers, and extensions

  df2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')
  # Save the cleaned up DataFrame to a CSV file without the index
  df2.to_csv(file_clean, index=False)
  # Save the cleaned up DataFrame to an excel file without the index
  df2.to_excel('file_clean.xlsx', index=False)

5. Plot

Ployting series using pandas

  # Create a plot with color='red'
  df.plot(color='red')
  # Add a title
  plt.title('Temperature in Austin')
  # Specify the x-axis label
  plt.xlabel('Hours since midnight August 1, 2010')
  # Specify the y-axis label
  plt.ylabel('Temperature (degrees F)')
  # Display the plot
  plt.show()

  #fix scales
  plt.yscale('log')
  #legend
  df.plot(legend=True)
  #axis
  plt.axis(('2001','2002',0,100))
  #saving plot
  plt.savefig()

Plotting DataFrame

  # Plot all columns (default)
  df.plot()
  plt.show()

  # Plot all columns as subplots
  df.plot(subplots=True)
  plt.show()

  # Plot just the Dew Point data
  column_list1 = ['Dew Point (deg F)']
  df[column_list1].plot()
  plt.show()
  # Plot the Dew Point and Temperature data, but not the Pressure data
  column_list2 = ['Temperature (deg F)','Dew Point (deg F)']
  df[column_list2].plot()
  plt.show()

Plots

      # s for size,c for color,alpha for opacity
  df.plot(kind='scatter',x='HP',y='MPG',s=sizes,alpha=,c=)
  df[cols].plot(kind='box',subplots=True)
      # Put the x-axis on a logarithmic scale
      plt.xscale('log')

  # Create a list of y-axis column names: y_columns
  y_columns = ['AAPL','IBM']
  df.plot(x='Month', y=y_columns)
  plt.title('Monthly stock prices')
  plt.ylabel('Price ($US)')
  plt.show()
      plt.hist(x,bins=int)
      # clears the entire current figure与其所有轴，但是窗口打开，这样它可以被重复使用其他地块。
      plt.clf()

  #plotting the PDF, only normed=True; CDF, cumulative=True in addition to normed=True

  # This formats the plots such that they appear on separate rows
  fig, axes = plt.subplots(nrows=2, ncols=1)

  # Plot the PDF
  df.fraction.plot(ax=axes[0], kind='hist', normed=True,bins=30, range=(0,.3))
  plt.show()

  # Plot the CDF
  df.fraction.plot(ax=axes[1], kind='hist', normed=True, bins=30, cumulative=True, range=(0,.3))
  plt.show()

Appendix

#Histogram
# Import matplotlib.pyplot
import matplotlib.pyplot as plt
# Describe the column
df['Existing Zoning Sqft'].describe()
# Plot the histogram
df['Existing Zoning Sqft'].plot(kind='hist', rot=70, logx=True, logy=True)

#Boxplot
# Create the boxplot
df.boxplot(column='initial_cost', by='Borough', rot=90)

#Scatter Plot
# Create and display the first scatter plot
df.plot(kind='scatter', x='initial_cost', y='total_est_fee', rot=70)

6. Statistical exploratory data analysis

pandas和numpy涉及到axis参数的问题

参阅： 如何理解pandas中axis参数

  #summary stat of numerical cols of DataFrame
  df.describe()
  df.max()
  df.min()
  df.quantile([0.05,0.95])
  df.std()

  #axis='columns' computes the mean across all columns per row.
  mean = df.mean(axis='columns')
  mean.plot()

Separating populations with Boolean indexing

  df['col'].unique()
  df[df['origin'] == 'US'].count()

小case

  # Display the box plots on 3 separate rows and 1 column
  fig, axes = plt.subplots(nrows=3, ncols=1)

  # Generate a box plot of the fare prices for the First passenger class
  titanic.loc[titanic['pclass'] == 1].plot(ax=axes[0], y='fare', kind='box')

  # Generate a box plot of the fare prices for the Second passenger class
  titanic.loc[titanic['pclass'] == 2].plot(ax=axes[1], y='fare', kind='box')

  # Generate a box plot of the fare prices for the Third passenger class
  titanic.loc[titanic[ 'pclass']==3].plot(ax=axes[2], y='fare', kind='box')

  # Display the plot
  plt.show()

7. Time Series

Creating DatetimeIndex

  # Prepare a format string: time_format
  time_format = '%Y-%m-%d %H:%M'

  # Convert date_list into a datetime object: my_datetimes
  my_datetimes = pd.to_datetime(date_list, format=time_format)

  # Construct a pandas Series using temperature_list and my_datetimes: time_series
  time_series = pd.Series(temperature_list, index=my_datetimes)

Extracting data using datetime range

  ts1 = ts0.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00']
  ts2 = ts0.loc['2010-07-04']
  ts3 = ts0.loc['2010-12-15':'2010-12-31']

Reindexing

Input	Description
'min','T'	minute
'H'	hour
'D'	day
'B'	business day
'W'	week
'M'	month
'Q'	quarter
'A'	year

Resampling time series

Downsampling: reduce datetime rows to slower freq. day->week
Upsampling: increase datetime rows to faster freq. day->hour
```
df.resample('D').mean()
df.resample('4H').ffill()
```

8. 防盗

Seaborn 参考

Format Setting

有5个seaborn的主题
- darkgrid 黑色网格（默认）
- whitegrid 白色网格
- dark 黑色背景
- white 白色背景
- ticks 四周都有刻度线的白背景
```
sns.set_style("whitegrid")
```
画布大小
```
plt.figure(figsize=(7,7))
```
坐标轴倾斜45°
```
plt.xticks(rotation=45)
```
Bar Plot

seaborn.barplot (x=None, y=None, hue=None, data=None, order=None, hue_order=None, estimator=, ci=95, n_boot=1000, units=None, orient=None, color=None, palette=None, saturation=0.75, errcolor='.26', errwidth=None, capsize=None, dodge=True, ax=None, **kwargs)
- x,y,hue : names of variable in data or vector data
- data : DataFrame,array or list of array,optional
- color :matplotlib color,optional
- palette : palette name,list, or dict,optional 参考调色板教程
- ax : matplotlib Axes,optional
- hue: 类似于label
```
#hue也可以换为['female','male']，palette=sns.cubehelix_palette(120)
ax=sns.barplot(x=data.gender.value_counts().index,y=data.gender.value_counts(), hue=data['gender'].value_counts().index,palette="Greens_d")
plt.legend(loc=8)
plt.xlabel('Gender')
plt.ylabel('Frequency') #另一种写法ax.set_ylabels()
plt.title('Show of Gender Bar Plot')
plt.show()
```

Catplot 参考文章（理解为高级汇总,kind='types')

snsborn.despine

plt.figure(figsize=(10,10))
g=sns.catplot(x='gender',y='math score',data=data,hue='lunch',kind='bar',height=4, aspect=1)
g.despine(left=True) #移除画布框线
plt.tight_layout() #紧缩排列
plt.show()

f,ax=plt.subplots(figsize=(9,10))
sns.barplot(y=data.gender.value_counts().index,x=data.gender.value_counts()
            ,label='gender',alpha=0.5,color='red')
sns.barplot(y=data['race/ethnicity'].value_counts().index,x=data['race/ethnicity'].value_counts(),
              color='blue',label='race',alpha=0.7)
ax.legend(loc='upper right',frameon=True)
ax.set(xlabel='Gender , Race/Ethnicity',ylabel='Groups',title="Gender vs Race/Ethnicity ")
plt.show()

Point Plot

seaborn.pointplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None, estimator=, ci=95, n_boot=1000, units=None, markers='o', linestyles='-', dodge=False, join=True, scale=1, orient=None, color=None, palette=None, errwidth=None, capsize=None, ax=None, **kwargs)
- x, y, hue : names of variables in data or vector data, optional
- data : DataFrame, array, or list of arrays, optional
- order, hue_order : lists of strings, optional
- markers : string or list of strings, optional
- linestyles : string or list of strings, optional
- color : matplotlib color, optional
- palette : palette name, list, or dict, optional
- ax : matplotlib Axes, optional
```
f,ax1=plt.subplots(figsize=(25,10))
sns.pointplot(x=np.arange(1,191), y=data[(data['Race/Ethnicity']=='group B')].Math_Score,color='blue',alpha=0.6)
sns.pointplot(x=np.arange(1,191),y=data[(data['Race/Ethnicity']=='group B')].Reading_Score,color='pink',alpha=0.2)
ax1.set(xlabel='Group B index State',ylabel='Frequency',title="Group B Math Score & Reading_Score") #也可以用plt.xlabel
plt.xticks(rotation=90) #轴旋转
plt.grid() #加网线
plt.show()
```
```
ax=sns.pointplot(x='Writing_Score',y='Math_Score',hue='Gender',data=data,markers=["o", "x"],linestyles=["-", "--"])
plt.legend(loc='lower right')
plt.xticks(rotation=90)
plt.show()
```

Joint Plot 包含核密度估计 KDE图

seaborn.jointplot(x, y, data=None, kind='scatter', stat_func=None, color=None, height=6, ratio=5, space=0.2, dropna=True, xlim=None, ylim=None, joint_kws=None, marginal_kws=None, annot_kws=None, **kwargs)
- x, y : strings or vectors
- data : DataFrame, optional
- kind : { “scatter” | “reg” | “resid” | “kde” | “hex” }, optional
- color : matplotlib color, optional
- dropna : bool, optional

plt.figure(figsize=(10,10))
#kind='kde' 核密度图
sns.jointplot(x=np.arange(1,191),y=data[(data['Race/Ethnicity']=='group B')].Math_Score,color='lime',kind='hex',alpha=0.8)
plt.xlabel('Group B index State')
plt.ylabel('Frequency')
plt.title('Group B Frequency Race/Ethnicity')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

#散点图+KDE图
data['Race/Ethnicity'].unique()
len(data[(data['Race/Ethnicity']=='group B')].Math_Score)
plt.figure(figsize=(10,10))
sns.jointplot(x=np.arange(1,191),y=data[(data['Race/Ethnicity']=='group B')].Reading_Score,color='k').plot_joint(sns.kdeplot, zorder=0, n_levels=6)
plt.xlabel('Group B index State')
plt.ylabel('Frequency')
plt.title('Group B Math Score & Reading_Score')
plt.xticks(rotation=90)
plt.show()

KDE Plot (jointplot中可以kind='kde')

seaborn.kdeplot(data, data2=None, shade=False, vertical=False, kernel='gau', bw='scott', gridsize=100, cut=3, clip=None, legend=True, cumulative=False, shade_lowest=True, cbar=False, cbar_ax=None, cbar_kws=None, ax=None, **kwargs)
- data : 1d array-like
- data2: 1d array-like, optional
- shade : bool, optional
- vertical : bool, optional
- kernel : {‘gau’ | ‘cos’ | ‘biw’ | ‘epa’ | ‘tri’ | ‘triw’ }, optional
- cut : scalar, optional
- legend : bool, optional
- ax : matplotlib axes, optional

Pie chart

plt.figure(figsize=(7,7))
#explode是偏离轴心（可小数），shadow是阴影
plt.pie(data['Race/Ethnicity'].value_counts().values,explode=[0,0,0.0,0,0.1],labels=data['Race/Ethnicity'].value_counts().index,colors=['pink','grey','yellow','lime','brown'],autopct='%1.1f%%',shadow=True)
plt.title('Race/Ethnicity According Analysis',color='black',fontsize=10)
plt.legend(data['Race/Ethnicity'].value_counts().index,loc=2)
plt.axis('equal') #避免比例压缩为椭圆
plt.show()

LM Plot

seaborn.lmplot(x, y, data, hue=None, col=None, row=None, palette=None, col_wrap=None, height=5, aspect=1, markers='o', sharex=True, sharey=True, hue_order=None, col_order=None, row_order=None, legend=True, legend_out=True, x_estimator=None, x_bins=None, x_ci='ci', scatter=True, fit_reg=True, ci=95, n_boot=1000, units=None, order=1, logistic=False, lowess=False, robust=False, logx=False, x_partial=None, y_partial=None, truncate=False, x_jitter=None, y_jitter=None, scatter_kws=None, line_kws=None, size=None)
- x, y : strings,options
- data : DataFrame, optional
- hue, col, row : strings
- palette : palette name, list, or dict, optional
- markers : matplotlib marker code or list of marker codes, optional
- legend : bool, optional
- scatter : bool, optional
```
sns.lmplot(x='Math_Score',y='Writing_Score',data=data,hue='Gender',markers=['x','o'])
plt.ylabel('Writing Score')
plt.xlabel('Math Score')
plt.show()
```

Practices

练习resampling

  import numpy as ny
  import pandas as pd
  import matplotlib.pyplot as plt
  df=pd.DataFrame(pd.read_csv('test.csv'))

  #给特定列重命名
  df=df.rename(columns ={'_c1':'date'})

  #删除where col_name is null的rows
  df1=df[df['date'].notnull()]

  #将目标列设置为index
  df1.set_index(['date'],inplace=True)

  #resample只能适用于datetimeindex，因此将index转为datetimeindex格式
  df1.index=pd.DatetimeIndex(df1.index)

  #对第二列开始的数据进行分类求和
  df1.iloc[:,1:].resample('W').sum()

  #对id分组再进行resampling
  df1.groupby('id').resample('W').sum().iloc[:,1:]

Pandas笔记