Read several CSV files into a dictionary
import pandas as pd
data_files = [
    "ap_2010.csv",
    "class_size.csv",
    "demographics.csv",
    "graduation.csv",
    "hs_directory.csv",
    "sat_results.csv"
]
data = {}
for item in data_files:
    path = "schools/" + item
    keyname = item.split(".")[0]  # filename without the ".csv" extension
    data[keyname] = pd.read_csv(path)
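A quick optional sanity check (illustrative only) that every file loaded, printing the shape of each DataFrame in the dictionary:
for key, df in data.items():
    print(key, df.shape)  # (rows, columns) for each dataset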
Read files with other encodings
all_survey = pd.read_csv("schools/survey_all.txt", delimiter="\t", encoding="windows-1252")
d75_survey = pd.read_csv("schools/survey_d75.txt", delimiter="\t", encoding="windows-1252")
survey = pd.concat([all_survey, d75_survey], axis=0)  # stack rows: the result has the rows of both frames
print(survey.head(5))
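A tiny made-up example of what axis=0 concatenation does (the frames are stacked vertically, so the row counts add up):
a = pd.DataFrame({"x": [1, 2]})
b = pd.DataFrame({"x": [3]})
print(pd.concat([a, b], axis=0))             # 3 rows, original indexes kept
print(pd.concat([a, b], ignore_index=True))  # same rows with a fresh 0..2 index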
Change column names and select a few columns from the original
survey["DBN"] = survey["dbn"]
columns_to_keep = ["DBN", "rr_s", "rr_t", "rr_p", "N_s", "N_t", "N_p", "saf_p_11", "com_p_11", "eng_p_11", "aca_p_11", "saf_t_11", "com_t_11", "eng_t_11", "aca_t_11", "saf_s_11", "com_s_11", "eng_s_11", "aca_s_11", "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11"]
survey = survey.loc[:, columns_to_keep] # select a few columns
data["survey"] = survey
**apply function & lambda**
Create a new column called padded_csd that turns a one-character CSD such as '1' into '01' and '3' into '03', while leaving values with len(csd) > 1 unchanged.
data["class_size"]['padded_csd'] = data["class_size"]['CSD'].apply(lambda s: str(s).zfill(2)) #could use lambda here
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]
Another way: pass a named function to apply instead of a lambda.
def padding(num):
    a = str(num)
    if len(a) == 1:
        return a.zfill(2)
    else:
        return a

data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(padding)
pd.to_numeric()
df["new_score"] = pd.to_numeric(df["score"], errors = "coerce")
df.loc[5, ["new_score"]] # print the first 5 rows
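A small made-up example of errors="coerce": values that cannot be parsed as numbers become NaN instead of raising an error.
s = pd.Series(["10", "20", "n/a"])
print(pd.to_numeric(s, errors="coerce"))  # 10.0, 20.0, NaN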
Group by
import numpy
class_size = data["class_size"]
class_size = class_size.groupby("DBN").agg(numpy.mean)  # average the numeric columns for each school (DBN)
class_size.reset_index(inplace=True)  # turn the DBN group keys back into a regular column
data["class_size"] = class_size
print(data["class_size"].head())
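On newer pandas versions, passing numpy.mean to agg emits a deprecation warning; an equivalent sketch using the built-in aggregator (assuming only the numeric columns are wanted):
class_size = data["class_size"].groupby("DBN").mean(numeric_only=True).reset_index()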
Select rows based on column values
temp = data["graduation"]
temp = temp.loc[(temp["Cohort"]== '2006') & (temp["Demographic"] == "Total Cohort")]
data["graduation"] = temp
print (data["graduation"].head(5))
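The same selection written with DataFrame.query, shown only as an alternative style:
temp = data["graduation"].query("Cohort == '2006' and Demographic == 'Total Cohort'")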
merge
combined = combined.merge(data["class_size"], on="DBN", how="inner")  # inner join: keep only DBNs present in both frames
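The line above assumes a combined frame already exists. In the full project, combined is presumably built by starting from one dataset and chaining merges; a minimal sketch of that pattern (the starting dataset and the join type per merge are assumptions here):
combined = data["sat_results"]                                        # assumed starting point
combined = combined.merge(data["ap_2010"], on="DBN", how="left")      # keep all SAT rows (assumed left join)
combined = combined.merge(data["class_size"], on="DBN", how="inner")  # require class size info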
fillna
means = combined.mean()              # column means; newer pandas may need combined.mean(numeric_only=True)
combined = combined.fillna(means)    # fill numeric NaNs with the column mean
combined = combined.fillna(value=0)  # anything still missing becomes 0
print(combined.head(5))
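A tiny made-up example of the two-step fill: numeric gaps get the column mean, and columns whose mean is itself NaN fall through to the second fillna(0):
toy = pd.DataFrame({"a": [1.0, None, 3.0], "b": [None, None, None]})
toy = toy.fillna(toy.mean())  # "a": NaN -> 2.0; "b" is all NaN, its mean is NaN, so nothing is filled
toy = toy.fillna(0)           # remaining NaNs in "b" -> 0
print(toy)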
df.corr()
Quickly find the correlations between columns; see the pandas DataFrame.corr documentation for details.
correlations = combined.corr()  # pairwise correlation (Pearson r) between numeric columns
print(correlations["sat_score"])
# draw a bar chart of the correlation between sat_score and each survey field
combined.corr()["sat_score"][survey_fields].plot.bar()
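survey_fields is not defined in this snippet; one plausible definition (an assumption), reusing the survey columns selected earlier minus the "DBN" join key:
survey_fields = [c for c in columns_to_keep if c != "DBN"]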
df.plot
import matplotlib.pyplot as plt
combined.plot.scatter(x="total_enrollment", y="sat_score")
plt.show()
Draw school locations, using color to indicate the values of another column
from mpl_toolkits.basemap import Basemap

m = Basemap(
    projection='merc',
    llcrnrlat=40.496044,
    urcrnrlat=40.915256,
    llcrnrlon=-74.255735,
    urcrnrlon=-73.700272,
    resolution='i'
)
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)

longitudes = combined["lon"].tolist()
latitudes = combined["lat"].tolist()
# show the value of the 'ell_percent' column via marker color
m.scatter(longitudes, latitudes, s=20, zorder=2, latlon=True,
          c=combined["ell_percent"], cmap="summer")
plt.show()
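Basemap is no longer actively maintained; if it isn't available, a rough fallback sketch (no map background, just for a quick look) is a plain scatter of the same lon/lat points colored by the same column:
plt.scatter(combined["lon"], combined["lat"], s=20, c=combined["ell_percent"], cmap="summer")
plt.colorbar(label="ell_percent")
plt.xlabel("longitude")
plt.ylabel("latitude")
plt.show()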
Apply Function
For the Avengers data practice:
def clean_deaths(row):
    num_deaths = 0
    columns = ['Death1', 'Death2', 'Death3', 'Death4', 'Death5']
    for c in columns:
        death = row[c]
        if pd.isnull(death) or death == 'NO':
            continue
        elif death == 'YES':
            num_deaths += 1
    return num_deaths

true_avengers['Deaths'] = true_avengers.apply(clean_deaths, axis=1)  # axis=1 applies the function row by row
Or, equivalently, as a vectorized version:
death_col = ['Death' + str(i) for i in range(1, 6)]
true_avengers['Deaths'] = (true_avengers[death_col] == 'YES').sum(axis=1)  # count 'YES' across the five Death columns
Check whether the 'Years since joining' column values are correct (consistent with the 'Year' column, as of 2015).
correct_joined_years = true_avengers[true_avengers['Years since joining'] == (2015 - true_avengers['Year'])]
joined_accuracy_count = len(correct_joined_years)
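A small follow-up sketch (the accuracy ratio is an added illustration) to report the check as a fraction of all rows:
accuracy = joined_accuracy_count / len(true_avengers)
print(f"{joined_accuracy_count} of {len(true_avengers)} rows are consistent ({accuracy:.1%})")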