import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Widen pandas' console output limits so printed frames are not truncated.
pd.options.display.max_columns = 10000
pd.options.display.max_rows = 10000000000
pd.options.display.width = 100000
# Load the income data set from the local Excel workbook.
income = pd.read_excel(r'D:\bigData\0629demo\dataSource\income.xlsx')
# Replace missing values in these columns with each column's most frequent
# value (mode()[0] is the first entry of the sorted modes when values tie).
mode_by_column = {
    column: income[column].mode()[0]
    for column in ('workclass', 'occupation', 'native-country')
}
# The frame is filled in place. fillna(inplace=True) returns None, so its
# result is deliberately not bound to a name.
income.fillna(value=mode_by_column, inplace=True)
# Summary statistics: default describe() first, then the object-dtype columns.
print(income.describe())
print(income.describe(include=['object']))
# Use the ggplot look for every figure drawn below.
plt.style.use('ggplot')
# Two stacked panels: age on top, weekly working hours underneath.
fig, axes = plt.subplots(2, 1)
age_axis, hours_axis = axes

# Row masks for the two income levels (the stored labels carry a leading space).
low_income_rows = income.income == ' <=50K'
high_income_rows = income.income == ' >50K'

# Kernel density of age for each income level.
income.loc[low_income_rows, 'age'].plot(
    kind='kde', ax=age_axis, label='<=50K', legend=True, linestyle='-')
income.loc[high_income_rows, 'age'].plot(
    kind='kde', ax=age_axis, label='>50K', legend=True, linestyle='--')

# Kernel density of hours worked per week for each income level.
income.loc[low_income_rows, 'hours-per-week'].plot(
    kind='kde', ax=hours_axis, label='<= 50K', legend=True, linestyle='-')
income.loc[high_income_rows, 'hours-per-week'].plot(
    kind='kde', ax=hours_axis, label='> 50K', legend=True, linestyle='--')
plt.show()
# Number of rows for every (race, income) combination.
# size() counts the rows of each group directly, so the count no longer
# depends on aggregating every column and then picking out 'age'.
# reset_index(name='counts') turns the group keys back into ordinary columns
# and names the count column in one step.
race = income.groupby(by=['race', 'income']).size().reset_index(name='counts')
print(race)
# Order by race, then by count, both descending.
race.sort_values(by=['race', 'counts'], ascending=False, inplace=True)
# Number of rows for every (relationship, income) combination.
# size() counts the rows of each group directly, so the count no longer
# depends on aggregating every column and then picking out 'age'.
# reset_index(name='counts') turns the group keys back into ordinary columns
# and names the count column in one step.
relationship = (
    income.groupby(by=['relationship', 'income'])
    .size()
    .reset_index(name='counts')
)
# Order by relationship, then by count, both descending.
relationship.sort_values(by=['relationship', 'counts'], ascending=False, inplace=True)
# One grouped bar chart per categorical variable, with bars split by income
# level. Each chart gets its own figure and is shown before the next is drawn.
for category, counts_frame in (('race', race), ('relationship', relationship)):
    plt.figure(figsize=(15, 10))
    sns.barplot(data=counts_frame, x=category, y='counts', hue='income')
    plt.show()