简单总结
关联分析、数值比较:散点图、曲线图
分布分析:灰度图、密度图
涉及分类的分析:柱状图、箱式图
1.Matplotlib基础知识
使用Python进行数据分析并可视化,离不开一个重要的包:matplotlib。这是一个开源的包,提供了非常丰富的可视化图形处理功能。官方文档(Matplotlib)里有详细的介绍,以及各种图形种类。
本文主要学习Matplotlib的基本使用方法;需要用到更多方法时,可以详细阅读官方文档。
安装
a. 使用 pip install -U matplotlib;b. 安装 Anaconda(自带这个包)
导入包
import matplotlib.pyplot as plt%matplotlib inline #魔法命令行,使用是能直接再输出行显示图形
几种基本图形绘制
# `np` is used below, but no numpy import appears earlier in the visible
# source, so bring it into scope here.
import numpy as np

# 100 evenly spaced sample points on [0, 2].
x = np.linspace(0, 2, 100)

# Three curves on the same (implicit) axes; the labels feed the legend.
plt.plot(x, x, label='linear')
plt.plot(x, x**2, label='quadratic')
plt.plot(x, x**3, label='cubic')

plt.xlabel('x label')
plt.ylabel('y label')
plt.title("Simple Plot")
plt.legend()
plt.show()
def my_plotter(ax, data1, data2, param_dict):
    """
    Draw one line plot on the given axes.

    Parameters
    ----------
    ax : Axes
        The axes to draw to
    data1 : array
        The x data
    data2 : array
        The y data
    param_dict : dict
        Dictionary of kwargs to pass to ax.plot

    Returns
    -------
    out : list
        list of artists added
    """
    return ax.plot(data1, data2, **param_dict)


# which you would then use as:
# four random series of 100 samples each, unpacked row by row
data1, data2, data3, data4 = np.random.randn(4, 100)
fig, ax = plt.subplots(1, 1)
my_plotter(ax, data1, data2, {'marker': 'x'})
子图集
# One row, two columns of axes; each panel is drawn with the same helper
# but a different marker.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
my_plotter(ax1, data1, data2, {'marker': 'x'})
my_plotter(ax2, data3, data4, {'marker': 'o'})
import numpy as np
import matplotlib.pyplot as plt

# Stacked bar chart: women's scores are drawn on top of men's scores.
N = 5
menMeans = (20, 35, 30, 35, 27)
womenMeans = (25, 32, 34, 20, 25)
menStd = (2, 3, 4, 1, 2)
womenStd = (3, 5, 2, 3, 3)
ind = np.arange(N)  # the x locations for the groups
width = 0.35  # the width of the bars: can also be len(x) sequence

p1 = plt.bar(ind, menMeans, width, yerr=menStd)
# `bottom=menMeans` starts each of these bars where the first series ends.
p2 = plt.bar(ind, womenMeans, width, bottom=menMeans, yerr=womenStd)

plt.ylabel('Scores')
plt.title('Scores by group and gender')
plt.xticks(ind, ('G1', 'G2', 'G3', 'G4', 'G5'))
plt.yticks(np.arange(0, 81, 10))
# One bar from each container serves as the legend handle.
plt.legend((p1[0], p2[0]), ('Men', 'Women'))
plt.show()
def scatterplot(x_data, y_data, x_label, y_label, title):
    """
    Draw a scatter plot of y_data against x_data on a new figure.

    Parameters
    ----------
    x_data, y_data
        Values passed to ``Axes.scatter`` as x and y.
    x_label, y_label : str
        Axis labels.
    title : str
        Axes title.
    """
    _, axes = plt.subplots()
    # Small, semi-transparent markers.
    axes.scatter(x_data, y_data, s=10, color='#539caf', alpha=0.75)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_title(title)


# NOTE(review): `daily_data` is not defined anywhere in the visible source;
# it must be loaded before this call -- confirm where it comes from.
scatterplot(
    x_data=daily_data['temp'],
    y_data=daily_data['cnt'],
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Number of Check Outs vs Temperature',
)
import numpy as np
import matplotlib.pyplot as plt

men_means, men_std = (20, 35, 30, 35, 27), (2, 3, 4, 1, 2)
women_means, women_std = (25, 32, 34, 20, 25), (3, 5, 2, 3, 3)

ind = np.arange(len(men_means))  # the x locations for the groups
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
# Shift each series half a bar width so the two sit side by side.
rects1 = ax.bar(ind - width/2, men_means, width, yerr=men_std,
                color='SkyBlue', label='Men')
rects2 = ax.bar(ind + width/2, women_means, width, yerr=women_std,
                color='IndianRed', label='Women')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Scores')
ax.set_title('Scores by group and gender')
ax.set_xticks(ind)
ax.set_xticklabels(('G1', 'G2', 'G3', 'G4', 'G5'))
ax.legend()


def autolabel(rects, xpos='center'):
    """
    Attach a text label above each bar in *rects*, displaying its height.

    *xpos* indicates which side to place the text w.r.t. the center of
    the bar. It can be one of the following {'center', 'right', 'left'}.

    The labels are drawn on the module-level ``ax``.
    """
    side = xpos.lower()  # normalize the case of the parameter
    # Text placed to the right of the bar centre is left-aligned, and
    # vice versa.
    align = {'center': 'center', 'right': 'left', 'left': 'right'}
    shift = {'center': 0.5, 'right': 0.57, 'left': 0.43}  # x_txt = x + w*off

    for bar in rects:
        height = bar.get_height()
        x_txt = bar.get_x() + bar.get_width()*shift[side]
        ax.text(x_txt, 1.01*height, '{}'.format(height),
                ha=align[side], va='bottom')


autolabel(rects1, "left")
autolabel(rects2, "right")

plt.show()
2. GAFAJA股票数据可视化分析
(谷歌,亚马逊,Facebook,苹果,京东,阿里巴巴)几大互联网科技巨头的股票数据,可以直接从雅虎财经网站上下载,选择一年的数据进行分析。根据分析我们可以看到,科技大佬也随着2018年经济不景气受到了很大的影响。但是这也许对很多人来说又是一个最佳的投资时期!不过还是有一家公司做到了从年初到年末上涨24.3%的优秀业绩。我们来看看是哪家?
数据下载地址:根据公司名字搜索一下,然后选择 Historical Data,设置查询期间,点击 Apply,然后点击 Download。
BABA Historical Prices | Alibaba Group Holding Limited A Stock - Yahoo Finance (finance.yahoo.com)
#coding:utf-8%matplotlibinlineimportpandasaspdimportnumpyasnpimportmatplotlib.pyplotaspltappDF=pd.read_csv(r"Downloads\AAPL.csv")fbDF=pd.read_csv(r"Downloads\FB.csv")jdDF=pd.read_csv(r"Downloads\JD.csv")babaDF=pd.read_csv(r"Downloads\BABA.csv")amznDF=pd.read_csv(r"Downloads\AMZN.csv")googleDF=pd.read_csv(r"Downloads\GOOGL.csv")
APPLE
# Column dtypes right after loading; Date is still a plain object column.
appDF.dtypes
# Output:
# Date          object
# Open         float64
# High         float64
# Low          float64
# Close        float64
# Adj Close    float64
# Volume         int64
# dtype: object

# Parse the date strings, then use them as the row index.  The Date column
# itself is kept (see the 7 columns reported by info() below).
appDF['Date'] = pd.to_datetime(appDF['Date'])  # change datatype
appDF.set_index(appDF['Date'], inplace=True)  # reset index
appDF.index
# Output:
# DatetimeIndex(['2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05',
#                '2018-01-08', '2018-01-09', '2018-01-10', '2018-01-11',
#                '2018-01-12', '2018-01-16',
#                ...
#                '2018-12-14', '2018-12-17', '2018-12-18', '2018-12-19',
#                '2018-12-20', '2018-12-21', '2018-12-24', '2018-12-26',
#                '2018-12-27', '2018-12-28'],
#               dtype='datetime64[ns]', name='Date', length=250, freq=None)

appDF.info()
# Output:
# <class 'pandas.core.frame.DataFrame'>
# DatetimeIndex: 250 entries, 2018-01-02 to 2018-12-28
# Data columns (total 7 columns):
# Date         250 non-null datetime64[ns]
# Open         250 non-null float64
# High         250 non-null float64
# Low          250 non-null float64
# Close        250 non-null float64
# Adj Close    250 non-null float64
# Volume       250 non-null int64
# dtypes: datetime64[ns](1), float64(5), int64(1)
# memory usage: 15.6 KB

# Summary statistics of the numeric columns.
appDF.describe()
             Open        High         Low       Close   Adj Close        Volume
count  250.000000  250.000000  250.000000  250.000000  250.000000  2.500000e+02
mean   189.233760  191.120640  187.306320  189.178680  187.908454  3.401187e+07
std     20.456809   20.528269   20.387978   20.539151   20.836598  1.465514e+07
min    148.149994  151.550003  146.589996  146.830002  146.830002  1.251390e+07
25%    173.452503  174.962498  172.080002  173.472500  171.660324  2.351965e+07
50%    186.319999  187.534996  184.965003  186.180001  185.077881  3.161740e+07
75%    207.840000  209.437500  205.937496  207.875003  206.795952  4.077780e+07
max    230.779999  233.470001  229.779999  232.070007  231.263092  9.624670e+07
def change(column):
    """
    Compute and print the cumulative price change of a stock.

    change = (current price - buy price) / buy price, where the buy price
    is the first value of *column* and the current price is its last value.

    Parameters
    ----------
    column : pandas.Series or numpy.ndarray
        The closing-price column, in chronological order.

    Returns
    -------
    priceChange : float
        The cumulative change as a fraction (e.g. -0.093 for a 9.3% drop).

    Raises
    ------
    IndexError
        If *column* is empty.
    """
    # Always index by position.  A Series indexed by dates would otherwise
    # treat the integers 0 / size-1 as labels; the positional fallback for
    # that case is deprecated in recent pandas.  Plain arrays have no
    # `.iloc` and are indexed directly.
    prices = getattr(column, 'iloc', column)
    # buy price: the first record
    buyPrice = prices[0]
    # current price: the last record
    curPrice = prices[-1]
    # cumulative change
    priceChange = (curPrice - buyPrice) / buyPrice
    # report whether the stock went up, stayed flat, or went down
    if priceChange > 0:
        print('股票累计上涨=', priceChange*100, '%')
    elif priceChange == 0:
        print('股票累没有变化=', priceChange*100, '%')
    else:
        print('股票累计下跌', priceChange*100, '%')
    return priceChange


closeCol = appDF['Close']
appChange = change(closeCol)
# Output: 股票累计下跌 -9.305700374599455 %
# Column dtypes right after loading; Date is still a plain object column.
fbDF.dtypes
# Output:
# Date          object
# Open         float64
# High         float64
# Low          float64
# Close        float64
# Adj Close    float64
# Volume         int64
# dtype: object

# Parse the date strings, then use them as the row index.
fbDF['Date'] = pd.to_datetime(fbDF['Date'])  # change datatype
fbDF.set_index(fbDF['Date'], inplace=True)  # reset index
fbDF.index
# Output:
# DatetimeIndex(['2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05',
#                '2018-01-08', '2018-01-09', '2018-01-10', '2018-01-11',
#                '2018-01-12', '2018-01-16',
#                ...
#                '2018-12-14', '2018-12-17', '2018-12-18', '2018-12-19',
#                '2018-12-20', '2018-12-21', '2018-12-24', '2018-12-26',
#                '2018-12-27', '2018-12-28'],
#               dtype='datetime64[ns]', name='Date', length=250, freq=None)

# Summary statistics of the numeric columns.
fbDF.describe()
             Open        High         Low       Close   Adj Close        Volume
count  250.000000  250.000000  250.000000  250.000000  250.000000  2.500000e+02
mean   171.621040  173.769240  169.460560  171.672640  171.672640  2.766240e+07
std     19.595486   19.305557   19.958815   19.852291   19.852291  1.920073e+07
min    123.099998  129.740005  123.019997  124.059998  124.059998  9.588600e+06
25%    157.847503  160.939995  156.077495  158.142494  158.142494  1.779380e+07
50%    175.010002  177.040001  172.875000  174.794998  174.794998  2.179760e+07
75%    184.922493  186.510006  183.424996  185.289998  185.289998  3.031708e+07
max    215.720001  218.619995  214.270004  217.500000  217.500000  1.698037e+08
# Cumulative change of Facebook's closing price over the loaded period.
closeCol = fbDF['Close']
fbChange = change(closeCol)
# Output: 股票累计下跌 -26.57920931076187 %
JingDong
# Column dtypes right after loading; Date is still a plain object column.
jdDF.dtypes
# Output:
# Date          object
# Open         float64
# High         float64
# Low          float64
# Close        float64
# Adj Close    float64
# Volume         int64
# dtype: object

# Parse the date strings, then use them as the row index.
jdDF['Date'] = pd.to_datetime(jdDF['Date'])  # change datatype
jdDF.set_index(jdDF['Date'], inplace=True)  # reset index
jdDF.index
# Output:
# DatetimeIndex(['2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05',
#                '2018-01-08', '2018-01-09', '2018-01-10', '2018-01-11',
#                '2018-01-12', '2018-01-16',
#                ...
#                '2018-12-14', '2018-12-17', '2018-12-18', '2018-12-19',
#                '2018-12-20', '2018-12-21', '2018-12-24', '2018-12-26',
#                '2018-12-27', '2018-12-28'],
#               dtype='datetime64[ns]', name='Date', length=250, freq=None)

# Cumulative change of JD's closing price over the loaded period.
closeCol = jdDF['Close']
jdChange = change(closeCol)
# Output: 股票累计下跌 -49.838263628425686 %
Alibaba
# Column dtypes right after loading; Date is still a plain object column.
babaDF.dtypes
# Output:
# Date          object
# Open         float64
# High         float64
# Low          float64
# Close        float64
# Adj Close    float64
# Volume         int64
# dtype: object

# Parse the date strings, then use them as the row index.
babaDF['Date'] = pd.to_datetime(babaDF['Date'])  # change datatype
babaDF.set_index(babaDF['Date'], inplace=True)  # reset index
babaDF.index
# Output:
# DatetimeIndex(['2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05',
#                '2018-01-08', '2018-01-09', '2018-01-10', '2018-01-11',
#                '2018-01-12', '2018-01-16',
#                ...
#                '2018-12-14', '2018-12-17', '2018-12-18', '2018-12-19',
#                '2018-12-20', '2018-12-21', '2018-12-24', '2018-12-26',
#                '2018-12-27', '2018-12-28'],
#               dtype='datetime64[ns]', name='Date', length=250, freq=None)

# Cumulative change of Alibaba's closing price over the loaded period.
closeCol = babaDF['Close']
babaChange = change(closeCol)
# Output: 股票累计下跌 -24.26354448996062 %
AMAZON
# Column dtypes right after loading; Date is still a plain object column.
amznDF.dtypes
# Output:
# Date          object
# Open         float64
# High         float64
# Low          float64
# Close        float64
# Adj Close    float64
# Volume         int64
# dtype: object

# Parse the date strings, then use them as the row index.
amznDF['Date'] = pd.to_datetime(amznDF['Date'])  # change datatype
amznDF.set_index(amznDF.Date, inplace=True)  # reset index
amznDF.index
# Output:
# DatetimeIndex(['2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05',
#                '2018-01-08', '2018-01-09', '2018-01-10', '2018-01-11',
#                '2018-01-12', '2018-01-16',
#                ...
#                '2018-12-14', '2018-12-17', '2018-12-18', '2018-12-19',
#                '2018-12-20', '2018-12-21', '2018-12-24', '2018-12-26',
#                '2018-12-27', '2018-12-28'],
#               dtype='datetime64[ns]', name='Date', length=250, freq=None)

# Cumulative change of Amazon's closing price over the loaded period.
closeCol = amznDF['Close']
amznChange = change(closeCol)
# Output: 股票累计上涨= 24.306776862206565 %

# Cross-check of the figure above, computed by hand.  `.iloc` selects the
# first/last row by position; plain `Close[0]` / `Close[-1]` on a
# date-indexed Series depends on a positional fallback that recent pandas
# deprecates.
(amznDF.Close.iloc[-1] - amznDF.Close.iloc[0]) / amznDF.Close.iloc[0] * 100  # amazon
# Output: 24.306776862206565

# First trading day in the data.
amznDF.head(1)
                 Date    Open    High         Low       Close   Adj Close   Volume
Date
2018-01-02 2018-01-02  1172.0  1190.0  1170.51001  1189.01001  1189.01001  2694500
# Last trading day in the data.
amznDF.tail(n=1)
                 Date         Open         High     Low       Close   Adj Close   Volume
Date
2018-12-28 2018-12-28  1473.349976  1513.469971  1449.0  1478.02002  1478.02002  8825600
# Column dtypes right after loading; Date is still a plain object column.
googleDF.dtypes
# Output:
# Date          object
# Open         float64
# High         float64
# Low          float64
# Close        float64
# Adj Close    float64
# Volume         int64
# dtype: object

# Parse the date strings, then use them as the row index.
googleDF['Date'] = pd.to_datetime(googleDF['Date'])  # change datatype
googleDF.set_index(googleDF.Date, inplace=True)  # reset index
googleDF.index
# Output:
# DatetimeIndex(['2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05',
#                '2018-01-08', '2018-01-09', '2018-01-10', '2018-01-11',
#                '2018-01-12', '2018-01-16',
#                ...
#                '2018-12-14', '2018-12-17', '2018-12-18', '2018-12-19',
#                '2018-12-20', '2018-12-21', '2018-12-24', '2018-12-26',
#                '2018-12-27', '2018-12-28'],
#               dtype='datetime64[ns]', name='Date', length=250, freq=None)

# Cumulative change of Google's closing price over the loaded period.
closeCol = googleDF['Close']
googleChange = change(closeCol)
# Output: 股票累计下跌 -2.472014607028055 %

# Alibaba closing price over time.  The date index is used as the x axis
# by default; passing the index object itself as `x` is not accepted by
# current pandas, which expects a column label or position there.
babaDF.plot(y='Close')
# x-axis label
plt.xlabel('时间')
# y-axis label
plt.ylabel('股价(美元)')
# figure title
plt.title('2018年阿里巴巴股价走势')
# show the grid
plt.grid(True)
# display the figure
plt.show()
# Scatter plot of Alibaba's closing price against trading volume.
babaDF.plot.scatter(x='Volume', y='Close')
# x-axis label
plt.xlabel('成交量')
# y-axis label
plt.ylabel('股价(美元)')
# figure title
plt.title('成交量和股价')
# show the grid
plt.grid(True)
# display the figure
plt.show()
# Pairwise correlation of the numeric columns.  The frame still carries the
# datetime Date column; selecting numeric columns explicitly keeps the
# result identical across pandas versions (newer releases no longer drop
# non-numeric columns silently).
babaDF.select_dtypes(include='number').corr()
               Open      High       Low     Close  Adj Close    Volume
Open       1.000000  0.995051  0.993075  0.985336   0.985336 -0.133665
High       0.995051  1.000000  0.994309  0.993553   0.993553 -0.106145
Low        0.993075  0.994309  1.000000  0.994654   0.994654 -0.168921
Close      0.985336  0.993553  0.994654  1.000000   1.000000 -0.145040
Adj Close  0.985336  0.993553  0.994654  1.000000   1.000000 -0.145040
Volume    -0.133665 -0.106145 -0.168921 -0.145040  -0.145040  1.000000
# Draw Google first; the axes it returns become the shared canvas.
# The date index is the x axis by default (current pandas rejects an index
# object passed as `x`).
ax1 = googleDF.plot(y='Close', label='谷歌')
# Passing `ax` draws the remaining companies on the same axes.
# Amazon
amznDF.plot(ax=ax1, y='Close', label='亚马逊')
# Facebook
fbDF.plot(ax=ax1, y='Close', label='Facebook')
# Apple
appDF.plot(ax=ax1, y='Close', label='苹果')
# Alibaba
babaDF.plot(ax=ax1, y='Close', label='阿里巴巴')
# JD.com
jdDF.plot(ax=ax1, y='Close', label='京东')
# x-axis label
plt.xlabel('时间')
# y-axis label
plt.ylabel('股价(美元)')
# figure title ("GAFAJA": the sixth company plotted is JD, not Tencent)
plt.title('2018年GAFAJA股价累计涨幅比较')
# show the grid
plt.grid(True)
plt.show()
# Google and Amazon only: both trade above 1000, so they get their own
# figure.  Draw Google first; the axes it returns become the shared canvas.
# The date index is the x axis by default (current pandas rejects an index
# object passed as `x`).
ax2 = googleDF.plot(y='Close', label='谷歌')
# Passing `ax` draws Amazon on the same axes.
amznDF.plot(ax=ax2, y='Close', label='亚马逊')
# x-axis label
plt.xlabel('时间')
# y-axis label
plt.ylabel('股价(美元)')
# figure title ("GAFAJA": the group analysed here includes JD, not Tencent)
plt.title('2018年GAFAJA股价累计涨幅比较')
# show the grid
plt.grid(True)
plt.show()
# The four lower-priced stocks on one figure.  Draw Facebook first; the
# axes it returns become the shared canvas.  The date index is the x axis
# by default (current pandas rejects an index object passed as `x`).
# Facebook
ax3 = fbDF.plot(y='Close', label='Facebook')
# Apple
appDF.plot(ax=ax3, y='Close', label='苹果')
# Alibaba
babaDF.plot(ax=ax3, y='Close', label='阿里巴巴')
# JD.com
jdDF.plot(ax=ax3, y='Close', label='京东')
# x-axis label
plt.xlabel('时间')
# y-axis label
plt.ylabel('股价(美元)')
# figure title ("GAFAJA": the sixth company plotted is JD, not Tencent)
plt.title('2018年GAFAJA股价累计涨幅比较')
# show the grid
plt.grid(True)
plt.show()
# Mean closing price of each of the six companies.
gafataMeanList = [
    googleDF['Close'].mean(),  # Google
    amznDF['Close'].mean(),    # Amazon
    fbDF['Close'].mean(),      # Facebook
    appDF['Close'].mean(),     # Apple
    babaDF['Close'].mean(),    # Alibaba
    jdDF['Close'].mean(),      # JD.com
]
# Build a pandas Series labelled by company, in the same order as the list
# above.  The last entry comes from jdDF, so it is labelled JD ('京东').
gafataMeanSer = pd.Series(
    gafataMeanList,
    index=['谷歌', '亚马逊', 'Facebook', '苹果', '阿里巴巴', '京东'],
)
gafataMeanSer.plot(kind='bar', label='GAFAJA')
# figure title
plt.title('2018年GAFAJA股价平均值')
# x-axis label
plt.xlabel('公司名称')
# y-axis label
plt.ylabel('股价平均值(美元)')
plt.grid(True)
plt.show()
# Combine the closing prices of the six companies column-wise, aligned on
# their date index.  The six Series are concatenated directly; seeding the
# list with an empty DataFrame adds nothing and would only mix its empty
# index into the result.
closeDf = pd.concat(
    [
        googleDF['Close'],  # Google
        amznDF['Close'],    # Amazon
        fbDF['Close'],      # Facebook
        appDF['Close'],     # Apple
        babaDF['Close'],    # Alibaba
        jdDF['Close'],      # JD
    ],
    axis=1,
)
# Rename the columns to the company names (same order as above).
closeDf.columns = ['谷歌', '亚马逊', 'Facebook', 'APPLE', '阿里巴巴', 'JD']
closeDf.head()
                     谷歌          亚马逊    Facebook       APPLE       阿里巴巴         JD
Date
2018-01-02  1073.209961  1189.010010  181.419998  172.259995  183.649994  43.279999
2018-01-03  1091.520020  1204.199951  184.669998  172.229996  184.000000  43.509998
2018-01-04  1095.760010  1209.589966  184.330002  173.029999  185.710007  43.669998
2018-01-05  1110.290039  1229.140015  186.850006  175.000000  190.699997  45.639999
2018-01-08  1114.209961  1246.869995  188.279999  174.350006  190.330002  46.099998
# Box plot of the closing-price distribution, one box per company.
closeDf.plot.box()
plt.grid(True)
plt.show()
股票总结:在2018年,京东的表现最差,股价累计快跌了一半,总市值还剩300亿。亚马逊表现最好,上涨24.3%,总市值7200多亿。其他几只股票都下行了!苹果去年市值最高时达到了一万亿,现在是7400多亿。