5.1 便捷数据获取
- 便捷网络数据
直接利用matplotlib.finance模块进行数据获取,详情可见matplotlib技术文档,另温故已有的两次数据获取方式
#cnn_stock
import re
import requests
import pandas as pd
def strievieDijList():
r = requests.get('http://money.cnn.com/data/dow30/')
pattern_s = re.compile('class="wsod_symbol">(.*?)<\/a>.*?<span title=.*?">(.*?)<\/span>.*?\n.*?class="wsod_stream">(.*?)<\/span>')
#class="wsod_symbol">(.*?)<\/a>.*?<span.*?">(.*?)<\/span>.*?\n.*?class="wsod_stream">(.*?)<\/span>
dijListInText = re.findall(pattern_s, r.text)
return dijListInText
djiList = strievieDijList()
dataFrame = pd.DataFrame(djiList,columns = ['Abbr','Name','Price'])
print(dataFrame)
#quotes_history
import requests
import re
import json
import pandas as pd
from datetime import date
def retrieve_quotes_historical(stock_code):
quotes = []
url = 'https://finance.yahoo.com/quote/%s/history?p=%s' % (stock_code, stock_code)
r = requests.get(url)
m = re.findall('"HistoricalPriceStore":{"prices":(.*?),"isPending"', r.text)
if m:
quotes = json.loads(m[0])
quotes = quotes[::-1]
return [item for item in quotes if not 'type' in item]
quotes = retrieve_quotes_historical('AXP')
list1 = []
for i in range(len(quotes)):
x = date.fromtimestamp(quotes[i]['date'])
y = date.strftime(x, '%y-%m-%d')
list1.append(y)
quotesdf_ori = pd.DataFrame(quotes, index = list1)
quotesdf = quotesdf_ori.drop(['date'], axis = 1)
#quotesdf = quotesdf_ori.drop(['unadjclose'], axis = 1)
print(quotesdf)
5.2 数据准备
对数据的前期整理可以对后期分析工作起到很大帮助
- 修改列索引和行索引
#更新列明
cols = ['code', 'name', 'price']
dataFrame.columns = cols
#更新index序号
dataFrame.index = range(1, len(dataFrame) + 1)
- 修改日期形式
#绝对时间转化
from datetime import date
firstday = date.fromtimestamp(1464010200)
#转化成固定的是时间格式
finalday = date.strftime(firstday, '%Y-%m-%d')
- 创建时间序列
import pandas as pd
dates = pd.date_range('20170520', periods = 7)
import numpy as np
datesdf = pd.DataFrame(np.random.randn(7,3), index = dates, columns = list('ABC'))
print(datesdf)
Out[13]:
A B C
2017-05-20 1.135092 0.163002 -1.232315
2017-05-21 -0.017070 -0.429692 0.090594
2017-05-22 -0.811601 -0.859939 -0.290507
2017-05-23 -0.332632 1.147155 -0.393566
2017-05-24 -0.823202 -0.094191 1.420667
2017-05-25 0.499164 -0.418811 -0.675755
2017-05-26 0.713523 -0.445534 -0.392662
5.3 数据显示
对数据的结果进行一次查验,可以对数据的正误进行一次查看,一面后期程序进行出现问题
- 行索引
list(dataFrame.index)
- 列索引
list(dataFrame.columns)
- 查看值
dataFrame.values
- 数据描述查看
dataFrame.describe
- 查看数值基本类型是否对应
dataFrame.lasttrade
5.4 数据选择
在数据分析前需要对数据对象进行选择
- 选择行
- 选择列
- 选择区域
- 筛选(条件选择)
#筛选时间
quotesdf['18-03-15':'18-03-27']
Out[8]:
adjclose close high low open volume
18-03-15 94.389999 94.389999 95.410004 93.959999 95.059998 3213000
18-03-16 95.610001 95.610001 96.389999 94.269997 94.550003 5124900
18-03-19 94.839996 94.839996 95.480003 93.800003 95.440002 2610400
18-03-20 94.949997 94.949997 95.650002 94.720001 95.349998 2501600
18-03-21 94.739998 94.739998 96.269997 94.400002 94.620003 2124000
18-03-22 91.410004 91.410004 93.959999 91.120003 93.739998 4113900
18-03-23 90.449997 90.449997 92.360001 90.410004 91.820000 5100400
18-03-26 92.620003 92.620003 92.989998 90.800003 91.370003 2769800
18-03-27 91.419998 91.419998 93.739998 90.870003 92.809998 4306200
#选中单列
quotesdf['close']
#或者
quotesdf.close
#选中多行多列,loc类
dataFrame.loc[1:5,['Name','Price']]
Out[19]:
Name Price
1 American Express 91.42
2 Apple 168.34
3 Boeing 321.12
4 Caterpillar 146.99
5 Chevron 114.66
#选中多行列,iloc类
dataFrame.iloc[1:5,[0,2]]
Out[20]:
Abbr Price
1 AXP 91.42
2 AAPL 168.34
3 BA 321.12
4 CAT 146.99
#选中确定的某个数值
dataFrame.iat[1,2]
5.5简单统计与处理
简单统计与筛选
- 求平均值
dataFrame.Price.mean()
Out[58]: 116.14099999999999
- 条件筛选
dataFrame[dataFrame.Price >= 120].Name
Out[5]:
0 3M
2 Apple
3 Boeing
4 Caterpillar
12 Goldman Sachs
13 Home Depot
14 IBM
16 Johnson & Johnson
18 McDonald's
24 Travelers Companies Inc
25 United Technologies
26 UnitedHealth
Name: Name, dtype: object
#统计AXP股价上涨下跌的天数
len(quotesdf[quotesdf.close > quotesdf.open])
len(quotesdf[quotesdf.close <= quotesdf.open])
#统计AXP一年相邻两天收盘情况
status = np.sign(np.diff(quotesdf.close))
status[np.where(status == 1)].size
Out[11]: 132
status[np.where(status == -1)].size
Out[12]: 114
- 排序
DataFrame.sort_values(by = , ascending = False)逆序排列
tempdf = dataFrame.sort_values(by = 'Price', ascending = False)
tempdf
Out[17]:
Abbr Name Price
3 BA Boeing 327.88
12 GS Goldman Sachs 251.86
0 MMM 3M 219.52
26 UNH UnitedHealth 214.00
13 HD Home Depot 178.24
2 AAPL Apple 167.78
18 MCD McDonald's 156.38
14 IBM IBM 153.43
4 CAT Caterpillar 147.38
24 TRV Travelers Companies Inc 138.86
16 JNJ Johnson & Johnson 128.15
25 UTX United Technologies 125.82
28 V Visa 119.62
5 CVX Chevron 114.04
17 JPM JPMorgan Chase 109.97
8 DIS Disney 100.44
1 AXP American Express 93.28
20 MSFT Microsoft 91.27
29 WMT Wal-Mart 88.97
23 PG Procter & Gamble 79.28
10 XOM Exxon Mobil 74.61
21 NKE Nike 66.44
9 DWDP DowDuPont Inc 63.71
19 MRK Merck 54.47
15 INTC Intel 52.08
27 VZ Verizon 47.82
7 KO Coca-Cola 43.43
6 CSCO Cisco 42.89
22 PFE Pfizer 35.49
11 GE General Electric 13.48
tempdf[:3].Name
Out[18]:
3 Boeing
12 Goldman Sachs
0 3M
Name: Name, dtype: object
- 计数统计
#统计1月开盘天数
t = quotesdf[(quotesdf.index >= '2017-01-01') & (quotesdf.index < '2017-02-01')]
len(t)
#统计每个月的股票开盘天数
#运用到strptime()方法
import time
...
listtemp = []
for i in range(len(quotesdf)):
temp = time.strptime(quotesdf.index[i], '%y-%m-%d')
listtemp.append(temp.tm_mon)
tempdf = quotesdf.copy()
tempdf['month'] = listtemp
print(tempdf['month'].value_counts())
5.6 Grouping
用分组的思想进行我们的统计
DataFrame.groupby()
x = tempdf.groupby('month').count()
print(x)
Out[8]:
close high low open volume
month
1 21 21 21 21 21
2 19 19 19 19 19
3 21 21 21 21 21
4 19 19 19 19 19
5 22 22 22 22 22
6 22 22 22 22 22
7 20 20 20 20 20
8 23 23 23 23 23
9 20 20 20 20 20
10 22 22 22 22 22
11 21 21 21 21 21
12 20 20 20 20 20
#任意选择一个属性进行月数的统计
print(x.close)
month
1 21
2 19
3 21
4 19
5 22
6 22
7 20
8 23
9 20
10 22
11 21
12 20
#统计每个月的成交量,同理还有mean()、min()、max()等方法
tempdf.groupby('month').sum().volume
#高效统计每个月的成交量
tempdf.groupby('month').volume.sum()
问题的核心在于用什么样的方式来处理相对应的字段名
5.7 Merge
- append 追加
#把AXP公司本年度1月1日至1月5日间的股票交易信息合并到近一年中前两天的股票信息
p = quotesdf[:2]
print(p)
Out[23]:
adjclose close high low open volume
17-04-03 77.401367 78.589996 79.180000 77.970001 79.169998 3022700
17-04-04 77.076370 78.260002 78.610001 78.150002 78.489998 2563700
q = quotesdf['18-01-01':'18-01-03']
print(q)
Out[25]:
adjclose close high low open volume
18-01-02 98.592148 98.940002 99.730003 98.220001 99.730003 2746700
18-01-03 99.200005 99.550003 99.760002 99.019997 99.239998 2976400
p.append(q)
Out[26]:
adjclose close high low open volume
17-04-03 77.401367 78.589996 79.180000 77.970001 79.169998 3022700
17-04-04 77.076370 78.260002 78.610001 78.150002 78.489998 2563700
18-01-02 98.592148 98.940002 99.730003 98.220001 99.730003 2746700
18-01-03 99.200005 99.550003 99.760002 99.019997 99.239998 2976400
- conact 连接
#将AXP公司近一年股票数据中前五个和后五个合并
pieces = [tempdf[:5], tempdf[len(tempdf)-5:]]
pd.concat(pieces)
Out[35]:
close high low open volume month
17-04-03 78.589996 79.180000 77.970001 79.169998 3022700 4
17-04-04 78.260002 78.610001 78.150002 78.489998 2563700 4
17-04-05 77.760002 79.029999 77.660004 78.589996 2858400 4
17-04-06 77.919998 78.300003 77.150002 77.760002 2914800 4
17-04-07 77.769997 78.239998 77.370003 77.480003 2203000 4
18-03-23 90.449997 92.360001 90.410004 91.820000 5100400 3
18-03-26 92.620003 92.989998 90.800003 91.370003 2769800 3
18-03-27 91.419998 93.739998 90.870003 92.809998 4357300 3
18-03-28 92.209999 93.540001 91.599998 91.760002 5509700 3
18-03-29 93.279999 94.370003 92.290001 92.389999 4914600 3
#利用concat连接不同数据类型
pieces1 = quotesdf[:3]
pieces2 = tempdf[:3]
pd.concat([pieces1, pieces2])
Out[38]:
adjclose close high low month open \
17-04-03 77.401367 78.589996 79.180000 77.970001 NaN 79.169998
17-04-04 77.076370 78.260002 78.610001 78.150002 NaN 78.489998
17-04-05 76.898369 77.760002 79.029999 77.660004 NaN 78.589996
17-04-03 NaN 78.589996 79.180000 77.970001 4.0 79.169998
17-04-04 NaN 78.260002 78.610001 78.150002 4.0 78.489998
17-04-05 NaN 77.760002 79.029999 77.660004 4.0 78.589996
volume
17-04-03 3022700
17-04-04 2563700
17-04-05 2858400
17-04-03 3022700
17-04-04 2563700
17-04-05 2858400
pd.concat([pieces1,pieces2],ignore_index = True)#即不使用连接轴上的信息
Out[39]:
adjclose close high low month open volume
0 77.401367 78.589996 79.180000 77.970001 NaN 79.169998 3022700
1 77.076370 78.260002 78.610001 78.150002 NaN 78.489998 2563700
2 76.898369 77.760002 79.029999 77.660004 NaN 78.589996 2858400
3 NaN 78.589996 79.180000 77.970001 4.0 79.169998 3022700
4 NaN 78.260002 78.610001 78.150002 4.0 78.489998 2563700
5 NaN 77.760002 79.029999 77.660004 4.0 78.589996 2858400
- join(SQL类型) 连接
#和数据库中的join概念相同,join前提是要有共同字段
pd.merge(diff.drop(['lasttrade'], axis = 1), AKdf, on = 'code') #该表未实际生成,此处了解即可