LSTM(Long Short-Term Memory)是长短期记忆网络,是一种时间递归神经网络,适合用来处理时间序列数据,在时间序列的预测问题中表现尤为突出。下图是百度百科中的LSTM“三重门”结构:
时间序列是指按时间顺序排列的、随时间变化的且相互关联的数据序列。实际中遇到的时间序列往往有三个主要特性:趋势性、季节性和非平稳性,而时间序列分析一般都是预测问题。
1、数据集:北京的连续2360天的温度数据
(原始数据好像上传不了,直接上图好了。)
2、实验
实验部分可分为四部分:数据预处理、建模、模型训练和预测。
实验中,按照训练集:测试集 = 9:1的比例做数据集划分,滑动窗口取了20,也可以理解为这20个点中,前19个点作为data,最后一个点作为label,LSTM的网络结构如下:
实验结果貌似还不错,测试集包含234个点,训练150轮,在非归一化的数据上,loss收敛到了6左右,下图是预测的2016.11.9到2017.6.30之间234天的每日最高温与当日真实值的对比图。蓝线表示真实值,橙线表示预测值。
既然测试集上效果还比较理想,索性开一下脑洞,预测出未来31天的最高温,反正没有真实值做对比,结果飞了就飞了,坐等打脸了[捂脸]。
# 预测得到的7月份31天每日最高温数值
predict_series : [33.0, 32.0, 33.0, 32.0, 31.0, 32.0, 32.0, 31.0, 31.0, 32.0, 31.0, 31.0, 32.0, 30.0, 31.0, 31.0, 30.0, 30.0, 30.0, 30.0, 30.0, 29.0, 29.0, 29.0, 29.0, 29.0, 28.0, 28.0, 28.0, 28.0, 30.0]
# coding: utf-8
#预测7月份北京的每日最高温
__author__ = "孙欢"
import xlrd
import xlwt
import os
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from numpy import newaxis
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
# Module-level state shared by the functions and the script section below
day_high = [] # daily high temperatures, filled by load_data
day_low = [] # daily low temperatures, filled by load_data
predict_series = [] # rounded forecast values collected by the script section
result = [] # scratch list for flattened model outputs
day_new = [] # rebound to day_high in the script section before forecasting
out_put = [] # not referenced anywhere else in the visible code
#加载数据
def load_data(xls_file):
    """Read temperature columns from every sheet of an Excel workbook.

    For each sheet, row 0 is skipped; for every remaining row the value in
    column 1 is appended to the module-level ``day_high`` list and the value
    in column 2 to the module-level ``day_low`` list.

    Args:
        xls_file: Path of the workbook, handed to ``xlrd.open_workbook``.

    Returns:
        None. The values are accumulated in ``day_high`` and ``day_low``.
    """
    workbook = xlrd.open_workbook(xls_file)
    for sheet in workbook.sheets():
        # Row 0 is skipped (presumably a header row -- confirm against the data file).
        for row_idx in range(1, sheet.nrows):
            cells = sheet.row_values(row_idx)
            day_high.append(cells[1])  # daily high temperature
            day_low.append(cells[2])  # daily low temperature
#数据预处理
def data_preprocess(data0, sequence_length=20, train_fraction=0.9):
    """Cut a 1-D series into sliding windows and split them into train/test sets.

    Every window holds ``sequence_length`` consecutive values: the first
    ``sequence_length - 1`` values are the model input and the last value is
    the label. The leading ``train_fraction`` of the windows (in time order)
    becomes the training set and is shuffled row-wise; the remaining windows
    form the test set and keep their chronological order.

    NOTE: windows start at indices ``0 .. len(data0) - sequence_length - 1``,
    so the window that ends on the very last element of ``data0`` is not
    generated. This matches the original behaviour and is kept on purpose.

    Args:
        data0: Sequence of numeric values ordered in time. It is not modified.
        sequence_length: Total window length (inputs plus one label).
        train_fraction: Share of the windows used for training, in [0, 1].

    Returns:
        A list ``[x_train, y_train, x_test, y_test]`` where the ``x_*`` arrays
        have shape ``(samples, sequence_length - 1, 1)`` and the ``y_*``
        arrays have shape ``(samples,)``.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 2, if
            ``train_fraction`` lies outside [0, 1], or if ``data0`` is too
            short to produce at least one window.
    """
    if sequence_length < 2:
        raise ValueError(
            "sequence_length must be at least 2 (one input step plus the label)"
        )
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must lie between 0 and 1")
    if len(data0) <= sequence_length:
        # Without this guard the 2-D slicing below fails with an obscure
        # IndexError because np.array([]) is one-dimensional.
        raise ValueError(
            "need more than %d data points to build a window, got %d"
            % (sequence_length, len(data0))
        )

    windows = np.array(
        [data0[start:start + sequence_length]
         for start in range(len(data0) - sequence_length)]
    )

    # int() keeps the split usable as a slice index even where round()
    # returns a float (Python 2).
    split = int(round(train_fraction * windows.shape[0]))

    train = windows[:split, :]
    # Shuffles the training rows in place; the test rows are left untouched.
    np.random.shuffle(train)

    x_train = train[:, :-1]
    y_train = train[:, -1]
    x_test = windows[split:, :-1]
    y_test = windows[split:, -1]

    # Add a trailing feature axis: (samples, time steps, 1 feature).
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
    return [x_train, y_train, x_test, y_test]
#构建LSTM模型
def build_model(layers):  # e.g. layers = [1, 50, 100, 1]
    """Assemble and compile a stacked two-layer LSTM network.

    Args:
        layers: Indexable with four sizes. ``layers[0]`` is the input
            dimension, ``layers[1]`` and ``layers[2]`` are the sizes of the
            first and second LSTM layers, and ``layers[3]`` is the size of
            the dense output layer.

    Returns:
        The compiled ``Sequential`` model (loss ``"mse"``, optimizer
        ``"rmsprop"``).
    """
    network = Sequential()

    # Layers are created in the same order in which they are stacked.
    stack = (
        LSTM(input_dim=layers[0], output_dim=layers[1], return_sequences=True),
        Dropout(0.2),
        LSTM(layers[2], return_sequences=False),
        Dropout(0.2),
        Dense(output_dim=layers[3]),
        Activation("linear"),
    )
    for layer in stack:
        network.add(layer)

    # Report how long compilation takes.
    began = time.time()
    network.compile(loss="mse", optimizer="rmsprop")
    elapsed = time.time() - began
    print("Compilation Time : ", elapsed)
    return network
#直接预测
def predict_point_by_point(model, data):
    """Run one prediction pass over ``data`` and return the outputs flattened.

    Args:
        model: Object exposing ``predict(data)``.
        data: Input batch forwarded unchanged to ``model.predict``.

    Returns:
        A 1-D array holding every element of the prediction output.
    """
    raw_output = model.predict(data)
    print('predicted shape:', np.shape(raw_output))
    flat_output = np.reshape(raw_output, (raw_output.size,))
    return flat_output
#画图
def plot_results(predicted_data, true_data, filename):
    """Display the true series and the predicted series on a single chart.

    Args:
        predicted_data: Values drawn with the legend label 'Prediction'.
        true_data: Values drawn with the legend label 'True Data'.
        filename: Not used by the current implementation; the figure is only
            shown on screen, not saved.
    """
    figure = plt.figure(facecolor='white')
    axes = figure.add_subplot(1, 1, 1)
    # True values are drawn first so they take the first colour of the cycle.
    axes.plot(true_data, label='True Data')
    axes.plot(predicted_data, label='Prediction')
    axes.legend()
    plt.show()
#list解嵌套
def nested_list(list_raw, result):
    """Flatten arbitrarily nested lists into ``result``, depth first.

    Only ``list`` instances are descended into; any other item (tuples
    included) is appended as-is.

    Args:
        list_raw: Iterable whose items may themselves be lists.
        result: List that receives the flattened items; it is extended in
            place.

    Returns:
        The same ``result`` object that was passed in.
    """
    for element in list_raw:
        if not isinstance(element, list):
            result.append(element)
            continue
        # Nested list: flatten it into the same accumulator.
        nested_list(element, result)
    return result
#main
load_data("北京.xls")  # fills day_high / day_low from the workbook
model = build_model([1, 50, 100, 1])  # stacked LSTM: 1 input, 50 and 100 units, 1 output

# day_new is the same list object as day_high, so each forecast appended
# below also becomes part of the series used to build the next windows.
day_new = day_high
for i in range(31):
    X_train, y_train, _, _ = data_preprocess(day_new)
    model.fit(X_train, y_train, batch_size=512, nb_epoch=150, validation_split=0.05)

    # Forecast the next, not-yet-observed day from the most recent values.
    # The previous code used the last test-set prediction, whose target is
    # an already observed day, so it never produced a forward forecast.
    input_steps = X_train.shape[1]  # number of input points per window
    latest_window = np.reshape(
        np.array(day_new[-input_steps:], dtype=float), (1, input_steps, 1)
    )
    next_high = round(float(predict_point_by_point(model, latest_window)[-1]), 0)

    predict_series.append(next_high)
    print(predict_series)
    day_new.append(next_high)
    print(len(day_new))
print(predict_series)  # forecast daily highs for the 31 days of July