李宏毅大佬机器学习第一次作业
本文旨在详细讲解第一次作业(Assignment)的示例代码。
Sample Code
steps:
1, use csv to read file
2, parse training data to 5652 pairs of (x, y)
3, training using gradient descent, adagrad
4, predict testing pm2.5
1,import library
import csv
import numpy as np
import matplotlib.pyplot as plt
import math
2,read data
# Load the training measurements: `data` holds one list per measured item
# (18 items are recorded for every day), each list growing by 24 hourly
# values per day.
data = [[] for _ in range(18)]

# The context manager closes the file even if a row fails to parse;
# newline='' is what the csv module recommends for files it reads.
with open(r'hw1\train.csv', 'r', encoding='big5', newline='') as text:
    row = csv.reader(text, delimiter=',')
    next(row, None)  # the first line of train.csv only holds column labels
    for n_row, r in enumerate(row):
        # Rows cycle through the 18 items day after day, so the row index
        # modulo 18 selects the item this row belongs to.
        item_values = data[n_row % 18]
        # Columns 3..26 carry the 24 hourly readings of that day.
        for i in range(3, 27):
            # 'NR' marks a missing reading and is stored as 0.
            item_values.append(0.0 if r[i] == 'NR' else float(r[i]))

# According to the original notes, train.csv has 4320 data rows, i.e.
# (4320 / 18) * 24 = 5760 hourly values per item -> expected (18, 5760).
print(np.array(data).shape)
经过上面代码处理之后,data 变为一个 18 × 5760 的二维列表:每一行对应一种测项,每一列对应一个小时的观测值(原文此处配有示意图)。
3,parse data to (x, y)
# Turn the hourly series into (features, target) pairs using sliding
# 10-hour windows: 9 hours of all 18 items as features, and the PM2.5
# value (item index 9) of the 10th hour as the target.
x = []
y = []
for month in range(12):
    # Each month contributes 480 consecutive hours (20 days * 24 hours per
    # the original notes), so month `month` starts at offset 480 * month.
    month_start = 480 * month
    # 480 hours allow 480 - 9 = 471 distinct 10-hour windows.
    for offset in range(471):
        first_hour = month_start + offset
        # Item-major layout: 9 hours of item 0, then 9 hours of item 1, ...
        x.append([
            data[item][first_hour + hour]
            for item in range(18)
            for hour in range(9)
        ])
        y.append(data[9][first_hour + 9])

x = np.array(x)  # (5652, 162): 12 * 471 windows, 18 * 9 features each
y = np.array(y)  # (5652,)

# Optional experiment: append squared features before adding the bias,
# e.g. x = np.concatenate((x, x ** 2), axis=1)

# Prepend a column of ones so the bias is learned as weight 0.
bias_column = np.ones((x.shape[0], 1))
x = np.concatenate((bias_column, x), axis=1)
print(x.shape)  # (5652, 163)
4,init weight & other hyperparams
# Hyperparameters and initial model.
repeat = 10000  # number of training iterations
l_rate = 10     # base learning rate used by the Adagrad update
# One weight per column of x (the bias column included), all starting at 0;
# with the 163-column x built above this has shape (163,).
w = np.zeros(x[0].size)
5,start training
# Train the linear model with full-batch gradient descent and Adagrad.
x_t = x.transpose()
# Running sum of squared gradients, one entry per weight (Adagrad state).
s_gra = np.zeros(len(x[0]))
for i in range(repeat):
    hypo = np.dot(x, w)                # predictions, one per sample
    loss = hypo - y                    # prediction minus ground truth
    cost = np.sum(loss ** 2) / len(x)  # mean squared error
    cost_a = math.sqrt(cost)           # RMSE, used only for the progress line
    # Gradient of the squared error for all weights at once (matrix product
    # instead of a per-weight loop). The constant factor is omitted: the
    # Adagrad ratio gra / ada does not change when gra is rescaled.
    gra = np.dot(x_t, loss)
    s_gra += gra ** 2
    ada = np.sqrt(s_gra)
    # A weight whose gradient has always been exactly 0 (e.g. an all-zero
    # feature column) has ada == 0; dividing would yield NaN and poison w.
    # Leave such weights untouched; all other weights get the same update
    # as a plain l_rate * gra / ada.
    step = np.divide(l_rate * gra, ada, out=np.zeros_like(gra), where=ada != 0)
    w = w - step
    print('iteration : %d | Cost: %f ' % (i+1, cost_a))
6,save/read model
# Persist the trained weights, then load them back from disk so the
# prediction step below works from the saved model.
model_path = r'hw1\model.npy'
np.save(model_path, w)
w = np.load(model_path)
print(w.shape)
7,read test data
# Load the test set: every 18 consecutive rows of test.csv form one sample
# (one row per measured item), and columns 2..10 hold 9 hourly readings.
test_x = []

# The context manager closes the file even if a row fails to parse.
with open(r'hw1\test.csv', 'r', newline='') as text:
    row = csv.reader(text, delimiter=',')
    for n_row, r in enumerate(row):
        if n_row % 18 == 0:
            # First row of a new 18-row group: start a new sample.
            test_x.append([])
        sample = test_x[n_row // 18]
        for i in range(2, 11):
            # 'NR' marks a missing reading and is stored as 0; handling it
            # on every row avoids a ValueError should it ever appear on the
            # first row of a group.
            sample.append(0.0 if r[i] == 'NR' else float(r[i]))

test_x = np.array(test_x)
print(test_x.shape)  # expected (240, 162): 240 samples, 18 * 9 features

# Optional experiment, must mirror the training features:
# test_x = np.concatenate((test_x, test_x ** 2), axis=1)

# Prepend the bias column of ones, matching the training matrix layout.
test_x = np.concatenate((np.ones((test_x.shape[0], 1)), test_x), axis=1)
print(test_x.shape)  # expected (240, 163)
8,get predict.csv with your model
# Predict PM2.5 for every test sample and write the results to predict.csv.
ans = []  # rows for the CSV: [id string, predicted value]
yy = []   # predicted values only, kept for plotting
print(w.shape)          # expected (163,)
print(test_x.shape)     # expected (240, 163)
# Index 0 is used explicitly; the loop variable of this block does not
# exist yet at this point.
print(test_x[0].shape)  # expected (163,)
for i, sample in enumerate(test_x):
    # Both operands are 1-D, so np.dot is the sum of element-wise products.
    a = np.dot(w, sample)
    ans.append(['id_' + str(i), a])
    yy.append(a)
print(np.array(ans).shape)  # expected (240, 2)

# Write the predictions; the context manager closes the file even if a
# write fails.
filename = r'hw1\predict.csv'
with open(filename, "w+") as text:
    s = csv.writer(text, delimiter=',', lineterminator='\n')
    s.writerow(['id', 'value'])
    s.writerows(ans)
9,plot
# Compare the predictions with the reference answers in a single plot.
# The context manager makes sure the answer file is closed after reading.
with open(r'hw1\ans.csv', 'r', encoding='big5', newline='') as text:
    row = csv.reader(text, delimiter=',')
    next(row, None)  # skip the header line
    # Column 1 holds the reference value. Convert it to float: plotting the
    # raw strings would make matplotlib treat them as categorical labels
    # instead of numbers.
    y = [float(r[1]) for r in row]

plt.figure(figsize=(13, 7))
# Derive the x positions from the data instead of hard-coding 240 samples.
plt.plot(np.arange(len(yy)), yy, 'r', label='prediction pm2.5')
plt.plot(np.arange(len(y)), y, 'b', label='ans pm2.5')
plt.legend()
plt.show()