1. Basic Principles
A neural network is a machine learning model that simulates biological neural networks. Its implementation is illustrated below with a three-layer network consisting of an input layer, a hidden layer and an output layer, where $x_i$ denotes the input of the $i$-th neuron, $a_i^{(l)}$ denotes the $i$-th neuron of layer $l$, and $\Theta^{(l)}$ denotes the weight matrix mapping layer $l$ to layer $l+1$.
From this network, the hidden-layer activations can be computed as
$$a^{(1)} = x, \qquad z^{(2)} = \Theta^{(1)} a^{(1)}, \qquad a^{(2)} = g(z^{(2)}),$$
and the output-layer activations as
$$z^{(3)} = \Theta^{(2)} a^{(2)}, \qquad h_\Theta(x) = a^{(3)} = g(z^{(3)}),$$
where the activation function is the sigmoid
$$g(z) = \frac{1}{1 + e^{-z}}.$$
The regularized cost function is
$$J(\Theta) = -\frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\Big[y_k^{(i)}\log\big(h_\Theta(x^{(i)})\big)_k + \big(1 - y_k^{(i)}\big)\log\big(1 - (h_\Theta(x^{(i)}))_k\big)\Big] + \frac{\lambda}{2m}\sum_{l=1}^{L-1}\sum_{i=1}^{s_l}\sum_{j=1}^{s_{l+1}}\big(\Theta_{ji}^{(l)}\big)^2,$$
where $L$ is the number of layers, $s_l$ is the number of neurons in layer $l$, and $K$ is the number of neurons in the output layer (this is also the main difference from the logistic regression cost function).
Through this procedure, information flows forward through the network and the cost function can be evaluated; this is called forward propagation. To compute the gradient, the errors also have to be propagated backwards (error backpropagation, BP).
Taking the three-layer network as an example, let $\delta_j^{(l)}$ denote the error of neuron $j$ in layer $l$. Then $\delta^{(3)} = a^{(3)} - y$ and $\delta^{(2)} = \big(\Theta^{(2)}\big)^{T}\delta^{(3)} \odot g'(z^{(2)})$; there is no $\delta^{(1)}$, because the input carries no error. Since the sigmoid activation satisfies $g'(z) = g(z)\big(1 - g(z)\big)$, this derivative can already be obtained during forward propagation. The backpropagation procedure for computing the gradient is then: for each training sample, compute $a^{(1)}, a^{(2)}, a^{(3)}$ in the forward pass and the errors $\delta^{(3)}, \delta^{(2)}$ in the backward pass, accumulate $\Delta^{(l)} := \Delta^{(l)} + \delta^{(l+1)}\big(a^{(l)}\big)^{T}$, and finally obtain the gradient of the cost function as $\frac{\partial J(\Theta)}{\partial \Theta^{(l)}} = \frac{1}{m}\Delta^{(l)} + \frac{\lambda}{m}\Theta^{(l)}$ (with the bias columns excluded from the regularization term). The detailed derivation is shown in the figure below; some of the variable names there differ slightly, and MSE denotes the mean squared error (mean square error, MSE).
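To make the procedure above concrete, the following is a minimal NumPy sketch of one forward and one backward pass on a toy three-layer network; the layer sizes, weights, sample and label here are made-up illustrations and are independent of the implementation in the next section.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# hypothetical toy sizes for illustration: 3 inputs, 4 hidden units, 2 outputs
rng = np.random.default_rng(0)
Theta1 = rng.normal(scale=0.1, size=(4, 3 + 1))   # maps layer 1 -> layer 2 (first column is the bias)
Theta2 = rng.normal(scale=0.1, size=(2, 4 + 1))   # maps layer 2 -> layer 3 (first column is the bias)
x = rng.normal(size=3)                            # one training sample
y = np.array([1.0, 0.0])                          # its one-hot label

# forward propagation
a1 = np.concatenate(([1.0], x))                   # add the bias unit
z2 = Theta1 @ a1
a2 = np.concatenate(([1.0], sigmoid(z2)))
z3 = Theta2 @ a2
a3 = sigmoid(z3)                                  # h_Theta(x)

# error backpropagation
delta3 = a3 - y
delta2 = (Theta2[:, 1:].T @ delta3) * sigmoid(z2) * (1.0 - sigmoid(z2))

# per-sample gradient contributions Delta^(l) = delta^(l+1) (a^(l))^T
Delta2 = np.outer(delta3, a2)
Delta1 = np.outer(delta2, a1)
print(Delta1.shape, Delta2.shape)                 # (4, 4) and (2, 5)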
2. Algorithm Implementation
2.1 Manual implementation
1. Import packages and the data file
import numpy as np
import scipy.io as spio
import matplotlib.pyplot as plt
from scipy import optimize
import time
datafile = 'data_digits.mat'
2. Main function: the overall neural network training procedure
def NeuralNetworks(input_layer_size,hidden_layer_size,output_layer_size):
    data_img = spio.loadmat(datafile)        # load the handwritten-digit data
    X = data_img['X']
    Y = data_img['y']
    m,n = X.shape
    # display a random selection of training images
    rand_indices = np.random.randint(0,m,1000)
    plot_data(X[rand_indices,:])
    Lambda = 1                               # regularization parameter
    # random initialization breaks the symmetry between hidden units
    init_theta_1 = randInitWeights(input_layer_size,hidden_layer_size)
    init_theta_2 = randInitWeights(hidden_layer_size,output_layer_size)
    init_nn_params = np.vstack((init_theta_1.reshape(-1,1),init_theta_2.reshape(-1,1)))
    start = time.time()
    # minimize the cost with conjugate gradients, using the backpropagation gradient
    result = optimize.fmin_cg(nnCostFunction,init_nn_params,fprime=nnGradient,args=(input_layer_size,hidden_layer_size,output_layer_size,X,Y,Lambda),maxiter=100)
    print('run time:',time.time()-start)
    print(result)
    # unroll the optimized parameter vector back into the two weight matrices
    length = result.shape[0]
    theta_1 = result[0:hidden_layer_size*(input_layer_size+1)].reshape(hidden_layer_size,input_layer_size+1)
    theta_2 = result[hidden_layer_size*(input_layer_size+1):length].reshape(output_layer_size,hidden_layer_size+1)
    # visualize the learned weights (bias columns excluded)
    plot_data(theta_1[:,1:])
    plot_data(theta_2[:,1:])
    p = predict(theta_1,theta_2,X)
    print('accuracy:{:.2f}%'.format(np.mean(np.float64(p == Y.reshape(-1,1)))*100))
    res = p.reshape(-1,1)
    np.savetxt('predict.txt',res,delimiter=',')
3. Cost function: a concrete implementation of the cost defined above
def nnCostFunction(nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,Lambda):
    # recover the two weight matrices from the flattened parameter vector
    length = nn_params.shape[0]
    theta_1 = nn_params[0:hidden_layer_size*(input_layer_size+1)].reshape(hidden_layer_size,input_layer_size+1)
    theta_2 = nn_params[hidden_layer_size*(input_layer_size+1):length].reshape(num_labels,hidden_layer_size+1)
    m = X.shape[0]
    # map the labels to one-hot rows
    class_y = np.zeros((m,num_labels))
    for i in range(num_labels):
        class_y[:,i] = np.int32(y==i).ravel()
    # sum of squared weights, excluding the bias columns, for the regularization term
    theta1_x = theta_1[:,1:]
    theta2_x = theta_2[:,1:]
    reg = np.sum(theta1_x**2) + np.sum(theta2_x**2)
    # forward propagation
    a1 = np.hstack((np.ones((m,1)),X))
    z2 = np.dot(a1,np.transpose(theta_1))
    a2 = sigmoid(z2)
    a2 = np.hstack((np.ones((m,1)),a2))
    z3 = np.dot(a2,np.transpose(theta_2))
    h = sigmoid(z3)
    # cross-entropy cost plus regularization
    J = -(np.dot(np.transpose(class_y.reshape(-1,1)),np.log(h.reshape(-1,1)))+np.dot(np.transpose(1-class_y.reshape(-1,1)),np.log(1-h.reshape(-1,1))))/m + reg*Lambda/2/m
    return J.item()
4. Gradient computation: error backpropagation
def nnGradient(nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,Lambda):
    # recover the weight matrices (copies, because the bias columns are zeroed below)
    length = nn_params.shape[0]
    theta_1 = nn_params[0:hidden_layer_size*(input_layer_size+1)].reshape(hidden_layer_size,input_layer_size+1).copy()
    theta_2 = nn_params[hidden_layer_size*(input_layer_size+1):length].reshape(num_labels,hidden_layer_size+1).copy()
    m = X.shape[0]
    # one-hot encoding of the labels
    class_y = np.zeros((m,num_labels))
    for i in range(num_labels):
        class_y[:,i] = np.int32(y==i).ravel()
    theta2_x = theta_2[:,1:]                # hidden-to-output weights without the bias column
    theta1_grad = np.zeros((theta_1.shape))
    theta2_grad = np.zeros((theta_2.shape))
    # forward propagation
    a1 = np.hstack((np.ones((m,1)),X))
    z2 = np.dot(a1,np.transpose(theta_1))
    a2 = sigmoid(z2)
    a2 = np.hstack((np.ones((m,1)),a2))
    z3 = np.dot(a2,np.transpose(theta_2))
    h = sigmoid(z3)
    # backpropagate the errors sample by sample and accumulate the gradients
    delta_3 = np.zeros((m,num_labels))
    delta_2 = np.zeros((m,hidden_layer_size))
    for i in range(m):
        delta_3[i,:] = h[i,:] - class_y[i,:]
        theta2_grad += np.dot(np.transpose(delta_3[i,:].reshape(1,-1)),a2[i,:].reshape(1,-1))
        delta_2[i,:] = np.dot(delta_3[i,:].reshape(1,-1),theta2_x)*sigmoidGradient(z2[i,:])
        theta1_grad += np.dot(np.transpose(delta_2[i,:].reshape(1,-1)),a1[i,:].reshape(1,-1))
    # do not regularize the bias columns
    theta_1[:,0] = 0
    theta_2[:,0] = 0
    grad = (np.vstack((theta1_grad.reshape(-1,1),theta2_grad.reshape(-1,1)))+Lambda*np.vstack((theta_1.reshape(-1,1),theta_2.reshape(-1,1))))/m
    return np.ravel(grad)
5. Helper functions
def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

def sigmoidGradient(z):
    # derivative of the sigmoid: g'(z) = g(z)(1 - g(z))
    return sigmoid(z)*(1-sigmoid(z))

def randInitWeights(L_in,L_out):
    # uniform initialization in [-epsilon, epsilon] with epsilon = sqrt(6/(L_in + L_out))
    init_epsilon = (6.0/(L_out+L_in))**0.5
    w = np.random.rand(L_out,1+L_in)*2*init_epsilon - init_epsilon
    return w

def checkGradient(Lambda=0):
    # compare the backpropagation gradient with a numerical finite-difference gradient
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    init_theta_1 = debugInitWeights(input_layer_size,hidden_layer_size)
    init_theta_2 = debugInitWeights(hidden_layer_size,num_labels)
    X = debugInitWeights(input_layer_size-1,m)
    y = np.mod(np.arange(1,m+1),num_labels).reshape(-1,1)
    nn_params = np.vstack((init_theta_1.reshape(-1,1),init_theta_2.reshape(-1,1)))
    bp_grad = nnGradient(nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,Lambda)
    num_grad = np.zeros((nn_params.shape[0]))
    step = np.zeros((nn_params.shape[0]))
    error = 1e-4
    for i in range(nn_params.shape[0]):
        step[i] = error
        loss_1 = nnCostFunction(nn_params-step.reshape(-1,1),input_layer_size,hidden_layer_size,num_labels,X,y,Lambda)
        loss_2 = nnCostFunction(nn_params+step.reshape(-1,1),input_layer_size,hidden_layer_size,num_labels,X,y,Lambda)
        num_grad[i] = (loss_2-loss_1)/(2*error)
        step[i] = 0
    # print the two gradients side by side; they should agree closely
    res = np.hstack((num_grad.reshape(-1,1),bp_grad.reshape(-1,1)))
    print(res)

def debugInitWeights(fan_in,fan_out):
    # deterministic pseudo-random values so the gradient check is reproducible
    w = np.sin(np.arange(1,fan_out*(fan_in+1)+1)).reshape(fan_out,fan_in+1)/10
    return w
6. Plotting and prediction functions
def plot_data(ImgData):
    # tile the rows of ImgData into one image grid and display it
    count = 0
    m,n = ImgData.shape
    width = np.int32(np.round(np.sqrt(n)))
    height = np.int32(n/width)
    rows = np.int32(np.floor(np.sqrt(m)))
    cols = np.int32(np.ceil(m/rows))
    pad = 1
    display_array = -np.ones((pad+rows*(height+pad),pad+cols*(width+pad)))
    for i in range(rows):
        for j in range(cols):
            if count >= m:
                break
            display_array[pad+i*(height+pad):pad+i*(height+pad)+height,pad+j*(width+pad):pad+j*(width+pad)+width] = ImgData[count,:].reshape(height,width,order='F')
            count += 1
        if count >= m:
            break
    plt.imshow(display_array,cmap='gray')
    plt.axis('off')
    plt.show()

def predict(theta_1,theta_2,X):
    m = X.shape[0]
    # forward propagate through the trained network
    X = np.hstack((np.ones((m,1)),X))
    h1 = sigmoid(np.dot(X,np.transpose(theta_1)))
    h1 = np.hstack((np.ones((m,1)),h1))
    h2 = sigmoid(np.dot(h1,np.transpose(theta_2)))
    # the predicted class is the output unit with the largest activation
    p = np.argmax(h2,axis=1).reshape(-1,1)
    return p
7. Calling the main function
if __name__=='__main__':
    checkGradient()                 # verify the backpropagation gradient first
    NeuralNetworks(400,25,10)       # 400 input pixels, 25 hidden units, 10 classes
The results are shown below: 100 iterations, a run time of about 90 seconds, and a prediction accuracy of 98.46% (on the training data).
Warning: Maximum number of iterations has been exceeded.
Current function value: 0.364905
Iterations: 100
Function evaluations: 235
Gradient evaluations: 235
run time: 89.65801429748535
accuracy:98.46%
2.2 Using the sklearn package
Load the MLPClassifier model from sklearn's neural_network module and the make_moons dataset to perform classification with a neural network; the mglearn library is used as a plotting aid.
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import mglearn
X,y = make_moons(n_samples=200,noise=0.25,random_state=3)
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y,random_state=3)
mlp = MLPClassifier(solver='lbfgs',random_state=0,hidden_layer_sizes=[100]).fit(X_train,y_train)
mglearn.plots.plot_2d_separator(mlp,X_train,fill=True,alpha=0.3)
mglearn.discrete_scatter(X_train[:,0],X_train[:,1],y_train)
plt.xlabel('feature 0')
plt.ylabel('feature 1')
print('Accuracy on training set:{:.2f}%'.format(mlp.score(X_train,y_train)*100))
print('Accuracy on test set:{:.2f}%'.format(mlp.score(X_test,y_test)*100))
With a single hidden layer of 100 neurons, the 'lbfgs' solver, and the default 'relu' activation, the results and decision boundary are:
Accuracy on training set:100.00%
Accuracy on test set:96.00%
Changing the parameters to two hidden layers of 10 neurons each, the 'lbfgs' solver, and the 'logistic' activation (i.e. sigmoid), the results and decision boundary become:
mlp = MLPClassifier(solver='lbfgs',activation='logistic',random_state=0,hidden_layer_sizes=[10,10]).fit(X_train,y_train)
Accuracy on training set:99.33%
Accuracy on test set:92.00%
3. Discussion
3.1 Empirical risk minimization and structural risk minimization
Most machine learning problems can be reduced to optimizing either the empirical risk (a loss function) or the structural risk (a loss function plus a regularization term).
The empirical risk is the expectation of the loss function over the data; it can be defined as the mean of the loss over a set of samples:
$$R_{\mathrm{emp}}(f) = \frac{1}{m}\sum_{i=1}^{m} L\big(y_i, f(x_i)\big).$$
Minimizing this risk then gives the parameters
$$\hat{\theta} = \arg\min_{\theta} \frac{1}{m}\sum_{i=1}^{m} L\big(y_i, f(x_i;\theta)\big),$$
where $\{(x_i, y_i)\}_{i=1}^{m}$ is the sample set and $\theta$ denotes the parameters of the learned model.
As mentioned earlier, loss functions are usually defined from one of two angles:
1. Error functions (Error Function, ERF), the most common being the mean squared error (mean-square error, MSE) and the mean absolute error (mean-absolute error, MAE);
2. The negative log-likelihood (Negative Log Likelihood, NLL), i.e. $\mathrm{NLL}(\theta) = -\sum_{i=1}^{m}\log p\big(y_i \mid x_i;\theta\big)$.
The two are related as follows: minimizing the squared error is equivalent to minimizing the NLL when the residuals follow a Gaussian distribution, and minimizing the absolute error is equivalent to minimizing the NLL when the residuals follow a Laplace distribution.
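As a short check of the first equivalence (a sketch, assuming the residuals $y_i - f(x_i)$ are i.i.d. Gaussian with a fixed variance $\sigma^2$):
$$-\log\prod_{i=1}^{m}\frac{1}{\sqrt{2\pi}\,\sigma}\exp\!\Big(-\frac{(y_i-f(x_i))^2}{2\sigma^2}\Big)=\frac{1}{2\sigma^2}\sum_{i=1}^{m}\big(y_i-f(x_i)\big)^2+m\log\big(\sqrt{2\pi}\,\sigma\big).$$
Since $\sigma$ is fixed, minimizing this NLL is the same as minimizing the sum (or mean) of squared errors; replacing the Gaussian density with a Laplace density $\frac{1}{2b}e^{-|y_i-f(x_i)|/b}$ gives the absolute-error case in the same way.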
The empirical risk does not take into account how well the model's capacity matches the data. Minimizing the empirical risk while also balancing model capacity against the data, so as to avoid overfitting, leads to a new objective: structural risk minimization (Structural Risk Minimization).
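A minimal sketch that makes the two objectives concrete for a linear model; the toy data, the squared loss and the L2 penalty below are illustrative assumptions rather than anything prescribed by the text above.
import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(50, 3))                                   # toy feature matrix
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.normal(size=50) # toy targets

def empirical_risk(w, X, y):
    # mean of the per-sample (squared) losses over the sample set
    return np.mean((X @ w - y) ** 2)

def structural_risk(w, X, y, lam=0.1):
    # empirical risk plus a complexity penalty (here an L2 norm on the parameters)
    return empirical_risk(w, X, y) + lam * np.sum(w ** 2)

w = np.array([1.0, -2.0, 0.5])
print(empirical_risk(w, X, y), structural_risk(w, X, y))       # the second value adds the penalty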