一文读懂深度学习

import random
import numpy as np
### Miscellaneous functions
def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + e^(-z)) of ``z``.

    ``np.exp`` is applied element-wise, so ``z`` may be a scalar or a
    NumPy array; the result has the same shape as ``z``.
    """
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def sigmoid_prime(z):
    """Return the derivative of the sigmoid function at ``z``.

    Uses the identity sigma'(z) = sigma(z) * (1 - sigma(z)).  The sigmoid
    is evaluated once and reused instead of being computed twice.
    """
    s = sigmoid(z)
    return s*(1-s)

class Network(object):
    """Fully connected feed-forward neural network with sigmoid neurons.

    The network is trained with mini-batch stochastic gradient descent and
    the gradients are computed with backpropagation.

    Attributes set by ``__init__``:
        num_layers: number of layers, counting the input layer.
        sizes: the list of layer widths passed to the constructor.
        biases: one ``(n, 1)`` column vector per non-input layer.
        weights: one ``(n, n_prev)`` matrix per non-input layer.
    """

    def __init__(self, sizes):
        """Create a network whose layer widths are given by ``sizes``.

        For example ``Network([4, 5, 3])`` has 4 inputs, one hidden layer
        of 5 neurons and 3 outputs.  All parameters are drawn from a
        standard normal distribution.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # A fixed seed makes the initial parameters reproducible.  Note that
        # this reseeds NumPy's global random generator.
        np.random.seed(3)
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        # Each weight matrix is (current layer, previous layer) so that
        # np.dot(w, a) maps the previous activations to the current layer.
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the output of the network if 'a' is input.

        'a' is expected to be an ``(n_inputs, 1)`` column vector; each
        layer applies ``sigmoid(w . a + b)`` to the previous activations.
        """
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            test_data=None):
        """Train the neural network using mini-batch stochastic
        gradient descent.  The 'training_data' is an iterable of tuples
        '(x, y)' representing the training inputs and the desired
        outputs.  'epochs' is the number of passes over the data,
        'mini_batch_size' the number of samples per update and 'eta'
        the learning rate.  If 'test_data' is provided then the
        network will be evaluated against the test data after each
        epoch, and partial progress printed out.  This is useful for
        tracking progress, but slows things down substantially.

        The caller's 'training_data' is never reordered: shuffling is
        done on a private copy."""
        # Materialise private lists: this accepts any iterable (for example
        # a ``zip`` object, which has no len()) and keeps the per-epoch
        # shuffle from mutating the caller's data.
        training_data = list(training_data)
        if test_data is not None:
            test_data = list(test_data)
        n_test = len(test_data) if test_data else 0
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]

            # One gradient-descent step per mini-batch.
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print("Epoch {0}: {1} / {2}".format(
                    j, self.evaluate(test_data), n_test))
            else:
                print("Epoch {0} complete".format(j))

    def update_mini_batch(self, mini_batch, eta):
        """Update the network's weights and biases by applying
        gradient descent using backpropagation to a single mini batch.
        The 'mini_batch' is a list of tuples '(x, y)', and 'eta'
        is the learning rate.  An empty mini batch leaves the
        parameters unchanged."""
        if len(mini_batch) == 0:
            # Nothing to average over; also avoids dividing by zero below.
            return
        # Accumulators for the summed gradients of every sample in the batch.
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        # Dividing by the batch size turns the summed gradient into a mean.
        self.weights = [w-(eta/len(mini_batch))*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return a tuple ``(nabla_b, nabla_w)`` representing the
        gradient for the cost function C_x.  ``nabla_b`` and
        ``nabla_w`` are layer-by-layer lists of numpy arrays, similar
        to ``self.biases`` and ``self.weights``."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x] # list to store all the activations, layer by layer
        zs = [] # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass: error of the output layer first
        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        # Note that the variable l in the loop below counts layers from
        # the end: l = 1 means the last layer of neurons, l = 2 is the
        # second-last layer, and so on.  This takes advantage of the fact
        # that Python can use negative indices in lists.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            # Propagate the error back through the weights of the next layer.
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

    def evaluate(self, test_data):
        """Return the number of test inputs for which the neural
        network outputs the correct result. The network's prediction
        is the index of whichever neuron in the final layer has the
        highest activation.  Each label 'y' may be either that class
        index or a one-hot vector, in which case its argmax is used."""
        correct = 0
        for x, y in test_data:
            predicted = np.argmax(self.feedforward(x))
            # A label with more than one element is treated as one-hot.
            expected = np.argmax(y) if np.size(y) > 1 else y
            correct += int(predicted == expected)
        return correct

    def cost_derivative(self, output_activations, y):
        r"""Return the vector of partial derivatives \partial C_x /
        \partial a for the output activations."""
        return (output_activations-y)

# net = Network([4, 5, 3])
# print('biases:',net.biases)
# print('weights',net.weights)

拆分代码去理解他的意思！ Let's Go!

import random
import numpy as np
### Miscellaneous functions
def sigmoid(z):      # The sigmoid function: 1 / (1 + e^(-z))
    # np.exp works element-wise, so z may be a scalar or a NumPy array.
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):    # Derivative of the sigmoid function
    # Uses the identity sigma'(z) = sigma(z) * (1 - sigma(z)).
    return sigmoid(z)*(1-sigmoid(z))

这一段代码就非常简单了，导入基础包，然后定义好 sigmoid 函数和它的导数公式，比较简单，自己求一遍，或者百度一下求导过程都很好理解。这里就不再赘述。

class Network(object):
    """Feed-forward network; this excerpt shows only the constructor."""

    def __init__(self, sizes):
        """Create the parameters for layers whose widths are ``sizes``.

        ``sizes`` lists the number of neurons per layer, input layer first.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        np.random.seed(3)  # fixed seed so the random parameters are reproducible
        # One (n, 1) bias column vector for every layer except the input layer.
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        # One (current layer, previous layer) weight matrix per pair of
        # adjacent layers; x is the previous width and y the current width.
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

这一段是先设计好类以及类里面的变量：
sizes 是一个 list 类型的值，指的是每一层网络的节点数：
比如我设置 sizes=[3,6,3]，指的就是 input layer 有三个节点，hidden layer1 有 6 个节点，output layer 有三个节点。
再通俗一点就是输入了三个特征，输出了一个三行一列的列向量。

num_layers 顾名思义就是有多少层的意思，显然我上个例子有三层。
np.random.seed(3) 就是设置一下随机种子，当然也可以不是 3 而是其他数。
biases 指的是偏置向量，就相当于线性回归 Wx+b 中的 b，它是一个列向量。
weights 就是权重矩阵。weights 矩阵行列数的设置都是根据每层网络的节点数设定的。
参数都是用 np.random.randn 随机出来的。
可以参考NG的内容（对于参数矩阵的维度的问题）：

image.png

W: (当前层节点数, 前一层节点数)
b: (当前层节点数, 1)
这一点也比较好理解，自己画个神经网络图，然后算一下行列数都能得到。实在不懂的话，看NG的视频吧。
主要代码那里，biases就是除去输入层，剩下的层都是（该层的节点数，1）
weights那里，x, y in zip(sizes[:-1], sizes[1:])] 这里的顺序是x,y 。实际的矩阵是np.random.randn(y, x)。

x, y 的顺序又反过来了：这样 W 的形状才是（当前层节点数，前一层节点数），np.dot(w, a) 才能和前一层的列向量 a 相乘。

    def feedforward(self, a):
        """Return the output of the network if 'a' is input."""
        # Each layer replaces 'a' with sigmoid(w . a + b), so after the loop
        # 'a' holds the activations of the last layer.
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

这里就是简单的前向传输，有多少层的参数，就循环多少层，这个很好理解。
然后一直更新a的值。

 def SGD(self, training_data, epochs, mini_batch_size, eta,
            test_data=None):
        """Train the neural network using mini-batch stochastic
        gradient descent.  The 'training_data' is a list of tuples
        '(x, y)' representing the training inputs and the desired
        outputs.  'epochs' is the number of passes over the data,
        'mini_batch_size' the number of samples per update and 'eta'
        the learning rate.  If 'test_data' is provided then the
        network will be evaluated against the test data after each
        epoch, and partial progress printed out.  This is useful for
        tracking progress, but slows things down substantially."""
        if test_data: n_test = len(test_data)
        n = len(training_data)
        for j in range(epochs):
            # NOTE: shuffles the caller's list in place.
            random.shuffle(training_data)
            # Cut the shuffled data into consecutive slices of at most
            # mini_batch_size samples.
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            # One gradient-descent step per mini-batch.
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print("Epoch {0}: {1} / {2}".format(
                    j, self.evaluate(test_data), n_test))
            else:
                print("Epoch {0} complete".format(j))

Epoch就是我重复以下动作做循环。
这一段代码，相当于把数据shuffle一下，然后分成一个个mini_batches，通俗点就是洗牌后分成一个个数据块。
mini_batches是数据块的集合。
在每个小mini_batch中，去做更新参数的动作。
专业术语这个叫 mini_batch SGD。

def update_mini_batch(self, mini_batch, eta):
        """Update the network's weights and biases by applying
        gradient descent using backpropagation to a single mini batch.
        The 'mini_batch' is a list of tuples '(x, y)', and 'eta'
        is the learning rate."""
        # Zero-filled accumulators shaped like the biases and weights; they
        # collect the summed gradients of all samples in the batch.
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # Add the gradient of every single sample to the accumulators.
        for x, y in mini_batch:
            # Gradient of the cost for this one sample.
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        # Dividing by len(mini_batch) turns the summed gradient into a mean.
        # NOTE: an empty mini_batch would divide by zero here.
        self.weights = [w-(eta/len(mini_batch))*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb
                       for b, nb in zip(self.biases, nabla_b)]

这一段代码，乍一看有点绕，慢慢解读。
首先创建了零矩阵，对应着biases 和 weights的shape。
创建这几个零矩阵是为了累加 mini_batch 中每一条数据算出来的梯度（导数）：对每一条数据用 backprop 计算一次导数，然后不断叠加，最后叠加得到的总和就是我用来更新参数的导数。
eta/len(mini_batch) 是因为 batch 里有 n 条数据，这个导数是 n 条数据叠加出来的，所以最后更新的时候要除以叠加的个数，相当于取平均。
不过我觉得不除以 n 问题应该也不大（相当于把学习率放大了 n 倍，需要相应调小 eta），感兴趣的可以试试。

    def backprop(self, x, y):
        """Return a tuple ``(nabla_b, nabla_w)`` representing the
        gradient for the cost function C_x.  ``nabla_b`` and
        ``nabla_w`` are layer-by-layer lists of numpy arrays, similar
        to ``self.biases`` and ``self.weights``."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward, remembering every intermediate value
        activation = x
        activations = [x] # list to store all the activations, layer by layer
        zs = [] # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass: output-layer error = dC/da * sigmoid'(z)
        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        # Gradient w.r.t. the last weights: delta times the previous
        # layer's activations (transposed to give a weight-shaped matrix).
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        # Note that the variable l in the loop below counts layers from
        # the end: l = 1 means the last layer of neurons, l = 2 is the
        # second-last layer, and so on.  This takes advantage of the fact
        # that Python can use negative indices in lists.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            # Push the error back through the next layer's weights, then
            # through this layer's sigmoid.
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta  # the bias gradient of this layer is delta itself
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

接下来是最有意思的部分，反向传播部分，这里是深度学习的关键，核心。
主要知识点有偏导数，链式法则，损失函数这些。
从这里就可以明确的看出我们的网络的函数是什么。
用一张NG的图像概括：

image.png

就是一个节点包含两层，一层线性变化，加一层非线性变化。
线性变化可以理解为简单的线性回归。
非线性变化是sigmoid函数。

同样的先创建同样参数矩阵的零向量，用来保存导数。
activations 包含了输入的特征，以及每一层网络经过 sigmoid 函数出来的值。
zs包含了每一个层网络，线性回归出来的值。
求出来每一层的结果后。
我们就需要开始计算导数了。
这里主要用的就是直接求偏导，硬求。
用误差乘以 sigmoid 的导数，再乘以线性回归的导数...

image.png

之所以用误差乘，是因为，如果误差很大我们相对于这个参数进行比较大的改变，反之，如果误差很小，我们相对这个参数进行小小的改动即可，这就是为什么乘上误差。
delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
注意这一行代码，为什么变成了乘以 weights，我在上图也写到了：因为计算上一层的时候，我们要先对上一层的输出求偏导，由此链式法则就接上了，再对上一层的 weights 和 biases 求偏导。
以及这里weights和delta变换位置也挺关键的。
这里分了计算输出层，和隐藏层的偏导。
再细致一点：

        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

这里对输出层求偏导，也就是对于实际值与预测值的误差，乘上偏导求的，这一段比较好理解。

        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta  
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())

这里是对于其他隐藏层求偏导，也很好理解，乘上weights是对前一层的输出求导，然后*sp就是对于sigmoid函数求导。
然后 biases 的偏导就等于这个 delta，weights 的偏导还要再乘以前一层的输出。

def evaluate(self, test_data):
        """Return the number of test inputs for which the neural
        network outputs the correct result. Note that the neural
        network's output is assumed to be the index of whichever
        neuron in the final layer has the highest activation."""
        # Pair the predicted class index (argmax of the output layer) with
        # the label y of every test sample.
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        # NOTE: int(x == y) expects y to be a single class index; a one-hot
        # vector label would have to be converted with np.argmax first.
        return sum(int(x == y) for (x, y) in test_results)

这里是对现有模型的评估，这里非常有意思的是，对于输出出来的结果，我们用argmax函数来求出最大值的下标，
通俗一点的理解就是，在输出这个list中，（可以看成一个list）每一个值代表着对于每一个类的预测，
比如我总共有预测的是三个类。
感觉说不清楚，还是上数据集来说。

image.png

在这个iris数据集中，我的输入是前四个特征，sepl，sepw，petl，petw，然后有三个类别，分别是setosa，versicolor，virginica。这三个类别对应着输出是[1,0,0],[0,1,0],[0,0,1]。所以我们的预测输出大概是长这个样：

image.png

所以我们要取最大概率的下标来跟y计算，当然这里的y也是经过转化的。很好理解。
如果你的 y 不是一个数，而是一个 one-hot 列向量，你也可以在 y 的前面加上 argmax（即用 np.argmax(y) 取出类别下标）再做比较。

    def cost_derivative(self, output_activations, y):
        """Return the vector of partial derivatives dC_x/da for the
        output activations, computed as ``output_activations - y``
        (the gradient of the quadratic cost 0.5 * ||a - y||^2)."""
        return (output_activations-y)

这个代码就很简单了，就是计算真实值与预测值的误差。

我用的是irisdata。
整理完的数据集大概长这样：

image.png

就截取两个数据集吧。
这里的数据集都是列向量。别搞错了。

我用的是每一个类别抽取 30 个数据作为训练集，然后用全部数据作为测试集。

设置的参数：

image.png

预测正确的结果还不错：148
准确率的图像：

image.png

整体过程大概就是这样，神经网络真的很高效说实话，预测的很准。
最后感谢大神写的代码，我也只是拿过来借鉴而已。
地址：https://github.com/MichalDanielDobrzanski/DeepLearningPython35/blob/master/network.py

接下来，要去看其他模型来学习咯！加油！

最后编辑于：2020.03.24 22:18:43

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 206,839评论 6赞 482
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 88,543评论 2赞 382
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 153,116评论 0赞 344
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 55,371评论 1赞 279
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 64,384评论 5赞 374
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 49,111评论 1赞 285
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 38,416评论 3赞 400
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 37,053评论 0赞 259
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 43,558评论 1赞 300
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 36,007评论 2赞 325
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 38,117评论 1赞 334
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 33,756评论 4赞 324
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 39,324评论 3赞 307
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 30,315评论 0赞 19
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 31,539评论 1赞 262
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 45,578评论 2赞 355
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 42,877评论 2赞 345

一文读懂深度学习

拆分代码去理解他的意思！ Let's Go!

x,y的顺序又反过来了。

推荐阅读更多精彩内容