Implementation and Understanding of a CNN
0 Preface
As one of the classic machine learning algorithms, the CNN is a foundation of deep learning. I wrote this post to record what I learned while studying it.
1 Setting up the training data and labels
This post assumes the training data has shape [None, X], where None is the number of samples and X is the number of features per sample; labels are one-hot encoded. The MNIST dataset is used. Loading the data:
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
# raw data and one-hot labels (here taken from the MNIST test split)
data = mnist.test.images
labels = mnist.test.labels
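As a quick sanity check of the layout described above (the standard MNIST test split has 10,000 images), the shapes should come out as follows:
print(data.shape)    # (10000, 784): one row of 28*28 pixels per sample
print(labels.shape)  # (10000, 10): one-hot encoded digit labels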
2 Activation functions
Activation functions are indispensable for this algorithm: they turn the linear transformation of each layer into a nonlinear one, which gives the network the capacity to learn more general mappings. This section implements the relu, sigmoid, softmax, and tanh functions.
def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

def relu(x):
    return (np.abs(x) + x) / 2

def tanh(x):
    return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

def softmax(X):
    orig_shape = X.shape
    if len(X.shape) > 1:
        # subtract the row-wise maximum for numerical stability, then normalize each row
        X -= np.max(X, axis=1, keepdims=True)
        a = np.exp(X)
        b = np.sum(np.exp(X), axis=1)
        for i in range(len(b)):
            X[i] = a[i] / b[i]
    else:
        X -= np.max(X, axis=0, keepdims=True)
        X = np.exp(X) / np.sum(np.exp(X), axis=0)
    assert X.shape == orig_shape
    return X
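A minimal usage check (the input values below are arbitrary) confirming the shapes and the row-wise normalization of softmax; note that softmax above modifies its argument in place, hence the copy:
x = np.array([[1.0, 2.0, 3.0],
              [0.5, 0.5, 0.5]])
print(sigmoid(x))                   # elementwise, every value lies in (0, 1)
print(relu(np.array([-1.0, 2.0])))  # negative inputs are clipped to 0
s = softmax(x.copy())
print(s.sum(axis=1))                # each row sums to 1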
3 Forward propagation
First fix the network structure and initialize the weight parameters; the code is as follows:
def weight_bias(layerdims):
    # layerdims[i] is the number of units in layer i; layerdims[0] is the input size
    W = {}
    b = {}
    for i in range(1, len(layerdims)):
        W['W' + str(i)] = np.random.randn(layerdims[i-1], layerdims[i])
        b['b' + str(i)] = np.random.randn(layerdims[i],)
    return W, b
# Forward propagation
def forword(data, Weight, bias, layerdims, activation):
    # map activation names (strings) to the functions defined above
    act_funcs = {'relu': relu, 'sigmoid': sigmoid, 'tanh': tanh, 'softmax': softmax}
    # nonlinear outputs of each layer
    H = {}
    H['H0'] = data
    # linear outputs of each layer
    Z = {}
    for i in range(1, len(layerdims)):
        Z['Z' + str(i)] = np.dot(H['H' + str(i-1)], Weight['W' + str(i)]) + bias['b' + str(i)]
        H['H' + str(i)] = act_funcs[activation[i-1]](Z['Z' + str(i)])
    return Z, H
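A small shape check on random data (the layer sizes below are arbitrary choices for illustration):
np.random.seed(0)
dims = [4, 5, 3]                 # 4 input features, one hidden layer of 5 units, 3 outputs
acts = ['relu', 'softmax']       # one activation name per weight layer
W, b = weight_bias(dims)
Z, H = forword(np.random.randn(10, 4), W, b, dims, acts)
print(H['H2'].shape)             # (10, 3): one row of class scores per sample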
4 Loss function
The cross-entropy loss used here is
$$loss = -\frac{1}{mN}\sum_{i=1}^{N}\sum_{j=1}^{m}\Big[y_{ij}\log \hat{y}_{ij} + (1-y_{ij})\log(1-\hat{y}_{ij})\Big],$$
where m is the number of attributes (output classes) per sample, N is the number of samples, y is the one-hot label, and ŷ is the network output. The implementation:
def loss_function(H, labels):
    lens = len(H)
    n = labels.shape[0]
    m = labels.shape[1]
    H_end = H['H' + str(lens-1)]
    y_ = H_end
    loss = -np.sum(np.sum(labels*np.log(y_) + (1-labels)*np.log(1-y_), axis=1)) / (m*n)
    return loss
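A quick numerical check with toy values (the dictionaries below only need their last entry, since loss_function reads only the final layer's output): the loss should be near zero when the output matches the one-hot labels and around 0.69 for a completely uninformative output.
toy_labels = np.array([[1.0, 0.0], [0.0, 1.0]])
good = {'H0': None, 'H1': None, 'H2': np.array([[0.99, 0.01], [0.01, 0.99]])}
bad  = {'H0': None, 'H1': None, 'H2': np.array([[0.50, 0.50], [0.50, 0.50]])}
print(loss_function(good, toy_labels))  # about 0.01
print(loss_function(bad, toy_labels))   # about 0.69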
5 Backpropagation
For readability, let us first restate the whole forward pass:
$$H_0 = X,\qquad Z_l = H_{l-1}W_l + b_l,\qquad H_l = g_l(Z_l),\quad l = 1,\dots,L,$$
together with the cross-entropy loss defined above.
Note: to keep them readable, some of the formulas below omit the transposes needed when differentiating with respect to matrices; keep this in mind when writing the code.
The first thing to do is differentiate the loss with respect to the output of the last layer. With a sigmoid (or softmax) output layer and the cross-entropy loss, and dropping the constant normalization factor (the code simply averages over the batch), this simplifies to
$$\frac{\partial \mathcal{L}}{\partial Z_3} = H_3 - Y.$$
Why compute this expression? For the chain rule, of course: it is the factor that gets multiplied through every earlier layer. With it in hand, we differentiate with respect to the parameters w and b:
$$\frac{\partial \mathcal{L}}{\partial W_3} = H_2^{\top}\,\frac{\partial \mathcal{L}}{\partial Z_3},\qquad \frac{\partial \mathcal{L}}{\partial b_3} = \sum_{\text{samples}} \frac{\partial \mathcal{L}}{\partial Z_3}.$$
At this point we have the gradients of the two parameters w3 and b3. The remaining layers are essentially the same: apply the chain rule layer by layer towards the input, so let's keep going:
$$\frac{\partial \mathcal{L}}{\partial H_l} = \frac{\partial \mathcal{L}}{\partial Z_{l+1}}\,W_{l+1}^{\top},\qquad \frac{\partial \mathcal{L}}{\partial Z_l} = \frac{\partial \mathcal{L}}{\partial H_l}\odot g_l'(Z_l),\qquad \frac{\partial \mathcal{L}}{\partial W_l} = H_{l-1}^{\top}\,\frac{\partial \mathcal{L}}{\partial Z_l},\qquad \frac{\partial \mathcal{L}}{\partial b_l} = \sum_{\text{samples}} \frac{\partial \mathcal{L}}{\partial Z_l}.$$
The same goes for w1 and b1.
OK, all the backward derivatives are done; at last we can happily get to the code!
# Backpropagation
def backward_propagation(X, labels, weight, bias, H, activation):
    m = X.shape[0]          # number of samples in the batch
    gradients = {}
    L = len(weight)
    ## gradient at the output layer: cross-entropy with sigmoid/softmax output gives H_L - Y
    gradients['dZ' + str(L)] = H['H' + str(L)] - labels
    gradients['dW' + str(L)] = 1./m * np.dot(H['H' + str(L-1)].T, gradients['dZ' + str(L)])
    gradients['db' + str(L)] = 1./m * np.sum(gradients['dZ' + str(L)], axis=0)
    for l in range(L-1, 0, -1):
        gradients['dH' + str(l)] = np.dot(gradients['dZ' + str(l+1)], weight['W' + str(l+1)].T)
        if activation[l-1] == 'relu':
            gradients['dZ' + str(l)] = np.multiply(gradients['dH' + str(l)], np.int64(H['H' + str(l)] > 0))
        elif activation[l-1] == 'tanh':
            gradients['dZ' + str(l)] = np.multiply(gradients['dH' + str(l)], 1 - np.power(H['H' + str(l)], 2))
        elif activation[l-1] == 'sigmoid':
            # sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)) = H * (1 - H)
            gradients['dZ' + str(l)] = np.multiply(gradients['dH' + str(l)], H['H' + str(l)] * (1 - H['H' + str(l)]))
        gradients['dW' + str(l)] = 1./m * np.dot(H['H' + str(l-1)].T, gradients['dZ' + str(l)])
        gradients['db' + str(l)] = 1./m * np.sum(gradients['dZ' + str(l)], axis=0)
    return gradients
The code first computes the loss derivative at the last layer and then works backwards, obtaining the parameter gradients of every earlier layer by recursion. All that remains afterwards is the parameter update; before that, the gradients can be sanity-checked numerically, as sketched below.
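A minimal finite-difference check (my own addition: the helper name check_gradient and the tiny layer sizes are hypothetical, not part of the original post). One detail: loss_function also averages over the number of output classes, which backward_propagation does not, so the analytic gradient is divided by that factor before comparing.
def check_gradient(layerdims, activation, eps=1e-5):
    # compare one entry of the analytic dW1 with a centred finite-difference estimate
    np.random.seed(1)
    X = np.random.randn(5, layerdims[0])
    Y = np.eye(layerdims[-1])[np.random.randint(layerdims[-1], size=5)]  # random one-hot labels
    W, b = weight_bias(layerdims)
    _, H = forword(X, W, b, layerdims, activation)
    grads = backward_propagation(X, Y, W, b, H, activation)
    i, j = 0, 0                                   # check a single entry of W1
    W['W1'][i, j] += eps
    loss_plus = loss_function(forword(X, W, b, layerdims, activation)[1], Y)
    W['W1'][i, j] -= 2 * eps
    loss_minus = loss_function(forword(X, W, b, layerdims, activation)[1], Y)
    W['W1'][i, j] += eps                          # restore the original weight
    numeric = (loss_plus - loss_minus) / (2 * eps)
    # divide by the number of classes: loss_function averages over classes, the backward pass does not
    analytic = grads['dW1'][i, j] / Y.shape[1]
    print(numeric, analytic)                      # the two values should agree to several decimals

check_gradient([3, 4, 2], ['sigmoid', 'sigmoid'])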
# Parameter update
def update_parameters(weight, bias, gradients, lr):
    ## lr is the learning rate: too small and the network converges very slowly,
    ## too large and it may keep oscillating around the minimum without converging
    for i in range(1, len(weight) + 1):
        weight['W' + str(i)] -= lr * gradients['dW' + str(i)]
        bias['b' + str(i)] -= lr * gradients['db' + str(i)]
    return weight, bias
6 Model training
def nn_fit(X, Y, Weight, bias, activation, lr, lambd=0.7, num_iterations=5000, print_cost=[True, 100]):
    ## num_iterations is the number of training iterations; print_cost controls whether and
    ## how often the cost is printed (lambd is reserved for regularization and is unused here)
    ## each iteration runs: forward pass -> compute cost -> compute gradients -> update parameters
    ## note: layerdims is read from the enclosing (global) scope
    for i in range(num_iterations):
        Z, H = forword(X, Weight, bias, layerdims, activation)
        cost = loss_function(H, Y)
        grads = backward_propagation(X, Y, Weight, bias, H, activation)
        Weight, bias = update_parameters(Weight, bias, grads, lr)
        if print_cost[0] and i % print_cost[1] == 0:
            print("Cost after iteration %i: %f" % (i, cost))
    return Weight, bias
7 Model testing
def cls_predict(X, Weight, bias, activation):
    ## outputs greater than 0.5 are treated as class 1
    Z, H = forword(X, Weight, bias, layerdims, activation)
    prediction = (H['H' + str(len(H) - 1)] > 0.5)
    return prediction
8 Example 1:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
import keras
# Prepare the data
def load_data():
    # 6600 training samples and 1200 test samples
    train_X, train_Y = make_circles(n_samples=6600, noise=.02)
    test_X, test_Y = make_circles(n_samples=1200, noise=.02)
    # visualize the training data
    plt.scatter(train_X[:, 0], train_X[:, 1], c=train_Y, s=40, cmap=plt.cm.Spectral)
    train_X = train_X.T
    train_Y = train_Y.reshape((1, train_Y.shape[0]))
    test_X = test_X.T
    test_Y = test_Y.reshape((1, test_Y.shape[0]))
    return train_X, train_Y, test_X, test_Y
train_X, train_Y, test_X, test_Y = load_data()
X = train_X.T
Y = keras.utils.to_categorical(train_Y, 2)[0]
activation = ['relu','relu','sigmoid']
layerdims = [2,18,7,2]
Weight, bias = weight_bias(layerdims)
Weight, bias = nn_fit(X,Y, Weight, bias, activation, lr=0.1, lambd=0.7, num_iterations=5000, print_cost=[True, 100])
X1 = test_X.T
Y1 = keras.utils.to_categorical(test_Y, 2)[0]
prediction = cls_predict(X1, Weight, bias, activation)
accuracy = np.mean((prediction == Y1),dtype=np.float64)
print(accuracy)
9 Example 2:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data',one_hot = True)
train_X, train_Y, test_X, test_Y = mnist.train.images,mnist.train.labels,mnist.validation.images,mnist.validation.labels
activation = ['sigmoid','sigmoid','sigmoid']
layerdims = [784,256,64,10]
Weight, bias = weight_bias(layerdims)
Weight, bias = nn_fit(train_X, train_Y, Weight, bias, activation, lr=0.05, lambd=0.2, num_iterations=2000, print_cost=[True, 50])
prediction = cls_predict(test_X, Weight, bias, activation)
accuracy = np.mean((prediction== test_Y),dtype=np.float64)
print(accuracy)