I am writing an NLP model from scratch in Python, using only NumPy for most of the functions.
import numpy as np
import matplotlib.pyplot as plt

# my loss and activation functions
def relu(x):
    return np.maximum(0, x)

def relu_prime(x):
    return np.where(x > 0, 1, 0)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))

def softmax(x):
    exp = np.exp(x)
    return exp / np.sum(exp, axis=1, keepdims=True)

def softmax_prime(x):
    return softmax(x) * (1 - softmax(x))

def cross_entropy(y, y_hat):
    return -np.sum(y * np.log(y_hat + 1e-8))

def cross_entropy_prime(y, y_hat):
    return y - y_hat

def mse(y, y_hat):
    return np.mean((y - y_hat) ** 2)

def mse_prime(y, y_hat):
    return 2 * (y_hat - y) / y.size
My NLP model adjusts its weights during training using backpropagation. In each layer I store the input vector, the output vectors before and after the activation, and the gradient vectors, so that they are easy to access.
class Layer:
    def __init__(self, n_input, n_neurons, activation=relu, activation_prime=relu_prime):
        self.weights = np.random.randn(n_input, n_neurons)
        self.biases = np.random.randn(n_neurons)
        self.activation = activation
        self.activation_prime = activation_prime

    def forward(self, inputs):
        # keep the input and the pre- and post-activation outputs for the backward pass
        self.inputs = inputs
        self.z = np.dot(inputs, self.weights) + self.biases
        self.output = self.activation(self.z)
        return self.output

    def backward(self, dvalues):
        # gradients w.r.t. the pre-activation output, the inputs, the weights and the biases
        self.dz = dvalues * self.activation_prime(self.z)
        self.dinputs = self.dz.dot(self.weights.T)
        self.dweights = self.inputs.T.dot(self.dz)
        self.dbiases = np.sum(self.dz, axis=0)
        return self.dinputs

    def update(self, learning_rate):
        self.weights -= self.dweights * learning_rate
        self.biases -= self.dbiases * learning_rate
class Model:
    def __init__(self):
        self.layers = []

    def add(self, layer):
        self.layers.append(layer)

    def forward(self, inputs):
        for layer in self.layers:
            inputs = layer.forward(inputs)
        return inputs

    def backward(self, dvalues):
        for layer in reversed(self.layers):
            dvalues = layer.backward(dvalues)

    def update(self, learning_rate):
        for layer in self.layers:
            layer.update(learning_rate)

    def predict(self, inputs):
        return self.forward(inputs)

    def evaluate(self, X, Y):
        predictions = self.predict(X)
        return np.mean(np.argmax(predictions, axis=1) == np.argmax(Y, axis=1))

    def compile(self, loss, loss_prime, learning_rate=0.01):
        self.loss = loss
        self.loss_prime = loss_prime
        self.learning_rate = learning_rate

    def fit(self, X, Y, epochs=100):
        loss = []
        for i in range(epochs):
            outputs = self.forward(X)
            loss.append(self.loss(Y, outputs))
            dvalues = self.loss_prime(Y, outputs)
            self.backward(dvalues)
            self.update(self.learning_rate)
            print(f"Epoch {i}: {loss[-1]}")
        return loss
I use sklearn's iris dataset with my NLP model, where I normalize the input features and one-hot encode the target labels.
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))
Y = iris.target
y = np.zeros((X.shape[0], 3))
y[np.arange(X.shape[0]), Y] = 1
Y = y
So far, the model performs decently, though not particularly well, when I use a sigmoid activation in the output layer and mean squared error (MSE) as the loss function.
model = Model()
model.add(Layer(4, 5))
model.add(Layer(5, 6))
model.add(Layer(6, 3, activation=sigmoid, activation_prime=sigmoid_prime))
model.compile(loss=mse, loss_prime=mse_prime, learning_rate=0.004)
loss = model.fit(X, Y, epochs=20000)
plt.plot(loss)
# Last epoch "Epoch 19999: 0.12373022229717626"
evaluation = model.evaluate(X, Y)
print(evaluation)  # 0.6666666666666666
Link to the loss plot: https://i.stack.imgur.com/to0Sy.png
However, when I try a softmax activation in the output layer with cross-entropy as the loss function, I cannot get good results.
model2 = Model()
model2.add(Layer(4, 5))
model2.add(Layer(5, 6))
model2.add(Layer(6, 3, activation=softmax, activation_prime=softmax_prime))
model2.compile(loss=cross_entropy, loss_prime=cross_entropy_prime, learning_rate=0.00001)
loss = model2.fit(X, Y, epochs=300)
plt.plot(loss)
# Last epoch "Epoch 299: 1112.783115819416"
print(model2.evaluate(X, Y)) # 0.08
Link to the loss plot: https://i.stack.imgur.com/OT7xG.png
I would appreciate it if someone could help me figure out why this is happening and how to fix it. Thanks.
I have read a lot of articles, mostly on Medium and Stack Exchange, drawn the network out on paper, and derived the backpropagation countless times. I have also looked at how other people use softmax and cross-entropy. Thanks to that guidance, my NLP model runs well with sigmoid activation and MSE loss. But now I am stuck trying to get it to run just as well with softmax activation and cross-entropy loss.
1 Answer
Thank you, hobbs, for spotting this. Apparently the problem was simple: the statement in the cross_entropy_prime function was wrong. The incorrect sign made gradient descent move the weights and biases away from the minimum, which is why the loss eventually exploded.
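The corrected function simply swaps the two terms so that the derivative points down the loss surface instead of up it:

def cross_entropy_prime(y, y_hat):
    # corrected sign: the original returned y - y_hat, which pushed every update away from the minimum
    return y_hat - y

With this one-line change the updates descend the loss again, and the softmax/cross-entropy model no longer blows up.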