A working program
# From https://www.kaggle.com/code/freacle/part-1
# The original article https://habr.com/ru/articles/869118/
# contains many code fragments with explanations
import numpy as np

Parameter = None

def ParameterObj():
    class Parameter:
        layers = []
        calling = dict()
        def __init__(self, info):
            Parameter.layers.append(info[0])
            Parameter.calling[info[0]] = info[1:]
    return Parameter
class Module:
    def __init__(self):
        self._constructor_Parameter = ParameterObj()
        global Parameter
        Parameter = self._constructor_Parameter
    def forward(self):
        pass
    def __call__(self, x):
        return self.forward(x)
    def parameters(self):
        return self
class Linear:
    def __init__(self, input_channels: int, output_channels: int, bias=True):
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.bias = bias
        self.backward_list = []
        if bias:
            Parameter([self, np.random.uniform(-0.5, 0.5, size=(self.input_channels, self.output_channels)), np.random.uniform(-0.5, 0.5, size=self.output_channels)])
        else:
            Parameter([self, np.random.uniform(-0.5, 0.5, size=(self.input_channels, self.output_channels)), np.zeros(self.output_channels)])
    def __call__(self, x):
        self.x = np.array(x, copy=True)
        result = x @ Parameter.calling[self][0] + Parameter.calling[self][1]
        return result
    def backward(self, input_matrix):
        # the weights live in Parameter.calling[self][0]; there is no self.weight attribute
        x_gradient = input_matrix @ Parameter.calling[self][0].T
        self.weight_gradient = self.x.T @ input_matrix
        self.bias_gradient = input_matrix.mean(axis=0)
        return x_gradient
class Flatten:
    def __init__(self):
        pass
    def __call__(self, x):
        return x.reshape(1, -1)

class ReLU:
    def __init__(self):
        pass
    def __call__(self, x):
        self.x = x
        return np.maximum(0, x)
    def backward(self, input_matrix):
        return (self.x > 0) * input_matrix

class Softmax():
    def __init__(self):
        pass
    def __call__(self, z):
        return np.exp(z) / np.sum(np.exp(z), axis=1).reshape(-1, 1)
class CrossEntropyLoss:
    def __init__(self):
        self.predicted = None
        self.true = None
    def __call__(self, logits, true):
        predicted = np.exp(logits) / np.sum(np.exp(logits), axis=1).reshape(-1, 1)  # softmax
        self.predicted = np.array(predicted, copy=True)  # keep a copy of the predictions for the backward pass
        self.true = np.array(true, copy=True)  # keep a copy of the targets for the backward pass
        number_of_classes = predicted.shape[1]  # number of classes, 2 in our case
        # compute the loss value directly from the formula
        self.loss = -1 * np.sum(true * np.log(predicted + 1e-5), axis=1)
        return self
    def backward(self):
        loss = self.predicted - self.true
        # Iterate over the layers in reverse order, which works because everything was saved in Parameter.layers
        for index, layer in enumerate(Parameter.layers[::-1]):
            if type(layer).__name__ == 'Linear':
                changes_w = (layer.x.T @ loss) / loss.shape[0]
                # dividing by loss.shape[0] is needed because the size of the update depends on the batch size
                if layer.bias:
                    changes_b = (np.sum(loss) / loss.shape[0])
                else:
                    changes_b = 0
                layer.backward_list = [changes_w, changes_b]
                # propagate the gradient to the layers that come earlier
                loss = loss @ Parameter.calling[layer][0].T
            elif type(layer).__name__ == 'ReLU':
                # note: only Linear layers register themselves in Parameter.layers, so for SimpleNet this branch is never reached
                loss = layer.backward(loss)
class SGD:
    def __init__(self, model, learning_rate):
        self.model = model
        self.lr = learning_rate
    def step(self):
        for index, layer in enumerate(self.model._constructor_Parameter.layers[::-1]):
            if type(layer).__name__ == 'Linear':
                weight, bias = self.model._constructor_Parameter.calling[layer]
                weight_gradient, bias_gradient = layer.backward_list[0], layer.backward_list[1]
                self.model._constructor_Parameter.calling[layer] = [weight - self.lr * weight_gradient, bias - self.lr * bias_gradient]
class SimpleNet(Module):
    def __init__(self):
        super().__init__()
        self.linear1 = Linear(input_channels=25, output_channels=10, bias=True)
        self.linear2 = Linear(input_channels=10, output_channels=2, bias=True)
        self.flatten = Flatten()
        self.relu = ReLU()
        self.softmax = Softmax()
    def forward(self, x):
        x_1 = self.flatten(x)
        x_2 = self.linear1(x_1)
        x_3 = self.relu(x_2)
        x_4 = self.linear2(x_3)
        return x_4
input_x = np.array([[ 0.99197708, -0.77980023, -0.8391331 , -0.41970686,  0.72636492],
                    [ 0.85901409, -0.22374584, -1.95850625, -0.81685145,  0.96359871],
                    [-0.42707937, -0.50053309,  0.34049477,  0.62106931, -0.76039365],
                    [ 0.34206742,  2.15131285,  0.80851759,  0.28673013,  0.84706839],
                    [-1.70231094,  0.36473216,  0.33631525, -0.92515589, -2.57602677]])
target_x = np.array([[1, 0]])

loss_fn = CrossEntropyLoss()
model = SimpleNet()
optim = SGD(model.parameters(), learning_rate=0.01)

for i in range(100):
    output = model(input_x)
    loss = loss_fn(output, target_x)
    loss.backward()
    optim.step()
    if (i % 20) == 0:
        print(loss.loss, i)
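The key step in CrossEntropyLoss.backward is loss = self.predicted - self.true: for softmax followed by cross-entropy, the gradient of the loss with respect to the logits is softmax(z) - y. The small standalone check below (added here for illustration; it is not part of the original program) compares that formula with a central-difference estimate:

import numpy as np

def ce(z, y, eps=1e-12):
    # cross-entropy of softmax(z) against one-hot targets y
    p = np.exp(z) / np.sum(np.exp(z), axis=1).reshape(-1, 1)
    return -np.sum(y * np.log(p + eps))

z = np.array([[0.3, -1.2]])   # logits for one sample with two classes
y = np.array([[1.0, 0.0]])    # one-hot target, same shape as target_x above

analytic = np.exp(z) / np.sum(np.exp(z), axis=1).reshape(-1, 1) - y   # softmax(z) - y
numeric = np.zeros_like(z)
h = 1e-6
for j in range(z.shape[1]):
    dz = np.zeros_like(z)
    dz[0, j] = h
    numeric[0, j] = (ce(z + dz, y) - ce(z - dz, y)) / (2 * h)

print(np.allclose(analytic, numeric, atol=1e-6))   # expected to print True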
To shorten the program above, it was handed over to an AI for optimization. After the AI threw out all the comments and made a few other changes, the text it returned was missing the ending that contains the usage example.
import numpy as np

class Parameter:
    layers = []
    calling = {}
    def __init__(self, info):
        self.layers.append(info[0])
        self.calling[info[0]] = info[1:]

class Module:
    def __init__(self):
        self.Parameter = Parameter
    def forward(self):
        pass
    def __call__(self, x):
        return self.forward(x)
    def parameters(self):
        return self

class Linear(Module):
    def __init__(self, input_channels: int, output_channels: int, bias=True):
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.bias = bias
        self.backward_list = []
        if bias:
            self.Parameter([self, np.random.uniform(-0.5, 0.5, size=(input_channels, output_channels)), np.random.uniform(-0.5, 0.5, size=output_channels)])
        else:
            self.Parameter([self, np.random.uniform(-0.5, 0.5, size=(input_channels, output_channels)), np.zeros(output_channels)])
    def __call__(self, x):
        self.x = np.array(x, copy=True)
        result = x @ self.Parameter.calling[self][0] + self.Parameter.calling[self][1]
        return result
    def backward(self, input_matrix):
        x_gradient = input_matrix @ self.Parameter.calling[self][0].T
        self.backward_list = [input_matrix.T @ self.x, input_matrix.mean(axis=0)]
        return x_gradient

class Flatten(Module):
    def __call__(self, x):
        return x.reshape(x.shape[0], -1)

class ReLU(Module):
    def __call__(self, x):
        self.x = x
        return np.maximum(0, x)
    def backward(self, input_matrix):
        return (self.x > 0) * input_matrix

class Softmax(Module):
    def __call__(self, z):
        exp_z = np.exp(z)
        return exp_z / exp_z.sum(axis=1, keepdims=True)

class CrossEntropyLoss(Module):
    def __call__(self, logits, true):
        self.predicted = self.Softmax()(logits)
        self.true = np.array(true, copy=True)
        self.loss = -np.sum(true * np.log(self.predicted + 1e-5), axis=1)
        return self
    def backward(self):
        loss = self.predicted - self.true
        for layer in self.Parameter.layers[::-1]:
            if isinstance(layer, Linear):
                layer.backward_list[0] = (layer.x.T @ loss) / loss.shape[0]
                if layer.bias:
                    layer.backward_list[1] = np.sum(loss, axis=0) / loss.shape[0]
                loss = loss @ layer.Parameter.calling[layer][0].T
            elif isinstance(layer, ReLU):
                loss = layer.backward(loss)

class SGD:
    def __init__(self, model, learning_rate):
        self.model = model
        self.lr = learning_rate
    def step(self):
        for layer in self.model.Parameter.layers[::-1]:
            if isinstance(layer, Linear):
                weight, bias = layer.Parameter.calling[layer]
                weight_gradient, bias_gradient = layer.backward_list
                layer.Parameter.calling[layer] = [weight - self.lr * weight_gradient, bias - self.lr * bias_gradient]

class SimpleNet(Module):
    def __init__(self):
        super().__init__()
        self.linear1 = Linear(input_channels=25, output_channels=10, bias=True)
        self.linear2 = Linear(input_channels=10, output_channels=2, bias=True)
        self.flatten = Flatten()
        self.relu = ReLU()
        self.softmax = Softmax()
    def forward(self, x):
        x_1 = self.flatten(x)
        x_2 = self.linear1(x_1)
        x_3 = self.relu(x_2)
        x_4 = self.linear2(x_3)
        return x_4
input_x = np.array([[ 0.99197708, -0.77980023, -0.8391331 , -0.41970686,  0.72636492],
                    [ 0.85901409, -0.22374584, -1.95850625, -0.81685145,  0.96359871],
                    [-0.42707937, -0.50053309,  0.34049477,  0.62106931, -0.76039365],
                    [ 0.34206742,  2.15131285,  0.80851759,  0.28673013,  0.84706839],
                    [-1.70231094,  0.36473216,  0.33631525, -0.92515589, -2.57602677]])
target_x = np.array([[1, 0]])

loss_fn = CrossEntropyLoss()
model = SimpleNet()
optim = SGD(model.parameters(), learning_rate=0.01)

for i in range(100):
    output = model(input_x)
    loss = loss_fn(output, target_x)
    loss.backward()
    optim.step()
    if (i % 20) == 0:
        print(loss.loss, i)
The text without the ending ran without errors, but it also produced no output. Adding the original ending back led to errors: the shortened Linear no longer calls Module.__init__, so self.Parameter does not exist when SimpleNet is constructed; even with that fixed, CrossEntropyLoss refers to a non-existent self.Softmax attribute, backward assigns into the empty backward_list, and Flatten now keeps the batch dimension, so the flattened 5x5 input no longer matches the 25 input channels of linear1.
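For reference, here is a minimal repair sketch (my own addition, not part of the experiment above). It assumes the shortened version's Parameter, Module, ReLU, Softmax, SGD and SimpleNet classes are already defined, and only re-defines the pieces that fail, restoring the behaviour of the first program:

import numpy as np

class Linear(Module):
    def __init__(self, input_channels: int, output_channels: int, bias=True):
        super().__init__()           # was missing: without it self.Parameter is never set
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.bias = bias
        self.backward_list = [0, 0]  # was []: assigning into an empty list by index raises IndexError
        weight = np.random.uniform(-0.5, 0.5, size=(input_channels, output_channels))
        bias_vector = np.random.uniform(-0.5, 0.5, size=output_channels) if bias else np.zeros(output_channels)
        self.Parameter([self, weight, bias_vector])
    def __call__(self, x):
        self.x = np.array(x, copy=True)
        return x @ self.Parameter.calling[self][0] + self.Parameter.calling[self][1]

class Flatten(Module):
    def __call__(self, x):
        return x.reshape(1, -1)      # the 5x5 input is a single sample, as in the first program

class CrossEntropyLoss(Module):
    def __call__(self, logits, true):
        self.predicted = Softmax()(logits)   # was self.Softmax(), an attribute that does not exist
        self.true = np.array(true, copy=True)
        self.loss = -np.sum(true * np.log(self.predicted + 1e-5), axis=1)
        return self
    def backward(self):
        loss = self.predicted - self.true
        for layer in self.Parameter.layers[::-1]:
            if isinstance(layer, Linear):
                layer.backward_list[0] = (layer.x.T @ loss) / loss.shape[0]
                if layer.bias:
                    layer.backward_list[1] = np.sum(loss, axis=0) / loss.shape[0]
                loss = loss @ layer.Parameter.calling[layer][0].T
            elif isinstance(layer, ReLU):
                loss = layer.backward(loss)

With these re-definitions in place, the ending from the first program (input_x, target_x, loss_fn, model, optim and the training loop) should run again and print the loss every 20 iterations.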