I am writing a neural network to identify digits from the MNIST database. It's primarily based on this code https://github.com/SebLague/Neural-Network-Experiments
My neural network appears to have a problem with backpropagation. While it succeeds at learning certain digits, such as 0, it fails on the others. I've plotted the model's accuracy on some test data, and it seems to get stuck around 40% while also fluctuating a lot. It looks something like this:
[Graph of accuracy vs. pass #]
Here is the layer code (Layer.py), followed by the network and training code:
import numpy as np


def ReLU_calculate(inputs):
    return np.maximum(0, inputs)


def Softmax_calculate(inputs):
    exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
    probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
    return probabilities


def ReLU_derivative(input):
    if input > 0:
        return 1
    return 0


def Softmax_derivative(inputs, index):
    expSum = 0
    for input in inputs:
        expSum = expSum + np.exp(input)
    ex = np.exp(inputs[index])
    return (ex * expSum - ex * ex) / (expSum * expSum)


def cost_derivative(output, y):
    if output == 0 or output == 1:
        return 0
    else:
        return (-1 * output + y) / (output * (output - 1))
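

# (For reference, what the two derivative helpers above compute:
#  cost_derivative(a, y) returns (a - y) / (a * (1 - a)), i.e. d/da of the per-node
#  cost -[y*ln(a) + (1 - y)*ln(1 - a)], forced to 0 when a is exactly 0 or 1;
#  Softmax_derivative(z, i) returns s_i * (1 - s_i), the diagonal entry of the
#  softmax Jacobian for s = softmax(z).)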
class Layer:
    def __init__(self, numNodesIn, numNodes):
        self.weights = 0.1 * np.random.randn(numNodesIn, numNodes)
        self.biases = np.zeros((1, numNodes))
        self.inputs = None
        self.weighted_inputs = None
        self.activations = None
        self.nodes_in = numNodesIn
        self.nodes_out = numNodes
        self.cost_gradientW = None
        self.cost_gradientB = None

    def forward(self, inputs):
        self.inputs = inputs
        self.weighted_inputs = np.dot(inputs, self.weights) + self.biases

    def hidden_activation(self):
        self.activations = ReLU_calculate(self.weighted_inputs)

    def output_activation(self):
        self.activations = Softmax_calculate(self.weighted_inputs)

    def CalculateOutputLayerNodeValues(self, data, expectedOutputs):
        nodeValues = []
        for i in range(data.batch_size):
            current_nodeValues = []
            for x in range(len(expectedOutputs[i])):
                costDerivative = cost_derivative(self.activations[i][x], expectedOutputs[i][x])
                activationDerivative = Softmax_derivative(self.weighted_inputs[i], x)
                current_nodeValues.append(costDerivative * activationDerivative)
            nodeValues.append(current_nodeValues)
        return nodeValues

    def UpdateGradients(self, data, nodeValues):
        cost_gradientW = [[0 for x in range(self.nodes_out)] for j in range(self.nodes_in)]
        cost_gradientB = [0 for x in range(self.nodes_out)]
        for i in range(data.batch_size):
            for nodeOut in range(self.nodes_out):
                nodeValue = nodeValues[i][nodeOut]
                for nodeIn in range(self.nodes_in):
                    derivativeCostWrtWeight = self.inputs[i][nodeIn] * nodeValue
                    cost_gradientW[nodeIn][nodeOut] += derivativeCostWrtWeight
                derivativeCostWrtBias = 1 * nodeValues[i][nodeOut]
                cost_gradientB[nodeOut] += derivativeCostWrtBias
        self.cost_gradientW = cost_gradientW
        self.cost_gradientB = cost_gradientB

    def CalculateHiddenLayerNodeValues(self, data, oldLayer, oldNodeValues):
        newNodeValues = []
        for batch in range(data.batch_size):
            smallerNodeValues = []
            for i in range(self.nodes_out):
                newNodeValue = 0
                for j in range(len(oldNodeValues)):
                    newNodeValue += oldLayer.weights[i][j] * oldNodeValues[batch][j]
                newNodeValue = newNodeValue * ReLU_derivative(self.weighted_inputs[batch][i])
                smallerNodeValues.append(newNodeValue)
            newNodeValues.append(smallerNodeValues)
        return newNodeValues

    def ApplyGradients(self, learnRate):
        for nodeOut in range(self.nodes_out):
            for nodeIn in range(self.nodes_in):
                self.weights[nodeIn][nodeOut] -= self.cost_gradientW[nodeIn][nodeOut] * learnRate
            self.biases[0][nodeOut] -= self.cost_gradientB[nodeOut] * learnRate
        self.cost_gradientB = None
        self.cost_gradientW = None
Network code:
import numpy as np
import matplotlib.pyplot as plt
from Layer import Layer
from keras.datasets import mnist


class Loss:
    def calculate(self, output, y):
        sample_losses = self.forward(output, y)
        data_loss = np.mean(sample_losses)
        return data_loss


class Loss_CategoricalCrossentropy(Loss):
    def forward(self, y_pred, y_true):
        samples = len(y_pred)
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods
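

# (For reference: forward above returns -ln(p_correct) for each sample, i.e. the
#  categorical cross-entropy L = -sum_i y_i * ln(p_i) for one-hot targets, which
#  Loss.calculate then averages over the batch.)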
class Data:
    def __init__(self):
        self.batch_size = 0
        self.inputs = []
        self.activations = []
        self.weighted_inputs = []
        self.nodeValues = []


class Network:
    def __init__(self, size):
        self.length = len(size) - 1
        self.network = []
        self.data = Data()
        for i in range(len(size) - 1):
            layer = Layer(size[i], size[i + 1])
            self.network.append(layer)
        # assuming that there must be at least 2 layers in the network

    def forward(self, X):
        # create new data object to store data for the current pass
        self.data = Data()
        self.data.batch_size = len(X)
        # set first output to the input values passed into forward
        current_output = X
        self.data.activations.append(current_output)
        # pass forward through the neural network
        for i in range(self.length - 1):
            self.network[i].forward(current_output)
            self.data.weighted_inputs.append(self.network[i].weighted_inputs)
            self.network[i].hidden_activation()
            current_output = self.network[i].activations
            self.data.activations.append(current_output)
        self.network[self.length - 1].forward(current_output)
        self.data.weighted_inputs.append(self.network[self.length - 1].weighted_inputs)
        self.network[self.length - 1].output_activation()
        final_output = self.network[self.length - 1].activations
        self.data.activations.append(final_output)
        return final_output

    def UpdateAllGradients(self, inputs, expectedOutputs, learnRate):
        self.forward(inputs)
        outputLayer = self.network[self.length - 1]
        nodeValues = outputLayer.CalculateOutputLayerNodeValues(self.data, expectedOutputs)
        outputLayer.UpdateGradients(self.data, nodeValues)
        for i in reversed(range(self.length - 1)):
            hiddenLayer = self.network[i]
            nodeValues = hiddenLayer.CalculateHiddenLayerNodeValues(self.data, self.network[i + 1], nodeValues)
            hiddenLayer.UpdateGradients(self.data, nodeValues)
        for layer in network.network:
            layer.ApplyGradients(learnRate)

    def test_accuracy(self, inputs, expected):
        total = 0
        size = len(inputs)
        inputs_flat = []
        for x in range(len(inputs)):
            inputs_flat.append(inputs[x].flatten())
        outputs = self.forward(inputs_flat)
        for sample in range(len(outputs)):
            index = np.where(outputs[sample] == max(outputs[sample]))[0][0]
            if index == expected[sample]:
                total += 1
        return total / size


(train_X, y_train), (test_X, y_test) = mnist.load_data()
print('X_train: ' + str(train_X.shape))
print('Y_train: ' + str(y_train.shape))
print('X_test: ' + str(test_X.shape))
print('Y_test: ' + str(y_test.shape))

x_train = train_X.astype("float32") / 255
x_test = test_X.astype("float32") / 255

network = Network([784, 50, 16, 10])
cost_function = Loss_CategoricalCrossentropy()
costs = []
accuracy = []

for i in range(10000):
    print("Pass: ", i, "\n")
    inputs = [x_train[i].flatten()]
    outputs = np.zeros((1, 10))
    outputs[0][y_train[i]] = 1
    network.UpdateAllGradients(inputs, outputs, 0.1)
    print(np.array(network.data.activations[-1]), outputs)
    accuracy.append(network.test_accuracy(x_test, y_test))
    costs.append(cost_function.calculate(np.array(network.data.activations[-1]), outputs))
    print("\n")

plt.plot(accuracy)
plt.show()
I'm not exactly sure what is going wrong with the backpropagation; any input is appreciated.
So far, I've gone over the code looking for errors and tried changing details of the network, such as the number and size of the layers. I've also tried modifying the random weight initialization, but got similar results.
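One thing I'm planning to try next is a numerical gradient check, to see whether the analytic gradient from UpdateGradients matches a finite-difference estimate of the cross-entropy loss for a single weight. This is just a rough sketch of the idea (check_net, eps, etc. are names I made up here, and it assumes the snippet sits at the bottom of the network file so the classes and the MNIST arrays are in scope):

# Gradient check for one output-layer weight: analytic vs. central finite difference
eps = 1e-5
check_net = Network([784, 50, 16, 10])
loss_fn = Loss_CategoricalCrossentropy()

sample = [x_train[0].flatten()]
target = np.zeros((1, 10))
target[0][y_train[0]] = 1

# analytic gradient: run the backward-pass pieces without applying the update
check_net.forward(sample)
out_layer = check_net.network[check_net.length - 1]
node_values = out_layer.CalculateOutputLayerNodeValues(check_net.data, target)
out_layer.UpdateGradients(check_net.data, node_values)
analytic = out_layer.cost_gradientW[0][0]

# numerical gradient: nudge the same weight up and down and difference the loss
out_layer.weights[0][0] += eps
loss_plus = loss_fn.calculate(check_net.forward(sample), target)
out_layer.weights[0][0] -= 2 * eps
loss_minus = loss_fn.calculate(check_net.forward(sample), target)
out_layer.weights[0][0] += eps  # restore the original weight
numerical = (loss_plus - loss_minus) / (2 * eps)

print("analytic:", analytic, "numerical:", numerical)

(I haven't run this yet, so it may need tweaking.)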