Python ReLu activation function desn't work

1.9k Views Asked by At

My first neural network was using sigmoid activation function and was working fine. Now I want to switch to more advanced activation function(ReLu). But with ReLu my NN doesn't work at all. 90% of errors, while using sigmoid there were 4% of errors. I can't find bug in code. Help me.

class NeuralNetwork:
    def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate = 0.1):
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate

        self.weights_ih = np.random.normal(0.0, pow(input_nodes, -0.5), (hidden_nodes, input_nodes))
        self.weights_ho = np.random.normal(0.0, pow(hidden_nodes, -0.5), (output_nodes, hidden_nodes))
        self.bias_h = np.random.normal(0.0, pow(1, -0.5), (hidden_nodes, 1))
        self.bias_o = np.random.normal(0.0, pow(1, -0.5), (output_nodes, 1))

    def activation_function(self, x):
        return x * (x > 0)

    def activation_function_d(self, x):
        return 1 * (x >= 0)

    def train(self, inputs_list, targets_list):
        inputs = np.array(inputs_list, ndmin=2).T
        targets = np.array(targets_list, ndmin=2).T

        # Feedforward
        hidden_inputs = np.dot(self.weights_ih, inputs) + self.bias_h
        hidden = self.activation_function(hidden_inputs)
        output_inputs = np.dot(self.weights_ho, hidden) + self.bias_o
        outputs = self.activation_function(output_inputs)

        # Calculate errors
        output_errors = targets - outputs
        hidden_errors = np.dot(self.weights_ho.T, output_errors)

        # Calculate gradients
        output_gradient = output_errors * self.activation_function_d(output_inputs) * self.learning_rate
        hidden_gradient = hidden_errors * self.activation_function_d(hidden_inputs) * self.learning_rate

        # Calculate deltas
        output_deltas = np.dot(output_gradient, hidden.T)
        hidden_deltas = np.dot(hidden_gradient, inputs.T)

        # Adjust weights and biases by deltas and gradients
        self.weights_ho += output_deltas
        self.weights_ih += hidden_deltas
        self.bias_o     += output_gradient
        self.bias_h     += hidden_gradient

    def predict(self, inputs_list):
        inputs = np.array(inputs_list, ndmin=2).T
        hidden = self.activation_function(np.dot(self.weights_ih, inputs) + self.bias_h)
        outputs = self.activation_function(np.dot(self.weights_ho, hidden) + self.bias_o)
        return outputs.flatten().tolist()

And training code:

with open('mnist_train.csv') as train_file:
    for str in train_file:
        data = [int(char) for char in str.split(',')]
        inputs = data[1:]
        targets = [1 if i == data[0] else 0 for i in range(10)]
        nn.train(inputs, targets)
1

There are 1 best solutions below

4
On BEST ANSWER

The last layer should always use sigmoid (in the binary case) regardless of what you are trying to do.

The sigmoid function is used to estimate the probabilities that an example is in a given class, the prediction of an example is the class which the example has the highest probability to be in.

To conclude, change this:

def predict(self, inputs_list):
    inputs = np.array(inputs_list, ndmin=2).T
    hidden = self.activation_function(np.dot(self.weights_ih, inputs) + self.bias_h)
    outputs = self.activation_function(np.dot(self.weights_ho, hidden) + self.bias_o)
    return outputs.flatten().tolist()

to this

def predict(self, inputs_list):
    inputs = np.array(inputs_list, ndmin=2).T
    hidden = self.activation_function(np.dot(self.weights_ih, inputs) + self.bias_h)
    outputs = sigmoid(np.dot(self.weights_ho, hidden) + self.bias_o) // create a sigmoid function
    return outputs.flatten().tolist()

and in the training:

    # Feedforward
    hidden_inputs = np.dot(self.weights_ih, inputs) + self.bias_h
    hidden = self.activation_function(hidden_inputs)
    output_inputs = np.dot(self.weights_ho, hidden) + self.bias_o
    outputs = self.activation_function(output_inputs)

to:

    # Feedforward
    hidden_inputs = np.dot(self.weights_ih, inputs) + self.bias_h
    hidden = self.activation_function(hidden_inputs)
    output_inputs = np.dot(self.weights_ho, hidden) + self.bias_o
    outputs = sigmoid(output_inputs)

and

    # Calculate gradients
    output_gradient = output_errors * self.activation_function_d(output_inputs) * self.learning_rate
    hidden_gradient = hidden_errors * self.activation_function_d(hidden_inputs) * self.learning_rate

to

       # Calculate gradients
    output_gradient = output_errors * sigmoid_d(output_inputs) * self.learning_rate
    hidden_gradient = hidden_errors * self.activation_function_d(hidden_inputs) * self.learning_rate