Below is a piece of code that implements a 2-layer neural network for a fitting (regression) problem in NumPy. The activation function is ReLU, the training algorithm is Adam, and the loss function is half of the mean squared error. However, when the batch size is large (e.g. 10000), the loss becomes NaN after some iterations; the problem does not occur for small batch sizes. Could anyone help me explain why this may happen? (The data are loaded from a MATLAB workspace file: 6_final_mapping_pos.mat.)
#import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# The file-level ``import scipy.io as sio`` is commented out, so ``sio`` would
# be undefined here (NameError); bind it before use.
import scipy.io as sio

# Load the MATLAB workspace file. The rest of the script reads the
# 'training_data_pos' and 'validation_data_pos' entries of this mapping.
data = sio.loadmat('6_final_mapping_pos.mat')
class NeuralNetwork():
    """Two-layer fully connected regression network trained with Adam.

    Layout: 5 inputs -> 50 ReLU hidden units (with dropout) -> 1 linear
    output. The loss is half the mean squared error over the batch plus an
    L2 penalty on both weight matrices.

    Every row of the training / validation arrays is read as five input
    columns (labels 0-4) followed by one target column (label 5).
    """

    # Column labels used to split a sampled row into inputs and target.
    _INPUT_COLUMNS = [0, 1, 2, 3, 4]
    _TARGET_COLUMNS = [5]

    # Adam hyper-parameters.
    _BETA1 = 0.9
    _BETA2 = 0.99
    _EPSILON = 1e-7

    def __init__(self, dataset=None):
        """Set up data frames, parameters and optimiser state.

        Args:
            dataset: mapping that provides the arrays 'training_data_pos' and
                'validation_data_pos'. When omitted, the module-level ``data``
                mapping is used, which is what the original code always did.
        """
        if dataset is None:
            dataset = data

        self.batch_size = 256
        self.input_size = 5  # input dimension is 5
        self.hidden_layer1_size = 50
        self.output_size = 1  # output dimension is 1

        self.train_data = dataset['training_data_pos']
        self.df_traindata = pd.DataFrame(data=self.train_data)
        self.valid_data = dataset['validation_data_pos']
        self.df_validdata = pd.DataFrame(data=self.valid_data)
        # Never ask for more validation rows than exist: sampling without
        # replacement raises ValueError when n exceeds the population.
        self.validation_data_num = min(17142, len(self.valid_data))

        # weight initialization for ReLU: N(0, 1) scaled by sqrt(2 / fan_in)
        self.W1 = (np.random.randn(self.input_size, self.hidden_layer1_size)
                   / np.sqrt(self.input_size / 2))
        self.W2 = (np.random.randn(self.hidden_layer1_size, self.output_size)
                   / np.sqrt(self.hidden_layer1_size / 2))
        # bias initialization
        self.b1 = np.zeros((1, self.hidden_layer1_size))
        self.b2 = np.zeros((1, self.output_size))

        self.lr = 5e-3   # learning rate
        self.reg = 1e-3  # regularization strength
        self.p = 0.5     # probability of KEEPING a hidden unit (drop = 1 - p)

        # Adam running moments. They start as scalar 0 and become arrays of
        # the parameter's shape after the first update (via broadcasting).
        # The *_W3 / *_b3 entries are not used by any method of this class;
        # they are kept so existing attribute access keeps working.
        for name in ('W3', 'W2', 'W1', 'b3', 'b2', 'b1'):
            setattr(self, 'first_moment_' + name, 0)
            setattr(self, 'second_moment_' + name, 0)

    @staticmethod
    def _half_mse(prediction, target, count):
        """Return sum(0.5 * (prediction - target)**2) / count."""
        return np.sum(0.5 * (prediction - target) ** 2) / count

    def _l2_penalty(self):
        """Return the L2 regularization loss over W1 and W2."""
        return (0.5 * self.reg * np.sum(self.W1 * self.W1)
                + 0.5 * self.reg * np.sum(self.W2 * self.W2))

    def feedforward(self):
        """Sample a mini-batch, run the forward pass and compute the losses.

        Stores the batch (``train_input``, ``train_output``), the activations
        (``hidden_layer1``, ``output_layer``) and ``data_loss``,
        ``reg_loss``, ``total_loss`` on the instance.
        """
        # randomly selected mini-batch as inputs
        self.df_sample_t = self.df_traindata.sample(n=self.batch_size)
        # DataFrame.as_matrix() was removed in pandas 1.0; select the columns
        # by label and convert with to_numpy() instead.
        self.train_input = self.df_sample_t[self._INPUT_COLUMNS].to_numpy()
        self.train_output = self.df_sample_t[self._TARGET_COLUMNS].to_numpy()

        # hidden layer with dropout
        self.hidden_layer1 = np.maximum(
            0, np.dot(self.train_input, self.W1) + self.b1)
        keep_mask = np.random.rand(*self.hidden_layer1.shape) < self.p
        self.hidden_layer1 *= keep_mask  # dropped units become exactly 0

        self.output_layer = np.dot(self.hidden_layer1, self.W2) + self.b2
        self.data_loss = self._half_mse(
            self.output_layer, self.train_output, self.batch_size)
        self.reg_loss = self._l2_penalty()
        self.total_loss = self.data_loss + self.reg_loss

    def backpropagation(self):
        """Compute dW1, db1, dW2, db2 for the batch of the last feedforward."""
        self.d_output = (self.output_layer - self.train_output) / self.batch_size
        # data part
        self.dW2 = np.dot(self.hidden_layer1.T, self.d_output)
        self.db2 = np.sum(self.d_output, axis=0, keepdims=True)
        self.dhidden1 = np.dot(self.d_output, self.W2.T)
        # hidden_layer1 is 0 both where ReLU was inactive and where the unit
        # was dropped, so this single mask blocks the gradient for both.
        self.dhidden1[self.hidden_layer1 <= 0] = 0
        self.dW1 = np.dot(self.train_input.T, self.dhidden1)
        self.db1 = np.sum(self.dhidden1, axis=0, keepdims=True)
        # regularization part
        self.dW2 = self.dW2 + self.reg * self.W2
        self.dW1 = self.dW1 + self.reg * self.W1

    def _adam_step(self, name, grad, epoch):
        """Apply one bias-corrected Adam update to the parameter ``name``."""
        first_attr = 'first_moment_' + name
        second_attr = 'second_moment_' + name
        first = (self._BETA1 * getattr(self, first_attr)
                 + (1 - self._BETA1) * grad)
        second = (self._BETA2 * getattr(self, second_attr)
                  + (1 - self._BETA2) * grad * grad)
        setattr(self, first_attr, first)
        setattr(self, second_attr, second)

        first_unbias = first / (1 - self._BETA1 ** epoch)
        second_unbias = second / (1 - self._BETA2 ** epoch)
        param = getattr(self, name)
        # in-place update of the parameter array
        param -= self.lr * first_unbias / (np.sqrt(second_unbias) + self._EPSILON)

    def Adam(self, epoch, dW2, dW1, db2, db1):
        """Update W2, W1, b2 and b1 with one Adam step.

        Args:
            epoch: 1-based step counter used for the bias correction.
            dW2, dW1, db2, db1: gradients of the loss w.r.t. each parameter.

        Raises:
            ValueError: if ``epoch`` is smaller than 1. The bias-correction
                denominators ``1 - beta ** epoch`` are zero at ``epoch == 0``,
                which would otherwise fill the parameters with inf/NaN.
        """
        if epoch < 1:
            raise ValueError("epoch must be >= 1 for Adam bias correction")
        self._adam_step('W2', dW2, epoch)
        self._adam_step('W1', dW1, epoch)
        self._adam_step('b2', db2, epoch)
        self._adam_step('b1', db1, epoch)

    def validation(self):
        """Evaluate the current parameters on the validation set.

        Overwrites ``hidden_layer1``, ``output_layer``, ``data_loss``,
        ``reg_loss`` and ``total_loss`` with the validation results.
        """
        self.df_sample_v = self.df_validdata.sample(n=self.validation_data_num)
        self.valid_input = self.df_sample_v[self._INPUT_COLUMNS].to_numpy()
        self.valid_output = self.df_sample_v[self._TARGET_COLUMNS].to_numpy()
        # No units are dropped here; activations are scaled by the keep
        # probability instead.
        self.hidden_layer1 = np.maximum(
            0, np.dot(self.valid_input, self.W1) + self.b1) * self.p
        self.output_layer = np.dot(self.hidden_layer1, self.W2) + self.b2
        self.data_loss = self._half_mse(
            self.output_layer, self.valid_output, self.validation_data_num)
        self.reg_loss = self._l2_penalty()
        self.total_loss = self.data_loss + self.reg_loss
NN = NeuralNetwork()
num_iterations = 120

# Loss histories; each one grows by a single entry per loop iteration.
training_loss = np.array([])
validation_loss = np.array([])
validation_dataloss = np.array([])

# Column vector holding 1 .. num_iterations - 1 (one value per iteration).
# NOTE(review): not used in the visible part of the script - presumably an
# x-axis for plotting further down; confirm.
T = np.array([range(1, num_iterations)]).T
t = 1

# Shared layout of the progress lines printed every 10th iteration.
loss_format = "total loss = %f, data loss = %f, regularization loss = %f"

# Training and validation
while t < num_iterations:
    report_now = (t % 10 == 0)

    # One optimisation step on a freshly sampled mini-batch.
    NN.feedforward()
    NN.backpropagation()
    NN.Adam(t, NN.dW2, NN.dW1, NN.db2, NN.db1)
    training_loss = np.append(training_loss, NN.total_loss)
    if report_now:
        print("training:" + loss_format % (NN.total_loss, NN.data_loss, NN.reg_loss))

    # validation() overwrites NN.total_loss / data_loss / reg_loss, so the
    # training values had to be recorded before this call.
    NN.validation()
    validation_loss = np.append(validation_loss, NN.total_loss)
    validation_dataloss = np.append(validation_dataloss, NN.data_loss)
    if report_now:
        print("validation:" + loss_format % (NN.total_loss, NN.data_loss, NN.reg_loss))

    t += 1