nan in numpy dot product


Below is a 2-layer neural network implemented in numpy for a fitting (regression) problem. The activation function is ReLU, the training algorithm is Adam, and the loss function is half of the mean squared error. However, when the batch size is large (e.g. 10000), the loss becomes nan after some iterations; the problem does not happen for small batch sizes. Could anyone explain why this may happen? (The data come from a MATLAB workspace: 6_final_mapping_pos.mat.)
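As a first sanity check (a quick sketch; the key names match the loadmat call in the full script below), the raw .mat arrays can be inspected for non-finite entries and extreme magnitudes before training:

import numpy as np
import scipy.io as sio

data = sio.loadmat('6_final_mapping_pos.mat')
for key in ('training_data_pos', 'validation_data_pos'):
    arr = np.asarray(data[key], dtype=np.float64)
    print(key, arr.shape,
          'all finite:', np.isfinite(arr).all(),
          'max abs:', np.abs(arr).max())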

import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
data = sio.loadmat('6_final_mapping_pos.mat')
class NeuralNetwork():
    def __init__(self):
        self.batch_size = 256
        self.input_size = 5           # input dimension is 5
        self.hidden_layer1_size = 50
        self.output_size = 1          # output dimension is 1
        self.train_data = data['training_data_pos']
        self.df_traindata = pd.DataFrame(data=self.train_data)
        self.validation_data_num = 17142
        self.valid_data = data['validation_data_pos']
        self.df_validdata = pd.DataFrame(data=self.valid_data)

        # He-style weight initialization for ReLU
        self.W1 = np.random.randn(self.input_size, self.hidden_layer1_size) / np.sqrt(self.input_size / 2)
        self.W2 = np.random.randn(self.hidden_layer1_size, self.output_size) / np.sqrt(self.hidden_layer1_size / 2)

        # bias initialization
        self.b1 = np.zeros((1, self.hidden_layer1_size))
        self.b2 = np.zeros((1, self.output_size))

        self.lr = 5e-3   # learning rate
        self.reg = 1e-3  # regularization strength
        self.p = 0.5     # keep probability (dropout probability = 1 - p)

        # Adam moment estimates, one pair per parameter, initialized to zero
        self.first_moment_W2 = 0
        self.second_moment_W2 = 0
        self.first_moment_W1 = 0
        self.second_moment_W1 = 0
        self.first_moment_b2 = 0
        self.second_moment_b2 = 0
        self.first_moment_b1 = 0
        self.second_moment_b1 = 0

    def feedforward(self):
        # randomly selected mini-batch as inputs
        self.df_sample_t = self.df_traindata.sample(n=self.batch_size)
        self.train_input = self.df_sample_t[[0, 1, 2, 3, 4]].to_numpy()
        self.train_output = self.df_sample_t[[5]].to_numpy()

        # hidden layer with dropout
        self.hidden_layer1 = np.maximum(0, np.dot(self.train_input, self.W1) + self.b1)
        U1 = np.random.rand(*self.hidden_layer1.shape) < self.p  # drop mask
        self.hidden_layer1 *= U1  # drop!

        self.output_layer = np.dot(self.hidden_layer1, self.W2) + self.b2
        self.data_loss = np.sum(0.5 * (self.output_layer - self.train_output) ** 2) / self.batch_size
        self.reg_loss = 0.5 * self.reg * np.sum(self.W1 * self.W1) + 0.5 * self.reg * np.sum(self.W2 * self.W2)
        self.total_loss = self.data_loss + self.reg_loss

    def backpropagation(self):
        self.d_output = (self.output_layer - self.train_output) / self.batch_size

        # data part
        self.dW2 = np.dot(self.hidden_layer1.T, self.d_output)
        self.db2 = np.sum(self.d_output, axis=0, keepdims=True)
        self.dhidden1 = np.dot(self.d_output, self.W2.T)
        self.dhidden1[self.hidden_layer1 <= 0] = 0  # zero gradient through ReLU-off and dropped units

        self.dW1 = np.dot(self.train_input.T, self.dhidden1)
        self.db1 = np.sum(self.dhidden1, axis=0, keepdims=True)

        # regularization part
        self.dW2 = self.dW2 + self.reg * self.W2
        self.dW1 = self.dW1 + self.reg * self.W1

    def Adam(self, epoch, dW2, dW1, db2, db1):
        beta1 = 0.9
        beta2 = 0.99  # note: the Adam paper's default for beta2 is 0.999

        self.first_moment_W2 = beta1 * self.first_moment_W2 + (1 - beta1) * dW2
        self.second_moment_W2 = beta2 * self.second_moment_W2 + (1 - beta2) * dW2 * dW2
        first_unbias_W2 = self.first_moment_W2 / (1 - beta1 ** epoch)
        second_unbias_W2 = self.second_moment_W2 / (1 - beta2 ** epoch)
        self.W2 -= self.lr * first_unbias_W2 / (np.sqrt(second_unbias_W2) + 1e-7)

        self.first_moment_W1 = beta1 * self.first_moment_W1 + (1 - beta1) * dW1
        self.second_moment_W1 = beta2 * self.second_moment_W1 + (1 - beta2) * dW1 * dW1
        first_unbias_W1 = self.first_moment_W1 / (1 - beta1 ** epoch)
        second_unbias_W1 = self.second_moment_W1 / (1 - beta2 ** epoch)
        self.W1 -= self.lr * first_unbias_W1 / (np.sqrt(second_unbias_W1) + 1e-7)

        self.first_moment_b2 = beta1 * self.first_moment_b2 + (1 - beta1) * db2
        self.second_moment_b2 = beta2 * self.second_moment_b2 + (1 - beta2) * db2 * db2
        first_unbias_b2 = self.first_moment_b2 / (1 - beta1 ** epoch)
        second_unbias_b2 = self.second_moment_b2 / (1 - beta2 ** epoch)
        self.b2 -= self.lr * first_unbias_b2 / (np.sqrt(second_unbias_b2) + 1e-7)

        self.first_moment_b1 = beta1 * self.first_moment_b1 + (1 - beta1) * db1
        self.second_moment_b1 = beta2 * self.second_moment_b1 + (1 - beta2) * db1 * db1
        first_unbias_b1 = self.first_moment_b1 / (1 - beta1 ** epoch)
        second_unbias_b1 = self.second_moment_b1 / (1 - beta2 ** epoch)
        self.b1 -= self.lr * first_unbias_b1 / (np.sqrt(second_unbias_b1) + 1e-7)

    def validation(self):
        self.df_sample_v = self.df_validdata.sample(n=self.validation_data_num)
        self.valid_input = self.df_sample_v[[0, 1, 2, 3, 4]].to_numpy()
        self.valid_output = self.df_sample_v[[5]].to_numpy()
        # scale activations by the keep probability p at test time
        self.hidden_layer1 = np.maximum(0, np.dot(self.valid_input, self.W1) + self.b1) * self.p
        self.output_layer = np.dot(self.hidden_layer1, self.W2) + self.b2
        self.data_loss = np.sum(0.5 * (self.output_layer - self.valid_output) ** 2) / self.validation_data_num
        self.reg_loss = 0.5 * self.reg * np.sum(self.W1 * self.W1) + 0.5 * self.reg * np.sum(self.W2 * self.W2)
        self.total_loss = self.data_loss + self.reg_loss
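# For reference: validation() above uses the "vanilla dropout" convention,
# where a unit is kept with probability p during training and activations are
# scaled by p at test time. The equivalent "inverted dropout" form scales by
# 1/p at training time instead, so no test-time scaling is needed. A minimal
# standalone sketch (the *_demo names are illustrative, not part of the class):
p_demo = 0.5                                   # keep probability
h_demo = np.maximum(0, np.random.randn(4, 3))  # stand-in hidden activations
mask_demo = (np.random.rand(*h_demo.shape) < p_demo) / p_demo  # mask with 1/p built in
h_train_demo = h_demo * mask_demo  # scaled during training...
h_test_demo = h_demo               # ...so test-time activations are unchanged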


NN = NeuralNetwork()
num_iterations = 120

training_loss = np.array([])
validation_loss = np.array([])
validation_dataloss = np.array([])
t = 1
T = np.arange(1, num_iterations).reshape(-1, 1)

# Training and validation
while t < num_iterations:
    NN.feedforward()
    NN.backpropagation()
    NN.Adam(t, NN.dW2, NN.dW1, NN.db2, NN.db1)
    training_loss = np.append(training_loss, NN.total_loss)
    if t % 10 == 0:
        print("training: total loss = %f, data loss = %f, regularization loss = %f" % (NN.total_loss, NN.data_loss, NN.reg_loss))
    NN.validation()
    validation_loss = np.append(validation_loss, NN.total_loss)
    validation_dataloss = np.append(validation_dataloss, NN.data_loss)
    if t % 10 == 0:
        print("validation: total loss = %f, data loss = %f, regularization loss = %f" % (NN.total_loss, NN.data_loss, NN.reg_loss))
    t += 1
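To localize where the non-finite values first enter, a diagnostic variant of the training loop can be run (a sketch, assuming the NeuralNetwork class above; NN_dbg is just an illustrative name, and np.seterr turns silent floating-point overflow into an immediate exception):

np.seterr(over='raise', invalid='raise')  # overflow / invalid ops now raise
NN_dbg = NeuralNetwork()                  # fresh instance for the diagnostic run
t = 1
while t < num_iterations:
    NN_dbg.feedforward()
    NN_dbg.backpropagation()
    for name, arr in (('loss', NN_dbg.total_loss),
                      ('dW1', NN_dbg.dW1), ('dW2', NN_dbg.dW2)):
        if not np.isfinite(np.asarray(arr)).all():
            raise FloatingPointError('%s became non-finite at iteration %d' % (name, t))
    NN_dbg.Adam(t, NN_dbg.dW2, NN_dbg.dW1, NN_dbg.db2, NN_dbg.db1)
    t += 1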