Cannot reduce loss effectively in a RNN using pytorch


I was trying to implement an RNN for generating dinosaur names (a project from a Coursera class) in PyTorch. I think the network is set up correctly, but the model can't reduce the loss effectively no matter how much I change the hyperparameters. The code is below. Does anyone have an idea of what I'm doing wrong?

The training data is here: https://github.com/LRTuladhar/nn/blob/main/dinos.txt

# Package imports
import numpy as np
import copy
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import sklearn.linear_model

import string   # needed for string.ascii_lowercase below

import torch
import torch.nn.functional as F
import torch.nn as nn

import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
%matplotlib inline

torch.manual_seed(22)

lines = []  # create an empty list to store the lines

with open('dinos.txt', 'r') as file:
    for line in file:
        lines.append(line.strip().lower())  # append each line to the list after removing whitespace

maxlen = len(max(lines, key=len))
#print("The longest dino name:", maxlen)

num_lines = len(lines)
#print("Num lines:", len(lines))

#use '.' as the EOL marker
chars = list(string.ascii_lowercase) + list('.')
vocab_size = len(chars)

#dictionaries for indexing
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# Parameters 
input_dim = vocab_size
output_dim = vocab_size #ie 27
seq_len = maxlen   #ie 26 

#Hyperparameters
hidden_dim = 32
n_layers = 1
batch_len = 10
learn_rate = 0.001
clip_limit = 0.1

# Training set and labels
X = torch.empty(0, dtype=torch.long)
Y = torch.empty(0, dtype=torch.long)

for line in lines:
    
    namelen = len(line)

    #convert the chars in the line to ints
    xidvec = torch.tensor( [ char_to_ix[char] for char in line], dtype=torch.long )
    # pad with '.' to fill the rest of the sequence
    pad = "."*(seq_len-namelen)
    padvec = torch.tensor( [ char_to_ix[char] for char in pad], dtype=torch.long )
    xidvec = torch.cat((xidvec, padvec), dim=0)
    #print(xidvec)
    
    #y is x shifted left one step, with a '.' appended at the end
    yidvec = torch.tensor(xidvec[1:].tolist() , dtype=torch.long)
    yidvec = torch.cat( (yidvec, torch.tensor([26], dtype=torch.long)), dim=0)
    #print(yidvec)
    
    #Encode to one hot
    xonehot = F.one_hot(xidvec, vocab_size).type(torch.long)
    yonehot = F.one_hot(yidvec, vocab_size).type(torch.long)
    
    #Add this word to the training set X and labels Y
    X = torch.cat((X, xonehot), dim=0)
    Y = torch.cat((Y, yonehot), dim=0)
    
    
#Reshape it to feed to the dataset
X = X.reshape((-1, seq_len, vocab_size)).float()
Y = Y.reshape((-1, seq_len, vocab_size)).float()
    
# Wrap the tensors in a TensorDataset
dataset = TensorDataset(X, Y)

# Create a DataLoader 
dataloader = DataLoader(dataset, batch_size=batch_len, shuffle=True)


#Define the Class
class DinosNN(nn.Module):

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers):
        super(DinosNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.GRU = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, hidden):
        out, hidden = self.GRU(x, hidden)
        #out shape: [batch_size, seq_len, hidden_dim] (eg 10, 26, 32 for a full batch)

        out = out.reshape(-1, self.hidden_dim)
        #out shape: [batch_size*seq_len, hidden_dim] (eg 260, 32)

        out = self.fc(out)
        #out shape: [batch_size*seq_len, output_dim]

        return out, hidden

# Initialize the model and define the loss function and optimizer
dinomodel = DinosNN(input_dim, hidden_dim, output_dim, n_layers)
criterion = nn.CrossEntropyLoss()
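# (nn.CrossEntropyLoss applies log-softmax internally, so the model returns raw logits)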
optimizer = optim.Adam(dinomodel.parameters(), lr=learn_rate)

print(dinomodel)

numepochs = 100

lossi=[]
stepi=[]
step=0


for n in range(numepochs):
    #print("Epoch;", n)

    for i, (inputs, labels) in enumerate(dataloader):

        hidden = None
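        # (passing None makes nn.GRU start from a zero hidden state for this batch)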
        
        # -- forward pass --
        yout, hidden = dinomodel(inputs, hidden)
        #yout shape: [batchsize*seq_length, output_dim]
        
        yout = yout.view(-1, seq_len, output_dim)
        #print("yout shape:", yout.shape)
        #shape: [ batchsize, seq_length, output_dim]
        
        #labels shape: [batchsize, seq_length, vocab_size] (one-hot floats)
        #print("labels shape:", labels.shape)
        loss = criterion(yout, labels)
        
        if( loss < 2 and  optimizer.param_groups[0]['lr'] == learn_rate ):
            print("reducing learn rate. loss:", loss)
            optimizer.param_groups[0]['lr'] = learn_rate/10
        
        # -- Backward, Clip and Optimize --
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(dinomodel.parameters(), max_norm=clip_limit)
        optimizer.step()

        if(step%10==0):
            stepi.append(step)
            lossi.append(loss.item())
        step+=1       
        
    if(n%10 == 0):
        print("Loss:", loss.item())

plt.plot(stepi,lossi)


The loss starts out around 3 and only goes down to about 2 at best. When I generate dinosaur names from the trained model using a random seed, the generated names are not great. A typical loss graph looks like this:

[loss curve image]

And no matter how much I change the learning rate (anywhere from 1 down to 0.0001), I cannot get the loss to drop further or stabilize.
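For reference, my sampling code is roughly like the sketch below (simplified, not the exact code I run; it picks a random first letter and then samples each following character from the softmax of the model's logits until '.' is drawn):

def sample_name(model, max_len=seq_len):
    model.eval()
    with torch.no_grad():
        hidden = None
        # start from a random first letter (a-z)
        idx = torch.randint(0, 26, (1,)).item()
        name = ix_to_char[idx]
        for _ in range(max_len - 1):
            x = F.one_hot(torch.tensor([[idx]]), vocab_size).float()  # [1, 1, vocab_size]
            logits, hidden = model(x, hidden)                         # logits: [1, output_dim]
            probs = F.softmax(logits[-1], dim=-1)
            idx = torch.multinomial(probs, 1).item()
            if ix_to_char[idx] == '.':
                break
            name += ix_to_char[idx]
    return name

print([sample_name(dinomodel) for _ in range(5)])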
