I was trying to implement an RNN for generating dinosaur names (a project from a Coursera class) in PyTorch. I think the network is set up correctly, but the model can't reduce the loss effectively no matter how much I change the hyperparameters. The code is below. Does anyone have ideas about what I'm doing wrong?
The training data is here: https://github.com/LRTuladhar/nn/blob/main/dinos.txt
# Package imports
import string
import numpy as np
import copy
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import sklearn.linear_model
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
%matplotlib inline

torch.manual_seed(22)
lines = []  # create an empty list to store the names
with open('dinos.txt', 'r') as file:
    for line in file:
        lines.append(line.strip().lower())  # append each line after stripping whitespace and lowercasing
maxlen = len(max(lines, key=len))
#print("The longest dino name:", maxlen)
num_lines = len(lines)
#print("Num lines:", len(lines))
# use '.' as the EOL marker
chars = list(string.ascii_lowercase) + ['.']
vocab_size = len(chars)
#dictionaries for indexing
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }
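# sanity check (illustrative): char_to_ix['a'] == 0, char_to_ix['z'] == 25, char_to_ix['.'] == 26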
# Parameters
input_dim = vocab_size
output_dim = vocab_size #ie 27
seq_len = maxlen #ie 26
#Hyperparameters
hidden_dim = 32
n_layers = 1
batch_len = 10
learn_rate = 0.001
clip_limit = 0.1
# Training set and labels
X = torch.empty(0, dtype=torch.long)
Y = torch.empty(0, dtype=torch.long)
for line in lines:
    namelen = len(line)
    # convert the chars in the line to ints
    xidvec = torch.tensor([char_to_ix[char] for char in line], dtype=torch.long)
    # pad with '.' to fill the rest of the sequence
    pad = "." * (seq_len - namelen)
    padvec = torch.tensor([char_to_ix[char] for char in pad], dtype=torch.long)
    xidvec = torch.cat((xidvec, padvec), dim=0)
    #print(xidvec)
    # y is x shifted left one step, with a '.' appended at the end
    yidvec = torch.cat((xidvec[1:], torch.tensor([char_to_ix['.']], dtype=torch.long)), dim=0)
    #print(yidvec)
    # encode to one-hot (F.one_hot already returns long)
    xonehot = F.one_hot(xidvec, vocab_size)
    yonehot = F.one_hot(yidvec, vocab_size)
    # add this word to the training set X and labels Y
    X = torch.cat((X, xonehot), dim=0)
    Y = torch.cat((Y, yonehot), dim=0)
# Reshape to [num_names, seq_len, vocab_size] to feed to the dataset
X = X.reshape((-1, seq_len, vocab_size)).float()
Y = Y.reshape((-1, seq_len, vocab_size)).float()
# Wrap the tensors in a TensorDataset
dataset = TensorDataset(X, Y)
# Create a DataLoader
dataloader = DataLoader(dataset, batch_size=batch_len, shuffle=True)
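# each batch from the loader: inputs of shape [batch_len, seq_len, vocab_size], labels the same shape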
# Define the model class
class DinosNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers):
        super(DinosNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.GRU = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, hidden):
        out, hidden = self.GRU(x, hidden)
        # out shape: [batch_size, seq_len, hidden_dim], e.g. [10, 26, 32]
        out = out.reshape(-1, self.hidden_dim)
        # now [batch_size * seq_len, hidden_dim], e.g. [260, 32]
        out = self.fc(out)
        # out shape: [batch_size * seq_len, output_dim]
        return out, hidden
# Initialize the model and define the loss function and optimizer
dinomodel = DinosNN(input_dim, hidden_dim, output_dim, n_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(dinomodel.parameters(), lr=learn_rate)
print(dinomodel)
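# expected printout, assuming the parameters above (vocab_size 27, hidden_dim 32):
# DinosNN(
#   (GRU): GRU(27, 32, batch_first=True)
#   (fc): Linear(in_features=32, out_features=27, bias=True)
# )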
numepochs = 100
lossi = []
stepi = []
step = 0
for n in range(numepochs):
    #print("Epoch:", n)
    for i, (inputs, labels) in enumerate(dataloader):
        hidden = None
        # -- forward pass --
        yout, hidden = dinomodel(inputs, hidden)
        # yout shape: [batch_size * seq_len, output_dim]
        yout = yout.view(-1, seq_len, output_dim)
        #print("yout shape:", yout.shape)
        # now [batch_size, seq_len, output_dim]
        # labels.shape should also be [batch_size, seq_len, output_dim]
        #print("labels shape:", labels.shape)
        loss = criterion(yout, labels)
        # drop the learning rate once the loss falls below 2
        if loss < 2 and optimizer.param_groups[0]['lr'] == learn_rate:
            print("reducing learn rate. loss:", loss.item())
            optimizer.param_groups[0]['lr'] = learn_rate / 10
        # -- backward, clip and optimize --
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(dinomodel.parameters(), max_norm=clip_limit)
        optimizer.step()
        if step % 10 == 0:
            stepi.append(step)
            lossi.append(loss.item())
        step += 1
    if n % 10 == 0:
        print("Loss:", loss.item())
plt.plot(stepi, lossi)
The loss starts out around 3 and only ever goes down to about 2. When I generate dinosaur names from a random seed, the results are not great. A typical loss graph looks like this:
No matter how much I change the learning rate (from 1 down to 0.0001), I cannot get it to stabilize.
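For context, the generation step is roughly the following (a minimal sketch, assuming multinomial sampling from the softmax output; sample_name and its internals are illustrative, not the exact code):

# minimal generation sketch (illustrative): feed one character at a time,
# sampling the next character from the softmax distribution
def sample_name(model, max_len=26):
    hidden = None
    ix = torch.randint(0, 26, (1,)).item()  # random seed letter ('a'..'z')
    name = ix_to_char[ix]
    with torch.no_grad():
        for _ in range(max_len - 1):
            x = F.one_hot(torch.tensor([[ix]]), vocab_size).float()  # [1, 1, vocab_size]
            out, hidden = model(x, hidden)                           # out: [1, output_dim]
            probs = F.softmax(out, dim=-1)
            ix = torch.multinomial(probs, num_samples=1).item()
            if ix_to_char[ix] == '.':  # '.' marks the end of the name
                break
            name += ix_to_char[ix]
    return name

print([sample_name(dinomodel) for _ in range(5)])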