I generated and trained a transformer model Using the following code
from tempfile import TemporaryDirectory
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import pandas as pd
import tiktoken
import copy
import os
#todo fix cuda dependency later
class MultiHeadAttention(nn.Module):
def __init__(self, d_model, num_heads):
super(MultiHeadAttention, self).__init__()
assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
self.d_model = d_model
self.num_heads = num_heads
self.d_k = d_model // num_heads
self.W_q = nn.Linear(d_model, d_model)
self.W_k = nn.Linear(d_model, d_model)
self.W_v = nn.Linear(d_model, d_model)
self.W_o = nn.Linear(d_model, d_model)
def scaled_dot_product_attention(self, Q, K, V, mask=None):
attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
if mask is not None:
attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
attn_probs = torch.softmax(attn_scores, dim=-1)
output = torch.matmul(attn_probs, V)
return output
def split_heads(self, x):
batch_size, seq_length, d_model = x.size()
return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
def combine_heads(self, x):
batch_size, _, seq_length, d_k = x.size()
return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
def forward(self, Q, K, V, mask=None):
Q = self.split_heads(self.W_q(Q))
K = self.split_heads(self.W_k(K))
V = self.split_heads(self.W_v(V))
attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
output = self.W_o(self.combine_heads(attn_output))
return output
class PositionWiseFeedForward(nn.Module):
def __init__(self, d_model, d_ff):
super(PositionWiseFeedForward, self).__init__()
self.fc1 = nn.Linear(d_model, d_ff)
self.fc2 = nn.Linear(d_ff, d_model)
self.relu = nn.ReLU()
def forward(self, x):
return self.fc2(self.relu(self.fc1(x)))
class PositionalEncoding(nn.Module):
def __init__(self, d_model, max_seq_length):
super(PositionalEncoding, self).__init__()
pe = torch.zeros(max_seq_length, d_model)
position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
self.register_buffer('pe', pe.unsqueeze(0))
def forward(self, x):
return x + self.pe[:, :x.size(1)]
class EncoderLayer(nn.Module):
def __init__(self, d_model, num_heads, d_ff, dropout):
super(EncoderLayer, self).__init__()
self.self_attn = MultiHeadAttention(d_model, num_heads)
self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask):
attn_output = self.self_attn(x, x, x, mask)
x = self.norm1(x + self.dropout(attn_output))
ff_output = self.feed_forward(x)
x = self.norm2(x + self.dropout(ff_output))
return x
class DecoderLayer(nn.Module):
def __init__(self, d_model, num_heads, d_ff, dropout):
super(DecoderLayer, self).__init__()
self.self_attn = MultiHeadAttention(d_model, num_heads)
self.cross_attn = MultiHeadAttention(d_model, num_heads)
self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x, enc_output, src_mask, tgt_mask):
attn_output = self.self_attn(x, x, x, tgt_mask)
x = self.norm1(x + self.dropout(attn_output))
attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
x = self.norm2(x + self.dropout(attn_output))
ff_output = self.feed_forward(x)
x = self.norm3(x + self.dropout(ff_output))
return x
class Transformer(nn.Module):
def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
super(Transformer, self).__init__()
self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
self.fc = nn.Linear(d_model, tgt_vocab_size)
self.dropout = nn.Dropout(dropout)
def generate_mask(self, src, tgt):
src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
seq_length = tgt.size(1)
nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool().to('cuda')
tgt_mask = tgt_mask.to('cuda') & nopeak_mask.to('cuda')
return src_mask, tgt_mask
def forward(self, src, tgt):
src_mask, tgt_mask = self.generate_mask(src.to('cuda'), tgt.to('cuda'))
src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))
enc_output = src_embedded
for enc_layer in self.encoder_layers:
enc_output = enc_layer(enc_output, src_mask)
dec_output = tgt_embedded
for dec_layer in self.decoder_layers:
dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
output = self.fc(dec_output)
return output
src_vocab_size = 100160
tgt_vocab_size = 79258
d_model = 50
num_heads = 5
num_layers = 3
d_ff = 2048
max_seq_length = 100
dropout = 0.1
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)
transformer.to('cuda')
enctype = "cl100k_base"
encoding = tiktoken.get_encoding(enctype)
df = pd.read_csv('filepath', sep='\t', header = None)
df.columns = ['Column1', 'Column2', 'Column3', 'Column4']
df['Column2'] = df['Column2'].str.cat(df['Column3'], sep=' ', na_rep='')
#print(df.iloc[886, 0])
#print(df.iloc[886, 1])
#print(df.iloc[886, 2])
#src_data = torch.randint(1, 50, (64, 50))
#print(src_data)
source_list = [(encoding.encode(text)) for text in df['Column1']]
trgt_list = [(encoding.encode(text)) for text in df['Column2']]
source_tensors = []
for sublist in source_list:
temptensor = torch.tensor(sublist[:16]) # Truncate sublist to length 16
temptensor.to('cuda')
if len(temptensor) < 16:
padded_tensor = torch.zeros(16) # Create a tensor of zeros
padded_tensor.to('cuda')
padded_tensor[:len(temptensor)] = temptensor # Copy sublist elements into the padded tensor
temptensor = padded_tensor
source_tensors.append(temptensor)
target_tensors = []
for slis in trgt_list:
temporarytens = torch.tensor(slis[:16]) # Truncate sublist to length 16
temporarytens.to('cuda')
if len(temporarytens) < 16:
padded_tensy = torch.zeros(16) # Create a tensor of zeros
padded_tensy.to('cuda')
padded_tensy[:len(temporarytens)] = temporarytens # Copy sublist elements into the padded tensor
temporarytens = padded_tensy
target_tensors.append(temporarytens)
print (torch.cat(source_tensors).reshape(41316, 16))
print (torch.cat(target_tensors).reshape(41316, 16))
src_data = (torch.cat(source_tensors).reshape(41316, 16)).long()
tgt_data = (torch.cat(target_tensors).reshape(41316, 16)).long()
src_data = src_data.to('cuda')
tgt_data = tgt_data.to('cuda')
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)
transformer.to('cuda')
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
transformer.train()
# Define the batch size and number of splits
batch_size = 64
num_splits = math.ceil(src_data.shape[0] / batch_size)
best_loss = float('inf')
num_epochs = 70 # Set the desired number of epochs
with TemporaryDirectory() as tempdir:
best_model_params_path = os.path.join(tempdir, "best_model_params.pt")
for epoch in range(num_epochs):
total_loss = 0.0
for i in range(num_splits):
optimizer.zero_grad()
start_idx = i * batch_size
end_idx = min((i + 1) * batch_size, src_data.shape[0])
src_batch = src_data[start_idx:end_idx]
tgt_batch = tgt_data[start_idx:end_idx]
output = transformer(src_batch, tgt_batch[:, :-1])
loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_batch[:, 1:].contiguous().view(-1))
loss.backward()
optimizer.step()
total_loss += loss.item()
if loss < best_loss:
best_loss = loss
torch.save(transformer.state_dict(), best_model_params_path)
print(f"Epoch: {epoch+1}, Loss: {total_loss / num_splits}")
transformer.load_state_dict(torch.load(best_model_params_path))
torch.save(transformer, "path.pth")
But when trying to interact with the generated model and get it to return an output it does not seem to work
the closest I have gotten to an actual solution was running the following code
model = torch.load("filepath.pth")
num_weights = sum(p.numel() for p in model.parameters())
print("The model has {} weights.".format(num_weights))
enctype = "cl100k_base"
encoding = tiktoken.get_encoding(enctype)
input = torch.tensor(encoding.encode("AACS"))
output = model(input, input[:, :-1])
print(encoding.decode(output))
The transformer class above was copy pasted into the file as well otherwise the code would refuse to work at all, the current error I am getting with this code is
Traceback (most recent call last):
File "filepath.py", line 164, in <module>
output = model(input, input[:, :-1])
IndexError: too many indices for tensor of dimension 1
I also tried a modified version of the code which I dont have anymore but it was similar to the code above which resulted in the error message
Traceback (most recent call last):
File "filepath.py", line 164, in <module>
output = model(input, input)
File "filepath.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "filepath.py", line 138, in forward
src_mask, tgt_mask = self.generate_mask(src.to('cuda'), tgt.to('cuda'))
File "filepath.py", line 131, in generate_mask
seq_length = -tgt.size(1)
IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)