The loss function is MSE which is not decreasing (2.32 to 2.24). What is the problem AE architecture or the way I train the AE? After 100 epochs the loss doesn't change. Is the input data(200,1,52) can't be compressed? Should I increase the compressed data size(200,16) by changing the encoder architecture?
# Standard Torch Packages
import torch
from sklearn.model_selection import train_test_split
from torch.nn import functional as F
from torch import optim
from os.path import join, exists
from os import mkdir
import numpy as np
import glob
import matplotlib.pyplot as plt
# Import own functions
from ae_model import AE
from learning import EarlyStopping, ReduceLROnPlateau, LSIZE
# Define parameters
num_epochs = 50
rollout_path = "data/rollouts/rollout_*.npz"
logdir = "data/"
X = []
for x in glob.glob(rollout_path):
data_point = np.load(x, allow_pickle=True)
X.append(data_point)
train_loader, test_loader = train_test_split(X, test_size=0.2, shuffle= False)
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")
print(device)
model = AE(LSIZE).to(device)
#optimizer = optim.Adam(model.parameters())
optimizer = torch.optim.Adam(model.parameters(),
lr = 1e-1,
weight_decay = 1e-8)
scheduler = ReduceLROnPlateau(optimizer, "min", factor=0.5, patience=5)
earlystopping = EarlyStopping("min", patience=3)
# Validation using MSE Loss function
loss_function = torch.nn.MSELoss()
def train(epoch):
""" One training epoch """
model.train()
train_loss = []
#train_data_points = 0
for batch_idx, rollout in enumerate(train_loader):
data = torch.tensor(rollout["data"]).to(device)
train_data_points = len(data) * len(train_loader)
#recon_batch, mu, logvar = model(data)
recon_batch, _ = model(data)
loss = loss_function(recon_batch, data)
optimizer.zero_grad()
loss.backward()
#train_loss += loss.item()
optimizer.step()
plot_train_data.append(loss.item())
# if batch_idx % 20 == 0:
# print(
# "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
# epoch,
# batch_idx * len(data),
# train_data_points,
# 100.0 * batch_idx / len(train_loader),
# loss.item() / len(data),
# )
# )
# train_loss /= train_data_points
# print(
# "====> Epoch: {} Average loss: {:.4f}".format(
# epoch, train_loss / train_data_points
# )
print(
"====> Epoch: {} batchId: {} Average loss: {:.4f}".format(
epoch, batch_idx, loss.item()
))
# )
return train_loss
def test():
model.eval()
test_loss = 0
test_data_points = 0
with torch.no_grad():
for rollout in test_loader:
data = torch.tensor(rollout["data"]).to(device)
test_data_points = len(data) * len(test_loader)
#recon_batch, mu, logvar = model(data)
recon_batch, _ = model(data)
test_loss += loss_function(recon_batch, data).item()
test_loss /= test_data_points
print("====> Test set loss: {:.4f}".format(test_loss))
return test_loss
def save_checkpoint(state, is_best, filename, best_filename):
""" Save state in filename. Also save in best_filename if is_best. """
torch.save(state, filename)
if is_best:
torch.save(state, best_filename)
# check vae dir exists, if not, create it
ae_dir = join(logdir, "ae_gpu_run_false")
if not exists(ae_dir):
mkdir(ae_dir)
reload_file = join(ae_dir, "best.tar")
noreload = False
if not noreload and exists(reload_file):
state = torch.load(reload_file)
print(
"Reloading model at epoch {}"
", with test error {}".format(state["epoch"], state["precision"])
)
model.load_state_dict(state["state_dict"])
optimizer.load_state_dict(state["optimizer"])
scheduler.load_state_dict(state["scheduler"])
earlystopping.load_state_dict(state["earlystopping"])
cur_best = None
plot_train_data = []
plot_test_data = []
for epoch in range(1, num_epochs + 1):
#plot_train_data.append(train(epoch))
train(epoch)
test_loss = test()
scheduler.step(test_loss)
earlystopping.step(test_loss)
# checkpointing
best_filename = join(ae_dir, "best.tar")
filename = join(ae_dir, "checkpoint.tar")
is_best = not cur_best or test_loss < cur_best
if is_best:
cur_best = test_loss
save_checkpoint(
{
"epoch": epoch,
"state_dict": model.state_dict(),
"precision": test_loss,
"optimizer": optimizer.state_dict(),
"scheduler": scheduler.state_dict(),
"earlystopping": earlystopping.state_dict(),
},
is_best,
filename,
best_filename,
)
if earlystopping.stop:
print("End of Training because of early stopping at epoch {}".format(epoch))
break
test_plot_path = join(ae_dir, "test_fig.png")
# legend_strings = []
plt.title("AE Training and Test")
#plt.xlabel("Epochs")
#plt.ylabel("MSE losses")
#plt.plot(plot_test_data)
# legend_strings.append('Test')
#plt.legend('Test')
#plt.savefig(test_plot_path)
#plt.close()
#train_plot_path = join(ae_dir, "train_fig.png")
#plt.title("AE ")
#plt.xlabel("Epochs")
#plt.ylabel("MSE Loss")
plt.plot(plot_train_data)
# legend_strings.append('Train')
#plt.legend('Train')
plt.xticks(range(0, len(plot_train_data), 75))
plt.savefig(test_plot_path)
plt.close()
Below is the encoder and decoder model:
import torch
import torch.nn as nn
import torch.nn.functional as F
reduced_size = 22
class Decoder(nn.Module):
""" VAE decoder """
def __init__(self, latent_size):
super(Decoder, self).__init__()
self.latent_size = latent_size
self.fc1 = nn.Linear(latent_size, reduced_size)
self.deconv1 = nn.ConvTranspose1d(16, 32, 1, stride=1)
self.deconv2 = nn.ConvTranspose1d(32, 52, 1, stride=1)
def forward(self, x): # pylint: disable=arguments-differ
x = x.unsqueeze(2)
x = F.relu(self.deconv1(x))
x = torch.sigmoid(self.deconv2(x))
x = x.view(x.size(0), x.size(2), x.size(1))
return x
class Encoder(nn.Module): # pylint: disable=too-many-instance-attributes
""" VAE encoder """
def __init__(self, latent_size):
super(Encoder, self).__init__()
# input shape (200, 1, 52)
# batch_size, in_channel, len_channel
self.latent_size = latent_size
self.conv1 = nn.Conv1d(52, 32, 1, stride=1)
self.conv2 = nn.Conv1d(32, 16, 1, stride=1)
# output shape (200, 1, x)
self.fc_mu = nn.Linear(reduced_size, latent_size)
def forward(self, x): # pylint: disable=arguments-differ
x = x.view(x.size(0), x.size(2), x.size(1))
x = F.relu(self.conv1(x))
x = F.relu(self.conv2(x))
x = x.view(x.size(0), -1)
return x
class AE(nn.Module):
""" Variational Autoencoder """
def __init__(self, latent_size):
super(AE, self).__init__()
self.encoder = Encoder(latent_size)
self.decoder = Decoder(latent_size)
def forward(self, x): # pylint: disable=arguments-differ
x = x.unsqueeze(1)
encoded = self.encoder(x)
decoded = self.decoder(encoded)
decoded = decoded.squeeze(1)
return decoded, encoded