Why is my autoencoder not learning?

106 views

The loss function is MSE, and it is not decreasing (only from 2.32 to 2.24). After 100 epochs the loss does not change. Is the problem the AE architecture or the way I train the AE? Is the input data of shape (200, 1, 52) simply not compressible? Should I increase the size of the compressed representation (200, 16) by changing the encoder architecture?

Training Loss curve

# Standard Torch Packages
import torch
from sklearn.model_selection import train_test_split
from torch.nn import functional as F
from torch import optim
from os.path import join, exists
from os import mkdir
import numpy as np
import glob
import matplotlib.pyplot as plt

# Import own functions
from ae_model import AE
from learning import EarlyStopping, ReduceLROnPlateau, LSIZE

# Define parameters
num_epochs = 50
rollout_path = "data/rollouts/rollout_*.npz"
logdir = "data/"

# Load every rollout .npz archive into memory.
X = []
for x in glob.glob(rollout_path):
    data_point = np.load(x, allow_pickle=True)
    X.append(data_point)

# NOTE(review): despite the names, these are plain Python lists of npz
# archives (one "batch" per rollout), not torch DataLoaders.
train_loader, test_loader = train_test_split(X, test_size=0.2, shuffle=False)
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")
print(device)

model = AE(LSIZE).to(device)
# BUG FIX: lr=1e-1 is ~100x Adam's default (1e-3) and is the most likely
# reason the MSE plateaued around 2.3 -- the optimizer overshoots every
# minimum instead of converging.
optimizer = torch.optim.Adam(model.parameters(),
                             lr=1e-3,
                             weight_decay=1e-8)
scheduler = ReduceLROnPlateau(optimizer, "min", factor=0.5, patience=5)
earlystopping = EarlyStopping("min", patience=3)

# Validation using MSE Loss function (mean over all elements by default).
loss_function = torch.nn.MSELoss()

def train(epoch):
    """Run one training epoch over train_loader.

    Appends each batch loss to the global plot_train_data list (used for
    plotting after training) and returns the list of per-batch losses.
    """
    model.train()
    train_loss = []
    for batch_idx, rollout in enumerate(train_loader):
        # rollout is a numpy .npz archive; "data" holds one rollout's samples.
        data = torch.tensor(rollout["data"]).to(device)
        recon_batch, _ = model(data)
        loss = loss_function(recon_batch, data)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # BUG FIX: train_loss was initialized but never appended to, so the
        # function always returned an empty list.  Record the batch loss for
        # both the return value and the global plotting buffer.
        train_loss.append(loss.item())
        plot_train_data.append(loss.item())

        print(
            "====> Epoch: {} batchId: {} Average loss: {:.4f}".format(
                epoch, batch_idx, loss.item()
            ))

    return train_loss

def test():
    """Evaluate the model on test_loader and return the mean per-batch MSE."""
    model.eval()
    test_loss = 0
    num_batches = 0
    with torch.no_grad():
        for rollout in test_loader:
            data = torch.tensor(rollout["data"]).to(device)
            recon_batch, _ = model(data)
            # MSELoss already averages over every element of the batch, so
            # accumulate per-batch means and later divide by the batch count.
            # BUG FIX: the original divided by len(data) * len(test_loader)
            # (element count of the *last* batch times batch count), which
            # double-averaged and artificially deflated the reported loss.
            test_loss += loss_function(recon_batch, data).item()
            num_batches += 1

    # Guard against an empty test split.
    if num_batches:
        test_loss /= num_batches
    print("====> Test set loss: {:.4f}".format(test_loss))
    return test_loss


def save_checkpoint(state, is_best, filename, best_filename):
    """ Save state in filename. Also save in best_filename if is_best. """
    targets = [filename]
    if is_best:
        targets.append(best_filename)
    for target in targets:
        torch.save(state, target)


# check vae dir exists, if not, create it
ae_dir = join(logdir, "ae_gpu_run_false")
if not exists(ae_dir):
    mkdir(ae_dir)

# Resume training from the best checkpoint if one exists
# (set noreload=True to always start from scratch).
reload_file = join(ae_dir, "best.tar")
noreload = False

if not noreload and exists(reload_file):
    state = torch.load(reload_file)
    print(
        "Reloading model at epoch {}"
        ", with test error {}".format(state["epoch"], state["precision"])
    )
    # Restore every stateful component so training continues seamlessly.
    model.load_state_dict(state["state_dict"])
    optimizer.load_state_dict(state["optimizer"])
    scheduler.load_state_dict(state["scheduler"])
    earlystopping.load_state_dict(state["earlystopping"])

# cur_best: lowest test loss seen so far (None until the first epoch runs).
cur_best = None
# Per-batch losses collected for plotting after training.
plot_train_data = []
plot_test_data = []

# Main loop: train one epoch, evaluate, step LR scheduler and early stopping
# on the test loss, then checkpoint.
for epoch in range(1, num_epochs + 1):

    train(epoch)
    test_loss = test()
    # Both trackers run in "min" mode on the test loss.
    scheduler.step(test_loss)
    earlystopping.step(test_loss)

    # checkpointing
    best_filename = join(ae_dir, "best.tar")
    filename = join(ae_dir, "checkpoint.tar")
    # BUG FIX: `not cur_best` also fired when cur_best == 0.0 (falsy), which
    # would mark any later, worse loss as "best".  Compare against None
    # explicitly so a perfect loss is not treated as unset.
    is_best = cur_best is None or test_loss < cur_best
    if is_best:
        cur_best = test_loss

    save_checkpoint(
        {
            "epoch": epoch,
            "state_dict": model.state_dict(),
            "precision": test_loss,
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "earlystopping": earlystopping.state_dict(),
        },
        is_best,
        filename,
        best_filename,
    )

    if earlystopping.stop:
        print("End of Training because of early stopping at epoch {}".format(epoch))
        break

# Plot the per-batch training losses collected in plot_train_data and save
# the figure next to the checkpoints.  Dead commented-out plotting code was
# removed; the intended axis labels are restored (x axis is batches, not
# epochs, because train() appends one value per batch).
test_plot_path = join(ae_dir, "test_fig.png")
plt.title("AE Training and Test")
plt.xlabel("Batch")
plt.ylabel("MSE Loss")
plt.plot(plot_train_data)
plt.xticks(range(0, len(plot_train_data), 75))
plt.savefig(test_plot_path)
plt.close()

Below are the encoder and decoder models:

import torch
import torch.nn as nn
import torch.nn.functional as F

# Input size of the fc_mu/fc1 linear layers declared below.
# NOTE(review): the encoder actually flattens to 16 features, not 22, and
# neither linear layer is ever called in forward() -- if fc_mu is ever wired
# in, this mismatch will crash at runtime.  Confirm the intended size.
reduced_size = 22


class Decoder(nn.Module):
    """ AE decoder: maps a (N, 16) code back to a (N, 1, 52) reconstruction. """

    def __init__(self, latent_size):
        # latent_size is kept for interface compatibility, but the layers
        # below are hard-wired to a 16-channel code.
        super(Decoder, self).__init__()
        self.latent_size = latent_size
        # BUG FIX: the original also declared fc1 = Linear(latent_size,
        # reduced_size), which forward() never used -- it only added dead
        # parameters to the optimizer/weight decay, so it was removed.
        self.deconv1 = nn.ConvTranspose1d(16, 32, 1, stride=1)
        self.deconv2 = nn.ConvTranspose1d(32, 52, 1, stride=1)

    def forward(self, x):  # pylint: disable=arguments-differ
        # (N, 16) -> (N, 16, 1): add a length-1 spatial dimension.
        x = x.unsqueeze(2)
        x = F.relu(self.deconv1(x))
        # NOTE(review): sigmoid bounds the output to (0, 1).  If the input
        # data is not scaled into [0, 1], the MSE can never reach zero --
        # confirm the data range before keeping this activation.
        x = torch.sigmoid(self.deconv2(x))
        # (N, 52, 1) -> (N, 1, 52); a plain view is safe because one of the
        # swapped dimensions has size 1.
        x = x.view(x.size(0), x.size(2), x.size(1))
        return x


class Encoder(nn.Module):
    """ AE encoder: maps a (N, 1, 52) input to a (N, 16) code. """

    def __init__(self, latent_size):
        super(Encoder, self).__init__()
        # input shape (200, 1, 52) = (batch, in_channel, len_channel)
        self.latent_size = latent_size
        # 1x1 convolutions over the 52 values (moved to the channel axis by
        # the reshape in forward()): 52 -> 32 -> 16 features per sample.
        self.conv1 = nn.Conv1d(52, 32, 1, stride=1)
        self.conv2 = nn.Conv1d(32, 16, 1, stride=1)
        # BUG FIX: the original declared fc_mu = Linear(reduced_size,
        # latent_size) but never called it, and reduced_size=22 does not
        # match the actual 16-dim flattened output -- removed as dead code.

    def forward(self, x):  # pylint: disable=arguments-differ
        # (N, 1, 52) -> (N, 52, 1); a plain view is safe because one of the
        # swapped dimensions has size 1.
        x = x.view(x.size(0), x.size(2), x.size(1))
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        # Flatten to (N, 16).
        x = x.view(x.size(0), -1)
        return x


class AE(nn.Module):
    """ Plain (non-variational) autoencoder: encoder -> code -> decoder.

    BUG FIX (docs): the original docstring said "Variational Autoencoder",
    but there is no mu/logvar or sampling anywhere -- this is a plain AE.
    """

    def __init__(self, latent_size):
        super(AE, self).__init__()
        self.encoder = Encoder(latent_size)
        self.decoder = Decoder(latent_size)

    def forward(self, x):  # pylint: disable=arguments-differ
        # Return (reconstruction, code) for a batch x.
        # NOTE(review): unsqueeze(1) implies x arrives without a channel
        # dim, i.e. (N, 52) -- confirm against the rollout data shape.
        x = x.unsqueeze(1)
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        # Drop the singleton channel dim: (N, 1, 52) -> (N, 52).
        decoded = decoded.squeeze(1)

        return decoded, encoded
0

There are 0 best solutions below