Periodic/sinusoidal MSE loss in a custom implementation of linear regression


I am implementing PyTorch-like modules (for educational purposes) and ran a simple training routine as a check. However, the loss oscillates, and I am not sure why.

Below is the code. The training loop comes first and the module implementations follow, so you may need to rearrange the blocks to run it locally.

Data Generation

import numpy as np
import matplotlib.pyplot as plt

# Ground-truth parameters (slope a, intercept b) that we want to learn
parameters = np.array([1.3, 0.0])


def make_data(N, a, b, *, noise=0.1, x_min=0.0, x_max=1.0):
    X = np.random.rand(N) * (x_max - x_min) + x_min
    X = X.reshape(-1, 1)
    y = X * a + b + np.random.randn(N, 1) * noise
    X_line = np.array([x_min, x_max])
    y_line = X_line * a + b
    return (X, y), (X_line, y_line)

(X, y), (Xline, yline) = make_data(50, *parameters, noise=0.05)
(X_validation, y_validation), _ = make_data(50, *parameters, noise=0.05)

plt.scatter(X, y)
plt.scatter(X_validation, y_validation, alpha=0.5)
plt.plot(Xline, yline)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])

[Plot: scatter of training and validation data with the ground-truth line]

Training loop (see below for module implementations)

criterion = MSELoss()
model = Sequential(
    Linear(1, 5, bias=True),
    ReLU(),
    Linear(5, 1, bias=True)
)


num_epochs = 1000

sgd_params = {
    'learning_rate': 1e-3,
    'weight_decay': 0.0,
    'schedule_scale': 1.0,
}

history = {
    'train': {
        'loss': [],
        'epoch': [],
    },
    'validation': {
        'loss': [],
        'epoch': [],
    }
}

for epoch in range(num_epochs):
    with TrainingContext(model, criterion) as tc:
        # Forward pass
        y_hat = model(X)
        loss = criterion(y_hat, y)

        # Backward pass in reverse order
        dL = criterion.backward(loss)
        model.backward(dL)

        # Update gradients
        model.update(**sgd_params)
        criterion.update(**sgd_params)
        
        # Scheduler
        sgd_params['learning_rate'] = sgd_params['learning_rate'] * sgd_params['schedule_scale']

    history['train']['epoch'].append(epoch)
    history['train']['loss'].append(loss)

    # Validation
    y_hat = model(X_validation)
    loss = criterion(y_hat, y_validation)
    history['validation']['epoch'].append(epoch)
    history['validation']['loss'].append(loss)

    # Tracking
    if (epoch+1) % 100 == 0:
        print(f'{epoch+1} / {num_epochs}: Training: {history["train"]["loss"][-1]:.2e} Validation: {history["validation"]["loss"][-1]:.2e}')

plt.plot(history['train']['epoch'], history['train']['loss'], label='Training')
plt.plot(history['validation']['epoch'], history['validation']['loss'], label='Validation')
plt.legend()

[Plot: training and validation loss over epochs, oscillating periodically]

If I change the model to

model = Sequential(
    Linear(1, 1, bias=True),
)

I get: [Plot: training and validation loss for the single-layer model]

Module definitions

Base module, Sequential wrapper, and Training context manager

class Module:
    def __init__(self):
        self._save_for_backward = {}
        self._grad = {}

        self.is_training = False  # Don't set this manually
    
    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

    def zero_grad(self):
        # print(f'===> DEBUG: zero_grad')
        if not self.is_training:
            raise RuntimeError('Please run zero_grad inside the training context')
        self._grad = {}
    
    def reset(self):
        self.zero_grad()
        self._save_for_backward = {}
    
    def save_for_backward(self, *args, **kwargs):
        r'''Saves or retrieves anything needed for training

        - If called with a positional argument ==> Returns saved value
        - If called with a keyword argument (with value assignment) ==> Saves the value
        '''
        if len(args) > 0 and len(kwargs) > 0:
            raise ValueError('Cannot save for backward and retrieve at the same time')
        elif len(args) == 0 and len(kwargs) == 0:
            return self._save_for_backward
        elif len(args) > 0:
            result = []
            for arg in args:
                result.append(self._save_for_backward[arg])
            if len(result) == 1:
                return result[0]
            else:
                return result
        elif self.is_training:
            for key, value in kwargs.items():
                self._save_for_backward[key] = value
        return None

    def update(self, *args, **kwargs):
        pass
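
To make the dual behaviour of save_for_backward concrete, here is a minimal illustration (not part of the training code) of the store/retrieve convention:

m = Module()
m.is_training = True        # normally set by TrainingContext
m.save_for_backward(x=3.0)  # keyword argument => stores the value
m.save_for_backward('x')    # positional argument => retrieves it, returns 3.0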


class Sequential(Module):
    def __init__(self, *modules):
        # Assign modules before super().__init__(), which sets
        # self.is_training and therefore needs the property setter below
        self.modules = modules
        super().__init__()
    
    def forward(self, X, *args, **kwargs):
        for mod in self.modules:
            X = mod(X)
        return X
    
    def backward(self, dLdy):
        grad = dLdy
        # print(grad.shape)
        for mod in self.modules[::-1]:
            grad = mod.backward(grad)
            # print(grad.shape)
        return grad

    def update(self, *args, **kwargs):
        for mod in self.modules:
            mod.update(*args, **kwargs)

    @property
    def is_training(self):
        is_training = []
        for mod in self.modules:
            is_training.append(mod.is_training)
        return is_training
    
    @is_training.setter
    def is_training(self, value):
        if not isinstance(value, (list, tuple)):
            value = [value] * len(self.modules)
        for idx, mod in enumerate(self.modules):
            mod.is_training = value[idx]
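
Note that Sequential exposes the training flag per contained module; for example:

seq = Sequential(Linear(1, 5), ReLU(), Linear(5, 1))
print(seq.is_training)  # [False, False, False]
seq.is_training = True
print(seq.is_training)  # [True, True, True]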


class TrainingContext:
    r'''Makes sure the modules are in training mode

    Usage:
        with TrainingContext(layer1, layer2, loss) as tc:
            ...
    '''
    def __init__(self, *modules, reset_on_exit=False):
        self.modules = modules
        self.old_states = []
        self.reset_on_exit = reset_on_exit
    
    def __enter__(self):
        for mod in self.modules:
            self.old_states.append(mod.is_training)
            mod.is_training = True
        return self
    
    def __exit__(self, *args, **kwargs):
        for idx, mod in enumerate(self.modules):
            mod.is_training = self.old_states[idx]
            if self.reset_on_exit:
                mod.reset()
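
For example, the context toggles the flag on entry and restores the previous state on exit:

layer = Linear(1, 1)
print(layer.is_training)      # False before the context
with TrainingContext(layer):
    print(layer.is_training)  # True inside the context
print(layer.is_training)      # False again, restored on exit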

MSE Loss and ReLU

class MSELoss(Module):
    def forward(self, y_hat, y):
        diff = y_hat - y
        self.save_for_backward(diff=diff, k=len(y))
        diff_sq = diff * diff
        return 0.5 * diff_sq.mean()
    
    def backward(self, loss):
        diff = self.save_for_backward('diff')
        k = self.save_for_backward('k')
        self._grad['loss'] = self._grad.get('loss', np.zeros_like(diff))
        self._grad['loss'] += diff / k
        return self._grad['loss']
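
For reference, the gradient implemented here follows from L = (1/(2N)) * sum_i (y_hat_i - y_i)^2, so dL/dy_hat_i = (y_hat_i - y_i) / N, which is exactly diff / k for a batch of k samples.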


class ReLU(Module):
    def forward(self, X):
        zeromask = X <= 0.0
        self.save_for_backward(zeromask=zeromask)
        y = X.copy()
        y[zeromask] = 0.0
        return y
    
    def backward(self, dLdy):
        dLdX = dLdy.copy()
        zeromask = self.save_for_backward('zeromask')
        dLdX[zeromask] = 0.0
        return dLdX

Linear Layer

class Linear(Module):
    def __init__(self, Cin, Cout, bias=True):
        super().__init__()
        self.Cin = Cin
        self.Cout = Cout
        self.weight = np.random.randn(Cin, Cout)
        self.bias = np.zeros(Cout) if bias else None

    def forward(self, X):
        # print(f'===> DEBUG: forward')
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        if self.is_training:
            self.save_for_backward(X=X.copy())
        y = X @ self.weight
        return y

    def backward(self, dLdy):
        # dLdy.shape = N x Cout
        # dydw.shape = N x Cin
        # print(f'===> DEBUG: backward')
        if not self.is_training:
            raise RuntimeError('Please run backward inside the training context')
        dydX = self.weight.T
        dLdX = dLdy @ dydX
        
        dydw = self.save_for_backward('X')
        self._grad['weight'] = self._grad.get('weight', np.zeros_like(self.weight))
        self._grad['weight'] += dydw.T @ dLdy

        if self.bias is not None:
            self._grad['bias'] = self._grad.get('bias', np.zeros_like(self.bias))
            self._grad['bias'] += dLdy.sum(0)
        
        return dLdX

    def update(self, learning_rate=1e-3, weight_decay=1e-4, zero_grad=True, *args, **kwargs):
        # print(f'===> DEBUG: update')
        if not self.is_training:
            raise RuntimeError('Please run update inside the training context')
        self.weight -= learning_rate * (self._grad['weight'] + weight_decay * self.weight)
        if self.bias is not None:
            self.bias -= learning_rate * (self._grad['bias'] + weight_decay * self.bias)
        if zero_grad:
            self.zero_grad()
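
As a sanity check for backward implementations like these, one can compare the analytic gradient against a central finite-difference estimate. A minimal sketch, assuming the classes above (the helper name finite_diff_check is mine):

import numpy as np

def finite_diff_check(eps=1e-6, n=10):
    # Fresh layer and loss so no stale gradients carry over
    layer = Linear(1, 1, bias=False)
    criterion = MSELoss()
    X = np.random.randn(n, 1)
    y = np.random.randn(n, 1)

    with TrainingContext(layer, criterion):
        loss = criterion(layer(X), y)
        layer.backward(criterion.backward(loss))
        analytic = layer._grad['weight'][0, 0]

    # Central difference on the single scalar weight
    w0 = layer.weight[0, 0]
    layer.weight[0, 0] = w0 + eps
    loss_plus = criterion(layer(X), y)
    layer.weight[0, 0] = w0 - eps
    loss_minus = criterion(layer(X), y)
    layer.weight[0, 0] = w0  # restore

    numeric = (loss_plus - loss_minus) / (2 * eps)
    print(analytic, numeric)  # should agree to within ~1e-6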