I am implementing PyTorch-like modules (for educational purposes) and ran a simple training routine to check them. However, the loss oscillates and I am not sure why.
Below is the code. The training loop comes first and the layer implementations follow it, so you may need to rearrange the blocks to run it locally.
Data Generation
import numpy as np
import matplotlib.pyplot as plt

# These are the parameters that we want to learn
parameters = np.array([1.3, 0.0])

def make_data(N, a, b, *, noise=0.1, x_min=0.0, x_max=1.0):
    X = np.random.rand(N) * (x_max - x_min) + x_min
    X = X.reshape(-1, 1)
    y = X * a + b + np.random.randn(N, 1) * noise
    X_line = np.array([x_min, x_max])
    y_line = X_line * a + b
    return (X, y), (X_line, y_line)

(X, y), (Xline, yline) = make_data(50, *parameters, noise=0.05)
(X_validation, y_validation), _ = make_data(50, *parameters, noise=0.05)

plt.scatter(X, y)
plt.scatter(X_validation, y_validation, alpha=0.5)
plt.plot(Xline, yline)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
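For reference, both splits come out as column vectors:
print(X.shape, y.shape)                        # (50, 1) (50, 1)
print(X_validation.shape, y_validation.shape)  # (50, 1) (50, 1)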
Training loop (see below for module implementations)
criterion = MSELoss()
model = Sequential(
    Linear(1, 5, bias=True),
    ReLU(),
    Linear(5, 1, bias=True),
)

num_epochs = 1000
sgd_params = {
    'learning_rate': 1e-3,
    'weight_decay': 0.0,
    'schedule_scale': 1.0,
}
history = {
    'train': {
        'loss': [],
        'epoch': [],
    },
    'validation': {
        'loss': [],
        'epoch': [],
    },
}

for epoch in range(num_epochs):
    with TrainingContext(model, criterion) as tc:
        # Forward pass
        y_hat = model(X)
        loss = criterion(y_hat, y)
        # Backward pass in reverse order
        dL = criterion.backward(loss)
        model.backward(dL)
        # Update parameters
        model.update(**sgd_params)
        criterion.update(**sgd_params)

    # Scheduler
    sgd_params['learning_rate'] = sgd_params['learning_rate'] * sgd_params['schedule_scale']
    history['train']['epoch'].append(epoch)
    history['train']['loss'].append(loss)

    # Validation
    y_hat = model(X_validation)
    loss = criterion(y_hat, y_validation)
    history['validation']['epoch'].append(epoch)
    history['validation']['loss'].append(loss)

    # Tracking
    if (epoch + 1) % 100 == 0:
        print(f'{epoch+1} / {num_epochs}: Training: {history["train"]["loss"][-1]:.2e} Validation: {history["validation"]["loss"][-1]:.2e}')

plt.plot(history['train']['epoch'], history['train']['loss'], label='Training')
plt.plot(history['validation']['epoch'], history['validation']['loss'], label='Validation')
plt.legend()
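When debugging, I also probe the gradient magnitudes for a single step, roughly like this (it pokes directly at the internal _grad dict and zeroes it afterwards so the probe does not leak into a later update):
with TrainingContext(model, criterion):
    y_hat = model(X)
    loss = criterion(y_hat, y)
    model.backward(criterion.backward(loss))
    for idx, mod in enumerate(model.modules):
        if isinstance(mod, Linear):
            print(idx, np.abs(mod._grad['weight']).max())
            mod.zero_grad()   # don't let this probe leak into the next update
    criterion.zero_grad()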
If I change the model to
model = Sequential(
    Linear(1, 1, bias=True),
)
Module definitions
Base module, Sequential wrapper, and Training context manager
class Module:
    def __init__(self):
        self._save_for_backward = {}
        self._grad = {}
        self.is_training = False  # Don't set this manually

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

    def zero_grad(self):
        # print(f'===> DEBUG: zero_grad')
        if not self.is_training:
            raise RuntimeError('Please run zero_grad inside the training context')
        self._grad = {}

    def reset(self):
        self.zero_grad()
        self._save_for_backward = {}

    def save_for_backward(self, *args, **kwargs):
        r'''Saves or retrieves anything needed for training
        - If called with a positional argument ==> Returns saved value
        - If called with a keyword argument (with value assignment) ==> Saves the value
        '''
        if len(args) > 0 and len(kwargs) > 0:
            raise ValueError('Cannot save for backward and retrieve at the same time')
        elif len(args) == 0 and len(kwargs) == 0:
            return self._save_for_backward
        elif len(args) > 0:
            result = []
            for arg in args:
                result.append(self._save_for_backward[arg])
            if len(result) == 1:
                return result[0]
            else:
                return result
        elif self.is_training:
            for key, value in kwargs.items():
                self._save_for_backward[key] = value
        return None

    def update(self, *args, **kwargs):
        pass
class Sequential(Module):
    def __init__(self, *modules):
        self.modules = modules
        super().__init__()

    def forward(self, X, *args, **kwargs):
        for mod in self.modules:
            X = mod(X)
        return X

    def backward(self, dLdy):
        grad = dLdy
        # print(grad.shape)
        for mod in self.modules[::-1]:
            grad = mod.backward(grad)
            # print(grad.shape)
        return grad

    def update(self, *args, **kwargs):
        for mod in self.modules:
            mod.update(*args, **kwargs)

    @property
    def is_training(self):
        is_training = []
        for mod in self.modules:
            is_training.append(mod.is_training)
        return is_training

    @is_training.setter
    def is_training(self, value):
        if not isinstance(value, (list, tuple)):
            value = [value] * len(self.modules)
        for idx, mod in enumerate(self.modules):
            mod.is_training = value[idx]
class TrainingContext:
    r'''Makes sure the modules are in the training mode

    Usage:
        with TrainingContext(layer1, layer2, loss) as tc:
            ...
    '''
    def __init__(self, *modules, reset_on_exit=False):
        self.modules = modules
        self.old_states = []
        self.reset_on_exit = reset_on_exit

    def __enter__(self):
        for mod in self.modules:
            self.old_states.append(mod.is_training)
            mod.is_training = True
        return self

    def __exit__(self, *args, **kwargs):
        for idx, mod in enumerate(self.modules):
            mod.is_training = self.old_states[idx]
            if self.reset_on_exit:
                mod.reset()
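To make the save_for_backward convention concrete, this is how I use it (a throwaway example on a bare Module, not part of the training code):
mod = Module()
with TrainingContext(mod):
    mod.save_for_backward(x=np.array([1.0, 2.0]), k=2)  # keyword arguments: save
    x, k = mod.save_for_backward('x', 'k')              # positional arguments: retrieve
    print(x, k)  # [1. 2.] 2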
MSE Loss and ReLU
class MSELoss(Module):
    def forward(self, y_hat, y):
        diff = y_hat - y
        self.save_for_backward(diff=diff, k=len(y))
        diff_sq = diff * diff
        return 0.5 * diff_sq.mean()

    def backward(self, loss):
        diff = self.save_for_backward('diff')
        k = self.save_for_backward('k')
        self._grad['loss'] = self._grad.get('loss', np.zeros_like(diff))
        self._grad['loss'] += diff / k
        return self._grad['loss']


class ReLU(Module):
    def forward(self, X):
        zeromask = X <= 0.0
        self.save_for_backward(zeromask=zeromask)
        y = X.copy()
        y[zeromask] = 0.0
        return y

    def backward(self, dLdy):
        dLdX = dLdy.copy()
        zeromask = self.save_for_backward('zeromask')
        dLdX[zeromask] = 0.0
        return dLdX
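For reference, the gradient I intend in MSELoss.backward: with loss = 0.5 * mean((y_hat - y)**2) over k samples (and a single output column, as here), d(loss)/d(y_hat_i) = (y_hat_i - y_i) / k, which is exactly the diff / k that backward returns.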
Linear Layer
class Linear(Module):
    def __init__(self, Cin, Cout, bias=True):
        super().__init__()
        self.Cin = Cin
        self.Cout = Cout
        self.weight = np.random.randn(Cin, Cout)
        self.bias = np.zeros(Cout) if bias else None

    def forward(self, X):
        # print(f'===> DEBUG: forward')
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        if self.is_training:
            self.save_for_backward(X=X.copy())
        y = X @ self.weight
        return y

    def backward(self, dLdy):
        # dLdy.shape = N x Cout
        # dydw.shape = N x Cin
        # print(f'===> DEBUG: backward')
        if not self.is_training:
            raise RuntimeError('Please run backward inside the training context')
        dydX = self.weight.T
        dLdX = dLdy @ dydX
        dydw = self.save_for_backward('X')
        self._grad['weight'] = self._grad.get('weight', np.zeros_like(self.weight))
        self._grad['weight'] += dydw.T @ dLdy
        if self.bias is not None:
            self._grad['bias'] = self._grad.get('bias', np.zeros_like(self.bias))
            self._grad['bias'] += dLdy.sum(0)
        return dLdX

    def update(self, learning_rate=1e-3, weight_decay=1e-4, zero_grad=True, *args, **kwargs):
        # print(f'===> DEBUG: update')
        if not self.is_training:
            raise RuntimeError('Please run update inside the training context')
        self.weight -= learning_rate * (self._grad['weight'] + weight_decay * self.weight)
        if self.bias is not None:
            self.bias -= learning_rate * (self._grad['bias'] + weight_decay * self.bias)
        if zero_grad:
            self.zero_grad()
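To convince myself the analytic gradients in Linear.backward are right, I ran a finite-difference check along these lines (a standalone sanity check with made-up names like lin and X_chk, not part of the training run above):
lin = Linear(3, 1, bias=True)
crit = MSELoss()
X_chk = np.random.rand(10, 3)
y_chk = np.random.rand(10, 1)
with TrainingContext(lin, crit):
    loss = crit(lin(X_chk), y_chk)
    lin.backward(crit.backward(loss))        # fills lin._grad['weight']
    analytic = lin._grad['weight'].copy()
    eps = 1e-6
    numeric = np.zeros_like(lin.weight)
    for i in range(lin.weight.shape[0]):
        for j in range(lin.weight.shape[1]):
            lin.weight[i, j] += eps
            numeric[i, j] = (crit(lin(X_chk), y_chk) - loss) / eps
            lin.weight[i, j] -= eps
    print(np.abs(analytic - numeric).max())  # should be tiny, on the order of eps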


