I've initialized two identical ANN with PyTorch (both as structure and initial parameters), and I've noticed that the hyperparameters setting with Ray Tune, returns different results for the two ANN, even if I didn't have any random initialization.
Someone could explain what I'm doing wrong? I'll attach the code:
- ANN Initialization:
class Featrues_model(nn.Module):
def __init__(self, n_inputs, dim_hidden, n_outputs):
super().__init__()
self.fc1 = nn.Linear(n_inputs, dim_hidden)
self.fc2 = nn.Linear(dim_hidden, n_outputs)
def forward(self, X):
X = self.fc1(X)
X = self.fc2(X)
return X
features_model_v1 = Featrues_model(len(list_input_variables),5,6)
features_model_v2 = Featrues_model(len(list_input_variables),5,6)
features_model_v2.load_state_dict(features_model_v1.state_dict())
- Hyperpamameters setting
config = {
"lr": tune.choice([1e-2, 1e-5]),
"weight_decay": tune.choice([1e-2, 1e-5]),
"batch_size": tune.choice([16,64]),
"epochs": tune.choice([10,50])
}
- Train & Validation Dataframe
trainset = df_final.copy()
test_abs = int(len(trainset) * 0.8)
train_subset, val_subset = random_split(
trainset, [test_abs, len(trainset) - test_abs]
)
df_train = df_final.iloc[train_subset.indices]
df_val = df_final.iloc[val_subset.indices]
- Train function design
def setting_model(config, df_train, df_val, model):
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
BATCH_SIZE = config["batch_size"]
for epoch in range(config["epochs"]):
train_epoch_loss = 0
train_epoch_acc = 0
step = 0
for i in tqdm(range(0, df_train.shape[0], BATCH_SIZE)):
batch_X = np.array(
df_train[list_input_variables].iloc[i:i+BATCH_SIZE]
)
batch_X = torch.Tensor([x for x in batch_X])
batch_Y = np.array(
df_train[list_output_variables].iloc[i:i+BATCH_SIZE]
)
batch_Y = torch.Tensor([int(y) for y in batch_Y])
batch_Y = batch_Y.type(torch.int64)
optimizer.zero_grad()
outputs = model.forward(batch_X)
train_loss = criterion(outputs, batch_Y)
train_acc = multi_acc(outputs, batch_Y)
train_loss.backward()
optimizer.step()
train_epoch_loss += train_loss.item()
train_epoch_acc += train_acc.item()
step += 1
# print statistics
print(f"Epochs: {epoch}")
print(f"Train Loss: {train_epoch_loss/len(df_train)}")
print(f"Train Acc: {train_epoch_acc/step}")
print("\n")
# Validation loss
with torch.no_grad():
X_val = np.array(
df_val[list_input_variables]
)
X_val = torch.Tensor([x for x in X_val])
Y_val = np.array(
df_val[list_output_variables]
)
Y_val = torch.Tensor([int(y) for y in Y_val])
Y_val = Y_val.type(torch.int64)
outputs = model.forward(X_val)
_, predicted = torch.max(outputs.data, 1)
total = Y_val.size(0)
correct = (predicted == Y_val).sum().item()
loss = criterion(outputs, Y_val)
tune.report(loss=(loss.numpy()), accuracy=correct / total)
print(f"Validation Loss: {loss.numpy()/len(df_val)}")
print(f"Validation Acc: {correct / total:.3f}")
print("Finished Training")
- Hyperparameters Tune
result_v1 = tune.run(
partial(setting_model, df_train=df_train, df_val=df_val, model=features_model_v1),
config=config,
fail_fast="raise",
)
result_v2 = tune.run(
partial(setting_model, df_train=df_train, df_val=df_val, model=features_model_v2),
config=config,
fail_fast="raise"
)
- Output
result_v1.get_best_config()
{'lr': 1e-05, 'weight_decay': 1e-05, 'epochs': 1}
result_v2.get_best_config()
{'lr': 0.01, 'weight_decay': 1e-05, 'epochs': 1}
The issue is the use of torch.random under the hood. Since you are not directly providing a weight matrix for your layers, pytorch initializes it for you. Luckily, you can have a reproducible experiment by setting
One should use only a few random seeds, otherwise you might overfit on the random seed. See lottery ticket hypothesis at https://arxiv.org/abs/1803.03635)