Refactored the IITNet code, but it converges much more slowly than the original code

I have refactored the code, but it converges much more slowly than the original code. I don't know what details I might have missed.

1. GitHub original code page: https://github.com/gist-ailab/IITNet-official
2. My code:

def setProjectPath(projectPath):
    # Make the project root importable and configure the environment.
    import os
    import sys
    root = os.path.abspath(projectPath)
    print(root)
    sys.path.append(root)

    os.environ["TORCH_HOME"] = r"E:\Data\torch-model"  # pretrained-weight cache (Windows path)
    os.environ["KMP_DUPLICATE_LIB_OK"] = 'TRUE'        # tolerate duplicate OpenMP runtimes
    import warnings
    warnings.filterwarnings("ignore", category=RuntimeWarning, module="mne")

projectPath = r'/mount/mount_project/test'
setProjectPath(projectPath)

iitnet_config_10={
    "max_epochs": 500,
    "dataset": "Sleep-EDF",
    "signal_type": "Fpz-Cz",
    "sampling_rate": 100,
    "seq_len": 10,
    "target_idx": -1,
    "n_splits": 20,

    "hidden_dim": 128,
    "batch_size": 256,
    "patience": 10,
    "num_layers": 50,
    "dropout_rate": 0.5,
    "num_classes": 5,
    "early_stopping_mode": "min",
    "bidirectional": True,
    "learning_rate": 0.005,
    "weight_decay": 0.000001
}
iitnet_config_01={
    "max_epochs": 500,
    "dataset": "Sleep-EDF",
    "signal_type": "Fpz-Cz",
    "sampling_rate": 100,
    "seq_len": 1,
    
    "target_idx": -1,
    "n_splits": 20,

    "hidden_dim": 128,
    "batch_size": 256,
    "patience": 10,
    "num_layers": 50,#resnet_18,34,50,101,152
    "dropout_rate": 0.5,
    "num_classes": 5,
    "early_stopping_mode": "min",
    "bidirectional": True,
    "learning_rate": 0.005,
    "weight_decay": 0.000001
}
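One thing worth ruling out before comparing convergence curves: unless both runs fix their random seeds, differences in weight initialization and batch order alone can make one run look slower. A minimal seeding helper (not part of either codebase, just an illustration for a fair comparison):

import random
import numpy as np
import torch

def seed_everything(seed=42):
    # Fix every RNG that affects weight init and data order.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything()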


from tqdm import tqdm

import torch
from torch.utils.data import DataLoader

from project.dataset import loader

train_dataset = loader.EEGDataLoader(iitnet_config_01, 1)
# Note: no shuffle argument is passed, so the DataLoader keeps dataset order
# (shuffle=False by default); also note batch_size=128 here, not the 256 from the config.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=128)

val_dataset = loader.EEGDataLoader(iitnet_config_01, 1, mode='val')
val_dataloader = DataLoader(dataset=val_dataset, batch_size=128)

import project.model.iitnet.models.main_models as iitnet

model = iitnet.MainModel(config=iitnet_config_01)
# '0' is the GPU id string; len('0'.split(",")) == 1, so device_ids == [0].
model = torch.nn.DataParallel(model, device_ids=list(range(len('0'.split(",")))))
optimizer = torch.optim.Adam(model.parameters(),
                             lr=iitnet_config_01['learning_rate'],
                             weight_decay=iitnet_config_01['weight_decay'])
loss_fn = torch.nn.CrossEntropyLoss()
device = torch.device('cuda')
model.to(device)
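Another detail worth checking against the linked repo: whether the original training script decays the learning rate. Training at a constant lr=0.005 where the reference uses a schedule would slow convergence on its own. Purely as an illustration (I have not verified what the original uses), a step schedule would look like:

# Illustrative only: step decay; check the linked repo for what IITNet actually uses.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
# scheduler.step() would then be called once per epoch, at the end of the loop below.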

for j in range(100):
    model.train()
    with tqdm(enumerate(train_dataloader),total=len(train_dataloader)) as t:
        correct, total, train_loss = 0, 0, 0
        for i,batch_data in t:
            
            x,y=batch_data
            x=x.to(device)
            y=y.to(device)
            total += y.size(0)
            y_hat=model(x)
            loss=loss_fn(y_hat,y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            predicted=torch.argmax(y_hat,1)
            correct+=predicted.eq(y).sum().item()
            t.set_description_str(f'Epoch {j}')
            t.set_postfix_str('Loss: %.3f | TRAIN_Acc: %.3f%% (%d/%d)'
                              % (train_loss / (i + 1), 100. * correct / total, correct, total))
    model.eval()
    with torch.no_grad():  # no gradients needed during validation
        with tqdm(enumerate(val_dataloader), total=len(val_dataloader)) as t:
            correct, total, val_loss = 0, 0, 0
            for i, batch_data in t:
                x, y = batch_data
                x = x.to(device)
                y = y.to(device)
                total += y.size(0)
                y_hat = model(x)
                loss = loss_fn(y_hat, y)
                val_loss += loss.item()
                predicted = torch.argmax(y_hat, 1)
                correct += predicted.eq(y).sum().item()
                t.set_description_str(f'Epoch {j}')
                t.set_postfix_str('Loss: %.3f | EVAL_Acc: %.3f%% (%d/%d)'
                                  % (val_loss / (i + 1), 100. * correct / total, correct, total))
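One more observation: the config declares "patience": 10 and "early_stopping_mode": "min", but the loop above never acts on them. If the original trainer early-stops or checkpoints on validation loss, that alone changes how the two runs compare. A minimal sketch of the missing bookkeeping, assuming min-mode on validation loss (the checkpoint path is hypothetical):

best_val = float('inf')  # best validation loss seen so far
bad_epochs = 0           # epochs since the last improvement
patience = iitnet_config_01['patience']

# ...then, inside the `for j in range(100)` loop, after the validation pass:
epoch_val_loss = val_loss / len(val_dataloader)
if epoch_val_loss < best_val:
    best_val, bad_epochs = epoch_val_loss, 0
    torch.save(model.state_dict(), 'best_model.pt')  # keep the best weights
else:
    bad_epochs += 1
    if bad_epochs >= patience:
        break  # no val-loss improvement for `patience` epochs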

I want to know which detail I missed that makes my code converge much more slowly than the original.
