Unusual results when using a BERT model for binary text classification with cross validation

I'm working on a binary text classification task with several pretrained models (BERT, BigBird, etc.) combined with K-fold cross validation. The code runs, but the results are odd.

Take BERT for example: accuracy is around 48~50 and the F1-score is around 65~67 (sometimes 0), but no matter how I change the hyperparameters, the K-fold accuracy only ever comes out as one of three values: 49.97, 50.00, and 50.03. The same issue happens with the other models. Is this normal? I was hoping to see different K-fold results with different hyperparameter combinations.

So I'm wondering if there's something wrong with the code or the data. The dataset is exactly balanced: about 4,000 positive and 4,000 negative examples.
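
For reference, here is a minimal sketch of how the class balance can be confirmed (an assumption on my side: raw_data is a pandas DataFrame with text and label columns, as used in the code below):

print(raw_data['label'].value_counts())  # quick sanity check: examples per class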

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, BertConfig, BertModel, BertTokenizer, BigBirdForSequenceClassification

tokenizer1 = BertTokenizer.from_pretrained('bert-base-chinese')

train_dataset, test_dataset = train_test_split(raw_data, test_size=0.2)
train_encodings = tokenizer1(train_dataset['text'].tolist(), truncation=True, padding=True, max_length=MAX_LEN, return_tensors='pt')
test_encodings = tokenizer1(test_dataset['text'].tolist(), truncation=True, padding=True, max_length=MAX_LEN, return_tensors='pt')

train_labels = torch.tensor(train_dataset['label'].tolist())
test_labels = torch.tensor(test_dataset['label'].tolist())

model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=OUTPUT_DIM, hidden_dropout_prob=DROPOUT, attention_probs_dropout_prob=DROPOUT)

device = torch.device('cuda')
model.to(device)

train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.CrossEntropyLoss().to(device)

for epoch in range(EPOCH):
    model.train()
    for batch_idx, (input_ids, attention_mask, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch+1, batch_idx * len(input_ids), len(train_loader.dataset), 100. * batch_idx / len(train_loader), loss.item()))

model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for batch_idx, (input_ids, attention_mask, labels) in enumerate(test_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs[0], 1)
        y_true += labels.tolist()
        y_pred += predicted.tolist()
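
# A minimal sketch (an assumption; the exact metric call isn't shown above) of
# how the accuracy and F1 figures quoted in the question are computed from
# y_true / y_pred, using scikit-learn:
from sklearn.metrics import accuracy_score, f1_score
print('Test accuracy: {:.2f}'.format(accuracy_score(y_true, y_pred) * 100))
print('Test F1-score: {:.2f}'.format(f1_score(y_true, y_pred) * 100))
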
# Stratified k-fold cross validation for text classification
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits= K_FOLD, shuffle=True)
skf.get_n_splits(raw_data['text'].tolist(), raw_data['label'].tolist())
fold = 0
for train_index, test_index in skf.split(raw_data['text'].tolist(), raw_data['label'].tolist()):
    fold += 1
    print("Fold:", fold)
    train_dataset = raw_data.iloc[train_index]
    test_dataset = raw_data.iloc[test_index]
    train_encodings = tokenizer1(train_dataset['text'].tolist(), truncation=True, padding=True, max_length=MAX_LEN, return_tensors='pt')
    test_encodings = tokenizer1(test_dataset['text'].tolist(), truncation=True, padding=True, max_length=MAX_LEN, return_tensors='pt')
    train_labels = torch.tensor(train_dataset['label'].tolist())
    test_labels = torch.tensor(test_dataset['label'].tolist())

    model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=OUTPUT_DIM, hidden_dropout_prob=DROPOUT, attention_probs_dropout_prob=DROPOUT)
    device = torch.device('cuda')
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    loss_fn = nn.CrossEntropyLoss().to(device)

    train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

    for epoch in range(EPOCH):
        model.train()
        total_loss = 0
        for batch_idx, (input_ids, attention_mask, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            total_loss += loss.item()
            loss.backward()
            optimizer.step()


        avg_train_loss = total_loss / len(train_loader)
        # print("Train Loss: {0:.4f}".format(avg_train_loss))

    model.eval()

    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for batch_idx, (input_ids, attention_mask, labels) in enumerate(test_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)

            _, predicted = torch.max(outputs[0], 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()
    try:
        val_acc = val_correct / val_total
    except ZeroDivisionError:
        val_acc = 0
    print("Validation Accuracy: {0:.4f}".format(val_acc))

I expect to see different K-fold results with different hyperparameter combinations.
