getting error '"typeerror 'int' object is not callable" in code for NLP

41 Views Asked by At

I am trying to build a model for NLP. The below code is giving error "typeerror 'int' object is not callable" in the last line.

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, confusion_matrix
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

from gensim.models import word2vec
from gensim.models import KeyedVectors
from torchtext.vocab import Vectors

from sklearn.metrics import f1_score, precision_score, accuracy_score, mean_squared_error

data = pd.read_table('devel.tsv')
data.head()

data.dtypes

data.shape

data.size

## PREPROCESSING

data.columns=['Words', 'Tags']
data.columns

word = data['Words']
sen=1
Sentence=[]
for letter in word:
 Sentence.append(sen)
 if letter =='.':
  sen+=1
data['Sentence#'] = Sentence
data.head()

data.tail()

word_counts = data.groupby(["Sentence#"])['Words'].agg(['count'])
# word_counts.hist(bins=50)
max_length = word_counts.max()
max_length

## WORD2VEC for embeddings

# !wget https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.vec.bin

word_vectors = KeyedVectors.load_word2vec_format('BioWordVec_PubMed_MIMICIII_d200.vec.bin', binary=True, limit=100000)

pad_ukw = np.ones((2, word_vectors.vectors.shape[-1]), dtype=np.int32)

pad_ukw.shape, word_vectors.vectors.shape

data.head()

word_vecs = torch.from_numpy(word_vectors.vectors)
word_vecs = word_vecs.int()
values = np.concatenate((pad_ukw, word_vecs), axis=0)
new_val = torch.from_numpy(values)
new_val = new_val.int()
word_vecs.dtype, new_val.dtype

## DATASET AND DATALOADER

no_samples, no_features = data.shape

input_size = no_features
output_size = no_features

print (input_size, output_size)

x = data['Words'].values
y = data['Tags'].values

# Convert x and y to numpy arrays
x = np.array(x)
y = np.array(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size =0.3, random_state=1 )

print (f"{x_train}, \n {y_train}, \n {x_test} ,\n {y_test}")

train_dataset = TensorDataset(x_train, y_train)

I have tried various solutions such as changing train_dataset to some other variable name, seeing if I can convert to numpy etc. However, it is not getting solved. I am using torch version 2.0.1+cu118 in google collab.

0

There are 0 best solutions below