I'm using Python 3.9 and fastai 1.0.58. I tried to implement the code from this source: https://towardsdatascience.com/fastai-with-transformers-bert-roberta-xlnet-xlm-distilbert-4f41ee18ecb2, but I'm getting an error when creating the DataBunch.
train = pd.read_csv('train.tsv.zip', sep="\t")
test = pd.read_csv('test.tsv.zip', sep="\t")
print(train.shape,test.shape)
train.head()
(156060, 4) (66292, 3)

   PhraseId  SentenceId                                             Phrase  Sentiment
0         1           1  A series of escapades demonstrating the adage ...          1
1         2           1  A series of escapades demonstrating the adage ...          2
2         3           1                                           A series          2
3         4           1                                                  A          2
4         5           1                                             series          2
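For reference, transformer_tokenizer, transformer_processor, model_type, seed and bs come from earlier cells that follow the article. Condensed, they look roughly like this (a sketch of the article's setup, so it may differ slightly from my actual cells; the concrete values are just examples):

from fastai.text import *          # Tokenizer, BaseTokenizer, Vocab, processors, TextList
from transformers import RobertaTokenizer

model_type = 'roberta'             # example; the article supports bert/roberta/xlnet/xlm/distilbert
seed, bs = 42, 16                  # example values

transformer_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class TransformersBaseTokenizer(BaseTokenizer):
    "Wrap a HuggingFace tokenizer so fastai's Tokenizer can call it."
    def __init__(self, pretrained_tokenizer, model_type='roberta', **kwargs):
        self._pretrained_tokenizer = pretrained_tokenizer
        self.max_seq_len = pretrained_tokenizer.max_len  # model_max_length on newer transformers releases
        self.model_type = model_type
    def __call__(self, *args, **kwargs): return self
    def tokenizer(self, t):
        # truncate and add the special tokens expected by the model
        CLS, SEP = self._pretrained_tokenizer.cls_token, self._pretrained_tokenizer.sep_token
        tokens = self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2]
        return [CLS] + tokens + [SEP]

class TransformersVocab(Vocab):
    "Numericalize with the HuggingFace vocabulary instead of a fastai one."
    def __init__(self, tokenizer):
        super().__init__(itos=[])
        self.tokenizer = tokenizer
    def numericalize(self, t): return self.tokenizer.convert_tokens_to_ids(t)
    def textify(self, nums, sep=' '):
        return sep.join(self.tokenizer.convert_ids_to_tokens(nums))

fastai_tokenizer = Tokenizer(tok_func=TransformersBaseTokenizer(transformer_tokenizer, model_type),
                             pre_rules=[], post_rules=[])
transformer_processor = [TokenizeProcessor(tokenizer=fastai_tokenizer, include_bos=False, include_eos=False),
                         NumericalizeProcessor(vocab=TransformersVocab(transformer_tokenizer))]

Then the DataBunch is built like this: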
pad_first = bool(model_type in ['xlnet'])
pad_idx = transformer_tokenizer.pad_token_id
databunch = (TextList.from_df(train, cols='Phrase', processor=transformer_processor)
.split_by_rand_pct(0.1,seed=seed)
.label_from_df(cols= 'Sentiment')
.add_test(test)
.databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[59], line 1
----> 1 databunch = (TextList.from_df(train, cols='Phrase', processor=transformer_processor) .split_by_rand_pct(0.1,seed=seed)
2 .label_from_df(cols= 'Sentiment')
3 .add_test(test)
4 .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))
File ~\anaconda3\envs\adtree\lib\site-packages\fastai\data_block.py:135, in ItemList.from_df(cls, df, path, cols, processor, **kwargs)
132 @classmethod
133 def from_df(cls, df:DataFrame, path:PathOrStr='.', cols:IntsOrStrs=0, processor:PreProcessors=None, **kwargs)->'ItemList':
134 "Create an `ItemList` in `path` from the inputs in the `cols` of `df`."
--> 135 inputs = df.iloc[:,df_names_to_idx(cols, df)]
136 assert not inputs.isna().any().any(), f"You have NaN values in column(s) {cols} of your dataframe, please fix it."
137 res = cls(items=_maybe_squeeze(inputs.values), path=path, inner_df=df, processor=processor, **kwargs)
AttributeError: module 'fastai.train' has no attribute 'iloc'
I don't know what I'm supposed to import for that module. Any solution for this problem?
Update: I tried pointing pandas at the unzipped files instead:
train = pd.read_csv('train.tsv', sep="\t")
test = pd.read_csv('test.tsv', sep="\t")
print(train.shape,test.shape)
train.head()
and reran the DataBunch cell. This time it runs, but I get another error:
BrokenProcessPool Traceback (most recent call last)
Cell In[85], line 1
----> 1 databunch = (TextList.from_df(train, cols='Phrase', processor=transformer_processor)
2 .split_by_rand_pct(0.1,seed=seed)
3 .label_from_df(cols= 'Sentiment')
4 .add_test(test)
5 .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))
File ~\anaconda3\envs\adtree\lib\site-packages\fastai\data_block.py:480, in ItemLists.__getattr__.<locals>._inner(*args, **kwargs)
478 self.valid = fv(*args, from_item_lists=True, **kwargs)
479 self.__class__ = LabelLists
--> 480 self.process()
481 return self
File ~\anaconda3\envs\adtree\lib\site-packages\fastai\data_block.py:534, in LabelLists.process(self)
532 "Process the inner datasets."
533 xp,yp = self.get_processors()
--> 534 for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)
535 #progress_bar clear the outputs so in some case warnings issued during processing disappear.
536 for ds in self.lists:
File ~\anaconda3\envs\adtree\lib\site-packages\fastai\data_block.py:714, in LabelList.process(self, xp, yp, name, max_warn_items)
712 p.warns = []
713 self.x,self.y = self.x[~filt],self.y[~filt]
--> 714 self.x.process(xp)
715 return self
File ~\anaconda3\envs\adtree\lib\site-packages\fastai\data_block.py:84, in ItemList.process(self, processor)
82 if processor is not None: self.processor = processor
83 self.processor = listify(self.processor)
---> 84 for p in self.processor: p.process(self)
85 return self
File ~\anaconda3\envs\adtree\lib\site-packages\fastai\text\data.py:297, in TokenizeProcessor.process(self, ds)
295 tokens = []
296 for i in progress_bar(range(0,len(ds),self.chunksize), leave=False):
--> 297 tokens += self.tokenizer.process_all(ds.items[i:i+self.chunksize])
298 ds.items = tokens
File ~\anaconda3\envs\adtree\lib\site-packages\fastai\text\transform.py:120, in Tokenizer.process_all(self, texts)
118 if self.n_cpus <= 1: return self._process_all_1(texts)
119 with ProcessPoolExecutor(self.n_cpus) as e:
--> 120 return sum(e.map(self._process_all_1, partition_by_cores(texts, self.n_cpus)), [])
File ~\anaconda3\envs\adtree\lib\concurrent\futures\process.py:562, in _chain_from_iterable_of_lists(iterable)
556 def _chain_from_iterable_of_lists(iterable):
557 """
558 Specialized implementation of itertools.chain.from_iterable.
559 Each item in *iterable* should be a list. This function is
560 careful not to keep references to yielded objects.
561 """
--> 562 for element in iterable:
563 element.reverse()
564 while element:
File ~\anaconda3\envs\adtree\lib\concurrent\futures\_base.py:609, in Executor.map.<locals>.result_iterator()
606 while fs:
607 # Careful not to keep a reference to the popped future
608 if timeout is None:
--> 609 yield fs.pop().result()
610 else:
611 yield fs.pop().result(end_time - time.monotonic())
File ~\anaconda3\envs\adtree\lib\concurrent\futures\_base.py:446, in Future.result(self, timeout)
444 raise CancelledError()
445 elif self._state == FINISHED:
--> 446 return self.__get_result()
447 else:
448 raise TimeoutError()
File ~\anaconda3\envs\adtree\lib\concurrent\futures\_base.py:391, in Future.__get_result(self)
389 if self._exception:
390 try:
--> 391 raise self._exception
392 finally:
393 # Break a reference cycle with the exception in self._exception
394 self = None
BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.