How can I fix a "TypeError: 'BatchEncoding' object is not an iterator" error?

I have exported this Hugging Face model to ONNX: https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english. I then came across a blog post where the poster overrides the transformers pipeline to accept a .onnx model: https://towardsdatascience.com/nlp-transformers-pipelines-with-onnx-9b890d015723
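For context, I did the export with the transformers ONNX export CLI; from memory it was something like this (I then renamed the resulting model.onnx to mynewModel.onnx):

python -m transformers.onnx --model=dbmdz/bert-large-cased-finetuned-conll03-english --feature=token-classification onnx/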

After running the code I am getting

TypeError: 'BatchEncoding' object is not an iterator

and I am not finding much documentation for this error.

This is the full file:

import torch

from onnxruntime import (
    InferenceSession, SessionOptions, GraphOptimizationLevel
)
from transformers import (
    TokenClassificationPipeline, AutoTokenizer, AutoModelForTokenClassification
)
options = SessionOptions() # initialize session options
options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

session = InferenceSession(
    "mynewModel.onnx", sess_options=options, providers=["CPUExecutionProvider"]
)

# disable the session.run() fallback mechanism; it prevents a reset of the execution provider
session.disable_fallback() 

class OnnxTokenClassificationPipeline(TokenClassificationPipeline):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    
    def _forward(self, model_inputs):
        """
        Forward pass through the model. This method is not to be called by the user directly and is only used
        by the pipeline to perform the actual predictions.

        This is where we will define the actual process to do inference with the ONNX model and the session created
        before.
        """
        # This comes from the original implementation of the pipeline
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")

        inputs = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()} # dict of numpy arrays
        outputs_name = session.get_outputs()[0].name # get the name of the output tensor

        logits = session.run(output_names=[outputs_name], input_feed=inputs)[0] # run the session
        logits = torch.tensor(logits) # convert to torch tensor to be compatible with the original implementation

        return {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            **model_inputs,
        }

    # We need to override the preprocess method because the ONNX model expects the attention mask
    # as an input along with the input IDs.
    def preprocess(self, sentence, offset_mapping=None):
        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
        model_inputs = self.tokenizer(
            sentence,
            return_attention_mask=True, # This is the only difference from the original implementation
            return_tensors=self.framework,
            truncation=truncation,
            return_special_tokens_mask=True,
            return_offsets_mapping=self.tokenizer.is_fast,
        )
        if offset_mapping:
            model_inputs["offset_mapping"] = offset_mapping

        model_inputs["sentence"] = sentence

        return model_inputs
      
model_name_from_hub = "dbmdz/bert-large-cased-finetuned-conll03-english"

tokenizer = AutoTokenizer.from_pretrained(model_name_from_hub)
model = AutoModelForTokenClassification.from_pretrained(model_name_from_hub)

onnx_pipeline = OnnxTokenClassificationPipeline(
    task="ner", 
    model=model,
    tokenizer=tokenizer,
    framework="pt",
    aggregation_strategy="simple",
)

mySequence = "JJ is going to go the Mall with his pals later."
onnx_pipeline(mySequence)

Running the model before export on the same input sentence ("JJ is going to go the Mall with his pals later.") gives output in this format: [{'entity_group': 'PER', 'score': 0.7977358, 'word': 'JJ', 'start': 0, 'end': 2}, {'entity_group': 'LOC', 'score': 0.9910767, 'word': 'Mall', 'start': 22, 'end': 26}].
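That reference output came from the stock pipeline, roughly:

from transformers import pipeline

ner = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)
print(ner("JJ is going to go the Mall with his pals later."))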

I am trying to get the same output from my exported .onnx model.

EDIT: Stacktrace:

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Traceback (most recent call last):
  File "c:\Users\L7400\Desktop\MyONNXtest\myprojectname\src\main\java\mygroupid\NewWorkspace\onnxPipeline.py", line 85, in <module>
    onnx_pipeline(mySequence)
  File "C:\Users\L7400\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\pipelines\token_classification.py", line 249, in __call__
    return super().__call__(inputs, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\L7400\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\pipelines\base.py", line 1154, in __call__
    return next(
           ^^^^^
  File "C:\Users\L7400\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\pipelines\pt_utils.py", line 124, in __next__
    item = next(self.iterator)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\L7400\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\pipelines\pt_utils.py", line 266, in __next__
    processed = self.infer(next(self.iterator), **self.params)
                           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\L7400\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\utils\data\dataloader.py", line 630, in __next__
    data = self._next_data()
           ^^^^^^^^^^^^^^^^^
  File "C:\Users\L7400\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\utils\data\dataloader.py", line 674, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\L7400\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\utils\data\_utils\fetch.py", line 32, in fetch
    data.append(next(self.dataset_iter))
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\L7400\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\pipelines\pt_utils.py", line 183, in __next__
    processed = next(self.subiterator)
                ^^^^^^^^^^^^^^^^^^^^^^
TypeError: 'BatchEncoding' object is not an iterator
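From the trace, transformers\pipelines\pt_utils.py calls next() on whatever my preprocess returns, so my guess is that the transformers version I am on expects preprocess to be a generator (TokenClassificationPipeline seems to be a ChunkPipeline now) rather than to return the BatchEncoding directly. An untested sketch of what I think it wants, including the "is_last" flag the chunk iterator appears to rely on:

    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
        model_inputs = self.tokenizer(
            sentence,
            return_attention_mask=True,
            return_tensors=self.framework,
            truncation=truncation,
            return_special_tokens_mask=True,
            return_offsets_mapping=self.tokenizer.is_fast,
        )
        if offset_mapping:
            model_inputs["offset_mapping"] = offset_mapping
        model_inputs["sentence"] = sentence
        model_inputs["is_last"] = True  # guess: single chunk, so mark it as the last one
        yield model_inputs  # yield instead of return, so the pipeline can iterate over it

_forward would presumably also need to pop the flag and pass it through, i.e. is_last = model_inputs.pop("is_last") at the top and "is_last": is_last in the returned dict. Is this the right fix, or is there a supported way to make this pipeline run against an ONNX session?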