Removing Documents from a Google Document AI Processor Dataset in Python

145 Views Asked by At

I am currently working on a project involving Google Document AI, and I need assistance with removing documents from a Processor dataset using Python. I have tried various approaches but haven't been able to find a solution.

Here's what I have done so far:

  • I have set up a Processor in Google Document AI.
  • I have successfully uploaded documents to the Processor dataset.
  • Now, I need to remove specific documents from the dataset programmatically using Python.
  • I have reviewed the Google Cloud Document AI documentation (in particular this part), but I couldn't find any clear guidance on how to achieve the deletion.

The code we used is the following:

import io

from google.cloud import documentai_v1beta3
from google.cloud.documentai_v1beta3 import DocumentId

# Location of the Document AI processor; used in the dataset resource name.
PROCESSOR_LOCATION = "eu"

# Google Cloud project number, keyed by deployment environment.
PROJECT_NUMBER = {
    "DEV": "123456789",
    "PROD": "123456789",
}

# Document AI processor ID, keyed by deployment environment.
PROCESSOR_ID_CDE = {
    "DEV": "abcdefghijk",
    "PROD": "abcdefghijk",
}

# Create DocumentId objects and set the gcs_managed_doc_id attribute.
# DocumentId declares no `document_id` field: a document is identified either
# through `gcs_managed_doc_id` (a GCS URI) or through `unmanaged_doc_id`.
# NOTE(review): for a Google-managed dataset the gcs_uri is presumably the URI
# the dataset itself reports for the document, not necessarily the original
# import path -- confirm against the IDs returned when listing the dataset.
doc_id1 = DocumentId(
    gcs_managed_doc_id=DocumentId.GCSManagedDocumentId(
        gcs_uri="gs://test/raw_data/training/abc.pdf",
    ),
)
doc_id2 = DocumentId(
    gcs_managed_doc_id=DocumentId.GCSManagedDocumentId(
        gcs_uri="gs://test/raw_data/training/xyz.pdf",
    ),
)

# Environment key used to index PROJECT_NUMBER and PROCESSOR_ID_CDE.
ENV_DEST = "DEV"


def sample_batch_delete_documents():
    """Delete ``doc_id1`` and ``doc_id2`` from the processor dataset.

    Builds a ``BatchDeleteDocumentsRequest`` for the dataset of the processor
    selected by ``ENV_DEST``, prints the request, starts the long-running
    delete operation, waits up to 300 seconds for it to finish and prints the
    response.

    Raises:
        concurrent.futures.TimeoutError: If the operation does not complete
            within the wait timeout.
    """
    # Seconds to wait for the long-running operation to complete.
    wait_timeout_s = 300

    # Create a client bound to the regional endpoint of the processor.
    opts = {"api_endpoint": f"{PROCESSOR_LOCATION}-documentai.googleapis.com"}
    client = documentai_v1beta3.DocumentServiceClient(client_options=opts)

    # Initialize request argument(s): select the documents by explicit ID.
    individual_ids = documentai_v1beta3.BatchDatasetDocuments.IndividualDocumentIds(
        document_ids=[doc_id1, doc_id2],
    )
    batch_dataset_documents = documentai_v1beta3.BatchDatasetDocuments(
        individual_document_ids=individual_ids,
    )
    dataset_name = (
        f"projects/{PROJECT_NUMBER[ENV_DEST]}"
        f"/locations/{PROCESSOR_LOCATION}"
        f"/processors/{PROCESSOR_ID_CDE[ENV_DEST]}/dataset"
    )
    # `timeout` is not a field of BatchDeleteDocumentsRequest, so it must not
    # be passed to the message constructor; it is applied to the wait below.
    request = documentai_v1beta3.BatchDeleteDocumentsRequest(
        dataset=dataset_name,
        dataset_documents=batch_dataset_documents,
    )
    print(request)

    # Make the request; the call returns a long-running operation handle.
    operation = client.batch_delete_documents(request=request)
    print("Waiting for operation to complete...")
    # NOTE(review): a TypeError ("Could not convert Any to
    # BatchDeleteDocumentsResponse") raised here comes from unpacking the
    # response of an operation that already reports a response, i.e. the
    # failure is in client-side decoding. Presumably a client library /
    # service version mismatch -- confirm, e.g. by upgrading
    # google-cloud-documentai.
    response = operation.result(timeout=wait_timeout_s)

    # Handle the response
    print(response)

The error message that I get is the following:

Error Message
 
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/var/tmp/ipykernel_13116/441287119.py in <module>
----> 1 sample_batch_delete_documents()
 
/var/tmp/ipykernel_13116/2850527411.py in sample_batch_delete_documents()
    39     print("Waiting for operation to complete...")
    40
---> 41     response =operation.result()
    42 
    43     # Handle the responseh
 
/opt/conda/lib/python3.7/site-packages/google/api_core/future/polling.py in result(self, timeout, retry, polling)
   254         """
   255
--> 256         self._blocking_poll(timeout=timeout,retry=retry,polling=polling)
   257
   258         ifself._exception isnotNone:
 
/opt/conda/lib/python3.7/site-packages/google/api_core/future/polling.py in _blocking_poll(self, timeout, retry, polling)
   135
   136         try:
--> 137             polling(self._done_or_raise)(retry=retry)
   138         exceptexceptions.RetryError:
   139             raise concurrent.futures.TimeoutError(
 
/opt/conda/lib/python3.7/site-packages/google/api_core/retry.py in retry_wrapped_func(*args, **kwargs)
   352                 sleep_generator,
   353                 self._timeout,
--> 354                 on_error=on_error,
   355             )
   356
 
/opt/conda/lib/python3.7/site-packages/google/api_core/retry.py in retry_target(target, predicate, sleep_generator, timeout, on_error, **kwargs)
   189     forsleep insleep_generator:
   190         try:
--> 191             return target()
   192
   193         # pylint: disable=broad-except
 
/opt/conda/lib/python3.7/site-packages/google/api_core/future/polling.py in _done_or_raise(self, retry)
   117     def_done_or_raise(self,retry=None):
   118         """Check if the future is done and raise if it's not."""
--> 119         if notself.done(retry=retry):
   120             raise_OperationNotComplete()
   121
 
/opt/conda/lib/python3.7/site-packages/google/api_core/operation.py in done(self, retry)
   172             bool:Trueifthe operation iscomplete,Falseotherwise.
   173         """
--> 174         self._refresh_and_update(retry)
   175         returnself._operation.done
   176
 
/opt/conda/lib/python3.7/site-packages/google/api_core/operation.py in _refresh_and_update(self, retry)
   161         ifnotself._operation.done:
   162             self._operation =self._refresh(retry=retry)ifretry elseself._refresh()
--> 163             self._set_result_from_operation()
   164
   165     defdone(self,retry=None):
 
/opt/conda/lib/python3.7/site-packages/google/api_core/operation.py in _set_result_from_operation(self)
   133             ifself._operation.HasField("response"):
   134                 response = protobuf_helpers.from_any_pb(
--> 135                     self._result_type,self._operation.response
   136                 )
   137                 self.set_result(response)
 
/opt/conda/lib/python3.7/site-packages/google/api_core/protobuf_helpers.py in from_any_pb(pb_type, any_pb)
    65         raise TypeError(
    66             "Could not convert {} to {}".format(
---> 67                 any_pb.__class__.__name__,pb_type.__name__
    68             )
    69         )
 
TypeError: Could not convert Any to BatchDeleteDocumentsResponse

I would greatly appreciate it if someone could provide guidance on how to approach this. I am open to suggestions for fixing what I have already tried, as well as to any other method.

0

There are 0 best solutions below