I am currently working on a project involving Google Document AI, and I need assistance with removing documents from a Processor dataset using Python. I have tried various approaches but haven't been able to find a solution.
Here's what I have done so far:
- I have set up a Processor in Google Document AI.
- I have successfully uploaded documents to the Processor dataset.
- Now, I need to remove specific documents from the dataset programmatically using Python.
- I have reviewed the Google Cloud Document AI documentation (in particular this part), but I couldn't find any clear guidance on how to achieve the deletion.
The code we used is the following:
import io
from google.cloud import documentai_v1beta3
from google.cloud.documentai_v1beta3 import DocumentId
# Location segment of the dataset resource name built in
# sample_batch_delete_documents().
PROCESSOR_LOCATION = "eu"

# Google Cloud project number, keyed by deployment environment.
PROJECT_NUMBER = {
    "DEV": "123456789",
    "PROD": "123456789",
}

# Processor ID, keyed by deployment environment.
PROCESSOR_ID_CDE = {
    "DEV": "abcdefghijk",
    "PROD": "abcdefghijk",
}

# DocumentId has no `document_id` field. A document stored in a
# Google-managed dataset is addressed through the nested
# `gcs_managed_doc_id` message, whose `gcs_uri` carries the Cloud Storage URI.
# NOTE(review): the URI must be the one the dataset itself reports for the
# document (e.g. as returned by DocumentServiceClient.list_documents), which
# may differ from the path the file was originally imported from -- confirm.
doc_id1 = DocumentId(
    gcs_managed_doc_id=DocumentId.GCSManagedDocumentId(
        gcs_uri="gs://test/raw_data/training/abc.pdf"
    )
)
doc_id2 = DocumentId(
    gcs_managed_doc_id=DocumentId.GCSManagedDocumentId(
        gcs_uri="gs://test/raw_data/training/xyz.pdf"
    )
)

# Environment key used to pick the project number and processor ID above.
ENV_DEST = "DEV"
def sample_batch_delete_documents(timeout=300):
    """Delete ``doc_id1`` and ``doc_id2`` from the processor's dataset.

    Builds a ``BatchDeleteDocumentsRequest`` for the dataset of the processor
    selected by ``ENV_DEST``, starts the long-running delete operation, waits
    for it to finish and prints both the request and the response.

    Args:
        timeout: Maximum number of seconds to wait for the long-running
            operation to complete. Defaults to 300.

    Raises:
        concurrent.futures.TimeoutError: If the operation does not complete
            within ``timeout`` seconds.
        google.api_core.exceptions.GoogleAPICallError: If the operation
            finishes with an error.
    """
    # Derive the regional endpoint from PROCESSOR_LOCATION so the endpoint and
    # the location in the dataset resource name cannot drift apart.
    opts = {"api_endpoint": f"{PROCESSOR_LOCATION}-documentai.googleapis.com"}
    client = documentai_v1beta3.DocumentServiceClient(client_options=opts)

    # Select the documents to delete by their individual IDs.
    individual_ids = documentai_v1beta3.BatchDatasetDocuments.IndividualDocumentIds(
        document_ids=[doc_id1, doc_id2]
    )
    batch_dataset_documents = documentai_v1beta3.BatchDatasetDocuments(
        individual_document_ids=individual_ids
    )

    dataset_name = (
        f"projects/{PROJECT_NUMBER[ENV_DEST]}"
        f"/locations/{PROCESSOR_LOCATION}"
        f"/processors/{PROCESSOR_ID_CDE[ENV_DEST]}/dataset"
    )

    # `timeout` is not a field of BatchDeleteDocumentsRequest, so it must not
    # be passed to the message constructor; it is applied to the wait below.
    request = documentai_v1beta3.BatchDeleteDocumentsRequest(
        dataset=dataset_name,
        dataset_documents=batch_dataset_documents,
    )
    print(request)

    # Start the long-running operation.
    operation = client.batch_delete_documents(request=request)
    print("Waiting for operation to complete...")

    # Block until the operation is done, for at most `timeout` seconds.
    # NOTE(review): a "Could not convert Any to BatchDeleteDocumentsResponse"
    # TypeError raised here comes from api_core failing to unpack the
    # operation's response payload into the expected response type; the cause
    # is not visible from this code -- check the installed
    # google-cloud-documentai version.
    response = operation.result(timeout=timeout)

    print(response)
The error message that I get is the following:
Error Message
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/var/tmp/ipykernel_13116/441287119.py in <module>
----> 1 sample_batch_delete_documents()
/var/tmp/ipykernel_13116/2850527411.py in sample_batch_delete_documents()
39 print("Waiting for operation to complete...")
40
---> 41 response =operation.result()
42
43 # Handle the responseh
/opt/conda/lib/python3.7/site-packages/google/api_core/future/polling.py in result(self, timeout, retry, polling)
254 """
255
--> 256 self._blocking_poll(timeout=timeout,retry=retry,polling=polling)
257
258 ifself._exception isnotNone:
/opt/conda/lib/python3.7/site-packages/google/api_core/future/polling.py in _blocking_poll(self, timeout, retry, polling)
135
136 try:
--> 137 polling(self._done_or_raise)(retry=retry)
138 exceptexceptions.RetryError:
139 raise concurrent.futures.TimeoutError(
/opt/conda/lib/python3.7/site-packages/google/api_core/retry.py in retry_wrapped_func(*args, **kwargs)
352 sleep_generator,
353 self._timeout,
--> 354 on_error=on_error,
355 )
356
/opt/conda/lib/python3.7/site-packages/google/api_core/retry.py in retry_target(target, predicate, sleep_generator, timeout, on_error, **kwargs)
189 forsleep insleep_generator:
190 try:
--> 191 return target()
192
193 # pylint: disable=broad-except
/opt/conda/lib/python3.7/site-packages/google/api_core/future/polling.py in _done_or_raise(self, retry)
117 def_done_or_raise(self,retry=None):
118 """Check if the future is done and raise if it's not."""
--> 119 if notself.done(retry=retry):
120 raise_OperationNotComplete()
121
/opt/conda/lib/python3.7/site-packages/google/api_core/operation.py in done(self, retry)
172 bool:Trueifthe operation iscomplete,Falseotherwise.
173 """
--> 174 self._refresh_and_update(retry)
175 returnself._operation.done
176
/opt/conda/lib/python3.7/site-packages/google/api_core/operation.py in _refresh_and_update(self, retry)
161 ifnotself._operation.done:
162 self._operation =self._refresh(retry=retry)ifretry elseself._refresh()
--> 163 self._set_result_from_operation()
164
165 defdone(self,retry=None):
/opt/conda/lib/python3.7/site-packages/google/api_core/operation.py in _set_result_from_operation(self)
133 ifself._operation.HasField("response"):
134 response = protobuf_helpers.from_any_pb(
--> 135 self._result_type,self._operation.response
136 )
137 self.set_result(response)
/opt/conda/lib/python3.7/site-packages/google/api_core/protobuf_helpers.py in from_any_pb(pb_type, any_pb)
65 raise TypeError(
66 "Could not convert {} to {}".format(
---> 67 any_pb.__class__.__name__,pb_type.__name__
68 )
69 )
TypeError: Could not convert Any to BatchDeleteDocumentsResponse
I would greatly appreciate it if someone could provide me with guidance on how to approach this. I am open to suggestions for fixing what I have already tried, as well as to any other method.