I am using the Google Vision API to extract text from a cheque image. It extracts all the data, but not in a structured form. It uses the text-hierarchy technique (page → block → paragraph → word → symbol). The main issue is that when I run the code based on the Google Vision API samples, it prints the data in a different, weird structure, whereas when I use the Vision API free trial demo on the website, it prints the data in a good structure. Secondly, the block, paragraph, and word levels should each produce different output, but my code prints the same text at all three levels (block, paragraph, and word). Can anyone help me out? Here is the code I am using:
from enum import Enum
from google.cloud import vision
from PIL import Image, ImageDraw
import os

# API key / credentials setup (omitted)


class FeatureType(Enum):
    PAGE = 1
    BLOCK = 2
    PARA = 3
    WORD = 4
    SYMBOL = 5
def draw_boxes(image, bounds, color):
    # Draw a polygon around each returned bounding box
    draw = ImageDraw.Draw(image)
    for bound in bounds:
        draw.polygon(
            [
                bound.vertices[0].x,
                bound.vertices[0].y,
                bound.vertices[1].x,
                bound.vertices[1].y,
                bound.vertices[2].x,
                bound.vertices[2].y,
                bound.vertices[3].x,
                bound.vertices[3].y,
            ],
            None,
            color,
        )
    return image
def get_document_bounds(image_file, feature):
    client = vision.ImageAnnotatorClient()
    bounds = []
    with open(image_file, "rb") as f:
        content = f.read()
    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)
    document = response.full_text_annotation
    recognized_text = ""
    # Collect specified feature bounds by enumerating all document features
    for page in document.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                paragraph_text = ""
                for word in paragraph.words:
                    word_text = ""
                    for symbol in word.symbols:
                        if feature == FeatureType.SYMBOL:
                            bounds.append(symbol.bounding_box)
                            recognized_text += symbol.text
                        if feature == FeatureType.WORD:
                            bounds.append(word.bounding_box)
                        word_text += symbol.text
                        paragraph_text += symbol.text
                    if feature == FeatureType.WORD:
                        recognized_text += word_text + " "
                if feature == FeatureType.PARA:
                    bounds.append(paragraph.bounding_box)
                    recognized_text += paragraph_text + " "
                if feature == FeatureType.BLOCK:
                    bounds.append(block.bounding_box)
                    recognized_text += paragraph_text + " "
    # The list `bounds` contains the coordinates of the bounding boxes.
    return bounds, recognized_text
def render_doc_text(filein):
    image = Image.open(filein)

    bounds, recognized_text = get_document_bounds(filein, FeatureType.BLOCK)
    draw_boxes(image, bounds, "blue")
    print("Extracted Text from BLOCKs:", recognized_text)

    bounds, recognized_text = get_document_bounds(filein, FeatureType.PARA)
    draw_boxes(image, bounds, "red")
    print("Extracted Text from PARAs:", recognized_text)

    bounds, recognized_text = get_document_bounds(filein, FeatureType.WORD)
    draw_boxes(image, bounds, "yellow")
    print("Extracted Text from WORDs:", recognized_text)

    image.show()


if __name__ == "__main__":
    input_image_path = r'image path'
    render_doc_text(input_image_path)
I am trying to get the output in a well-structured form, the way the online demo presents it.
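What I expect is one chunk of text per block, rather than the whole document text repeated at every level. Below is a rough sketch of the per-block grouping I am aiming for, reusing the same `document` object returned by `document_text_detection`. The `detected_break` handling is my assumption based on the `TextAnnotation.DetectedBreak` enum in the docs, and the field is `type_` in my version of the library (older versions may call it `type`):

from google.cloud import vision

breaks = vision.TextAnnotation.DetectedBreak.BreakType

def text_by_block(document):
    # Sketch (untested on my cheque): group recognized text per block and
    # re-insert the spaces/newlines the API reports in detected_break,
    # instead of flattening everything into one string.
    block_texts = []
    for page in document.pages:
        for block in page.blocks:
            block_text = ""
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    for symbol in word.symbols:
                        block_text += symbol.text
                        br = symbol.property.detected_break.type_  # `type` in older versions?
                        if br in (breaks.SPACE, breaks.SURE_SPACE):
                            block_text += " "
                        elif br in (breaks.EOL_SURE_SPACE, breaks.LINE_BREAK):
                            block_text += "\n"
            block_texts.append(block_text.strip())
    return block_texts

# intended usage with the response from the code above:
# document = response.full_text_annotation
# for i, chunk in enumerate(text_by_block(document)):
#     print(f"--- block {i} ---\n{chunk}")

Is something like this the right way to keep the block/paragraph structure, or does the API provide a built-in way to get the demo-style layout?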