to classify the extracted text from images into spam or non spam using python

27 Views Asked by At

<**

extract text from all the images in a folder

storing the text in a single file

from PIL import Image import pytesseract as pt import pandas as pd from tabulate import tabulate from io import StringIO import os import json import csv

def main(): # path for the folder for getting the raw images path ="E:/mehr mtech p1/images/"

# link to the file in which output needs to be kept
fullTempPath ="E:/mehr mtech p1/out.txt"

# iterating the images inside the folder
for imageName in os.listdir(path):
    inputPath = os.path.join(path, imageName)
    img = Image.open(inputPath)
    #print(imageName)
    
    # applying ocr using pytesseract for python
    pt.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'
    text = pt.image_to_string(img, lang ="eng")
    #print(text)
    
    dictionary = {'image': imageName, 'Text': text}
    print(dictionary)
    
    #Create a datafrmae from the dictionary 
    df = pd.DataFrame(dictionary, index=[0])
    #print dataframe.
    #print(df)
    #print(tabulate(df, headers = 'keys', tablefmt = 'psql'))

    #Creating a string of the dictionary to print the data with labels in string format in the txt file
    #string = json.dumps(dictionary)
    #f1 = open("E:/mehr mtech p1/mmyfile.txt","a+")
    #f1.write(string)
    
    #df = pd.read_csv(string, sep =";")
    #print(df)

    df.to_csv("E:/mehr mtech p1/tableimage.csv")
    
    # saving the  text for appending it to the output.txt file
    # a + parameter used for creating the file if not present
    # and if present then append the text content
    file1 = open(fullTempPath, "a+")

    # providing the name of the image
    file1.write(imageName+"\n")

    # providing the content in the image
    file1.write(text+"\n")
    file1.close() 

# for printing the output file
file2 = open(fullTempPath, 'r')
print(file2.read())

file2.close()     

if name == 'main': main()

**>

the extracted text was converted into a dataframe by first converting it into a dictionary. but while converting that dataframe to a csv file and transferring data to excel file..only 1record i.e., only text of 1 image is coming in csv file..what to do now

the dataframe is coming in this way

the dictionary is coming this way

csv file

error

0

There are 0 best solutions below