import os
import re
from tika import parser
folder = ".../test/input"

# One table row, anchored at the start of a line (re.MULTILINE):
#   <row number> <identifier> <"N x N x LETTERS" spec> [<quantity>] <number> <number>
# Some PDFs carry an extra integer "Quantity" column between the spec and the
# two trailing numeric columns.  The optional NON-capturing group consumes it
# when present, so every match yields the same 5 fields for both layouts.
# A decimal such as "2.640" can never be mistaken for the quantity, because
# the group requires whitespace immediately after the digits.
# NOTE(review): if the first trailing number can be a bare integer in the
# layout WITHOUT Quantity, the two layouts become ambiguous - confirm that
# those columns always contain a decimal point.
pattern = (
    r'^(\d+)\s+(\S+)\s+([0-9.]+ x [0-9]+ x [A-Z]+)\s+'
    r'(?:\d+\s+)?'
    r'([0-9.]+)\s+([0-9.]+)'
)
_ROW_RE = re.compile(pattern, re.MULTILINE)


def extract_rows(text):
    """Return every table row found in *text* as a 5-tuple of strings.

    The optional Quantity column is skipped, so rows from both table layouts
    come back in the same shape.  ``None`` or empty text yields ``[]``.
    """
    if not text:
        return []
    return _ROW_RE.findall(text)


def collect_rows(folder_path):
    """Parse every regular file in *folder_path* with tika and gather its rows.

    Files for which tika yields no text are reported on stdout and skipped.
    """
    rows = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        # os.listdir also returns sub-directories, which cannot be parsed.
        if not os.path.isfile(file_path):
            continue
        parsed_pdf = parser.from_file(file_path)
        # The 'content' key may be present but None when no text could be
        # extracted, so test the value rather than the key's presence.
        text = parsed_pdf.get('content')
        if not text:
            print(f"Text extraction failed for file: {file_path}")
            continue
        rows.extend(extract_rows(text))
    return rows


if __name__ == "__main__":
    data = collect_rows(folder)  # List of extracted rows
    # Print the extracted data
    for row in data:
        print(row)
I want to extract some data from a table in my PDF files, but some tables have an extra 'Quantity' column. How do I handle both cases?
First type of data (no Quantity column):
('57', '231228B23', '0.21 x 914 x C', '2.640', '2.680')
('58', '231228B24', '0.21 x 914 x C', '2.682', '2.722')
('59', '231228B25', '0.21 x 914 x C', '2.710', '2.750')
('60', '231228B26', '0.21 x 914 x C', '2.714', '2.754')
('61', '231228B27', '0.21 x 914 x C', '2.636', '2.676')
('62', '231228B28', '0.21 x 914 x C', '2.628', '2.668')
('63', '231228B29', '0.21 x 914 x C', '2.628', '2.668')
('64', '231228A37', '0.21 x 914 x C', '2.684', '2.724')
('65', '231228A38', '0.21 x 914 x C', '2.718', '2.758')
('66', '231228A39', '0.21 x 914 x C', '2.646', '2.686')
('67', '231228A40', '0.21 x 914 x C', '2.652', '2.692')
Second type of data (with the extra Quantity column in the fourth position):
('7', '231228B25', '0.21 x 914 x C', '1', '2.710', '2.750')
('8', '231228B26', '0.21 x 914 x C', '1', '2.714', '2.754')
('9', '231228B27', '0.21 x 914 x C', '1', '2.636', '2.676')
('10', '231228B28', '0.21 x 914 x C', '1', '2.628', '2.668')
('11', '231228B29', '0.21 x 914 x C', '1', '2.628', '2.668')
('12', '231228A37', '0.21 x 914 x C', '1', '2.684', '2.724')
I do not need the Quantity column.