I have a pdf which has this type of table and I need to convert the respective table into html file using python but I am not able to calculate rowspan and colspan attributes and the generated table looks different.
I tried this code but it not able to calculate rowspan and colspan properly. can Camelot able to do this?
import pdfplumber
from bs4 import BeautifulSoup
# Function to convert list of lists to HTML table
def list_to_html_table(data):
html_table = "<table>\n"
for row in data:
html_table += " <tr>\n"
for cell in row:
html_table += cell
html_table += " </tr>\n"
html_table += "</table>"
return html_table
# PDF file path
pdf_file = 'simple_rowspan_colspan.pdf'
# Extract tables from the PDF using pdfplumber
pdf = pdfplumber.open(pdf_file)
html_tables = []
for page in pdf.pages:
page_tables = page.extract_tables()
for table_data in page_tables:
num_rows = len(table_data)
num_cols = max(len(row) for row in table_data)
cell_data = [[''] * num_cols for _ in range(num_rows)]
for i, row in enumerate(table_data):
for j, cell in enumerate(row):
if cell:
rowspan, colspan = 1, 1
# Check for rowspan
for k in range(i + 1, num_rows):
if cell_data[k][j] == '':
rowspan += 1
else:
break
# Check for colspan
for k in range(j + 1, j + colspan):
if k < num_cols:
if cell_data[i][k] == '':
colspan += 1
else:
break
cell_text = f'<td rowspan="{rowspan}" colspan="{colspan}">{cell}</td>'
cell_data[i][j] = cell_text
# Convert the data into an HTML table
html_table = list_to_html_table(cell_data)
html_tables.append(html_table)
# Combine all HTML tables into a single HTML document
output_html = "\n".join(html_tables)
# Parse the HTML to beautify it
soup = BeautifulSoup(output_html, 'html.parser')
pretty_html = soup.prettify()
# Print or save the prettified HTML output as needed
print(pretty_html)
# Save the HTML table to a file or print it as needed
with open('output_table.html', 'w', encoding='utf-8') as html_file:
html_file.write(output_html)
Camelotis be used to extract tables from PDF files, you can give a try on that too. In this codecolspan, it only checks for colspan in the next column (j + 1), but it should check for colspan in the range from j + 1 to j + colspan and also not to exceed no. of columns.