List items missing when converting HTML to ReportLab

40 Views Asked by At

I am trying to convert some HTML into reportlab syntax so that I can generate a PDF. I have the following code which handles the conversion, but I when giving it nested lists, the item right before the nested list starts gets ommited in the final PDF. I have been trying for hours to fix this and I can't seem to understand why it happens.

import xml.sax as sax
from reportlab.platypus import Paragraph, PageBreak, Table, Preformatted, Image, Spacer
from reportlab.lib.pagesizes import A4
from bs4 import BeautifulSoup
import markdown2
import requests
from io import BytesIO
from PIL import Image as PILImage
import copy

def getColumnWidths(pageSize, pageMargin, columnCount):
    """Calculate the width of each column based on the page margin and number of columns"""
    available_width = pageSize[0] - (pageMargin + pageMargin)
    column_widths = [available_width / columnCount] * columnCount
    return column_widths

def html_to_rl(html, styleSheet, tableStyle, requestHeaders):
    converter = markdown2.Markdown(extras=["tables", "fenced-code-blocks"])
    html = BeautifulSoup(converter.convert(html), "html.parser").prettify()
    print(html)
    elements = list()

    class Handler(sax.ContentHandler):
        mode = ""
        buffer = ""
        listcounter = 0
        nesting_level = 0
        listtype = ""
        in_link = False
        elements = []
        list_stack = []
        in_pre = False  # Flag to indicate if inside a <pre> block (used for code blocks)
        in_code = False
        link_href = ""
        current_table_data = []  # Holds data for the current table being processed
        in_table = False  # Flag to indicate if inside a <table>

        def download_image(self, url, headers):
            """Download an image from the given URL and return the image data."""
            response = requests.get(url, headers=headers, stream=True)
            if response.status_code == 200:
                image_data = BytesIO(response.content)
                return image_data
            else:
                raise Exception("Failed to download image")
        
        def calculate_max_dimensions(self, page_size=A4, left_margin=40, right_margin=40, top_margin=80, bottom_margin=80):
            """Calculate the maximum dimensions available for the image."""
            max_width = page_size[0] - (left_margin + right_margin)
            max_height = page_size[1] - (top_margin + bottom_margin)
            return max_width, max_height

        def adjust_image_size(self, imageData, max_width, max_height):
            """
            Adjust the image size to fit within max_width and max_height while maintaining aspect ratio.
            This will add overhead but without this ReportLab will complain about image being too big to fit within the constraints
            """
            with PILImage.open(imageData) as img:
                original_width, original_height = img.size
                ratio = min(max_width / original_width, max_height / original_height)
                return int(original_width * ratio), int(original_height * ratio)

        def startElement(self, name, attrs):
            if name in ["strong", "em", "i", "b"]:
                self.mode = name
            elif name == "a":
                self.in_link = True
                self.link_href = attrs.get('href', '')
                self.buffer += f'<link href="{self.link_href}">'
            elif name in ["ol", "ul"]:
                # New list context
                list_type = name
                list_counter = 0  # Initialize counter for new list
                self.list_stack.append({"type": list_type, "counter": list_counter})
        
            elif name == "li":
                # Handle list item based on current list context
                print(self.buffer)
                list_context = self.list_stack[-1] if self.list_stack else None
                if list_context:
                    list_type = list_context["type"]
                    list_counter = list_context["counter"] + 1  # Increment counter
                    self.buffer = f"{list_counter}. " if list_type == "ol" else "• "  # Set prefix
                    print(self.buffer)
                    list_context["counter"] = list_counter  # Update counter in stack
            elif name == "hr":
                elements.append(PageBreak())
            elif name == "pre":
                self.in_pre = True
                self.buffer = "" # Clear le buffer
            elif name == "img":
                src = attrs.get('src', '')  # Get image URL
                imageData = self.download_image(src, requestHeaders)
                imageWidth, imageHeight = self.adjust_image_size(imageData, *self.calculate_max_dimensions())
                elements.append(Image(imageData, width=imageWidth, height=imageHeight))
            if name == "table":
                self.in_table = True
                self.current_table_data = []
            elif name == "tr" and self.in_table:
                self.current_table_data.append([])
            elif name in ["td", "th"] and self.in_table:
                self.buffer = ""
            if name == "code" and not self.in_pre:
                self.in_code = True
                self.buffer += "<font name='Courier' size='11' color='#e1665d'>"  # Add opening <code> tag to buffer

        def endElement(self, name):
            if name == "a":
                self.buffer += '</link>'
                self.in_link = False
                self.link_href = ""
            elif name.startswith("h") and name[-1] in ["1", "2", "3", "4", "5", "6"]:
                elements.append(Paragraph(self.buffer, styleSheet["Heading%s" % name[-1]]))
                self.buffer = ""
            elif name in ["strong", "em", "i", "b"]:
                self.mode = ""
            elif name == "p":
                elements.append(Paragraph(self.buffer, styleSheet["BodyText"]))
                self.buffer = ""
            elif name in ["ol", "ul"]:
                # End of list context
                self.list_stack.pop()
        
            elif name == "li":
                # Process list item content
                if len(self.list_stack) > 0:  # Check if nested
                    list_style = copy.deepcopy(styleSheet["BodyText"])
                    indentation = 24 * (len(self.list_stack) - 1)
                    list_style.leftIndent = indentation + 24
                    if self.buffer.strip() == "":
                        self.buffer = ""  # Remove empty text
                    elements.append(Paragraph(self.buffer, list_style))
                    self.buffer = ""
                else:  # Outermost list item
                    list_style = copy.deepcopy(styleSheet["BodyText"])
                    elements.append(Paragraph(self.buffer, list_style))
                    self.buffer = ""
            elif name == "pre":
                self.in_pre = False
                elements.append(Spacer(1, 10))
                elements.append(Preformatted(self.buffer, styleSheet["Code"], maxLineLength=80, newLineChars=''))
                elements.append(Spacer(1, 10))
                self.buffer = ""

            if name == "code" and not self.in_pre:
                self.buffer += "</font>"  # End inline code styling
                self.in_code = False
            
            if name in ["td", "th"] and self.in_table:
                self.current_table_data[-1].append(self.buffer)
                self.buffer = ""  # Reset buffer after capturing cell content

            # Handle table end, create and append ReportLab Table element
            if name == "table" and self.in_table:
                # Convert non-header cell contents to Paragraphs for better formatting
                print(self.current_table_data)
                table_data_formatted = [
                    [cell if idx == 0 else Paragraph(cell, styleSheet["Normal"]) for cell in row]
                    for idx, row in enumerate(self.current_table_data)
                ]
                # Assume table column widths are evenly distributed; adjust as needed
                table_column_widths = getColumnWidths(A4, 40, len(self.current_table_data[0]))
                reportlab_table = Table(table_data_formatted, colWidths=table_column_widths)
                reportlab_table.setStyle(tableStyle)
                elements.append(reportlab_table)
                elements.append(Spacer(1, 10))
                self.in_table = False  # Reset table handling flags


        def characters(self, chars):            
            if self.in_code:
                chars = chars.replace("&", "&amp;")   # Must be done first!
                chars = chars.replace("<", "&lt;")
                chars = chars.replace(">", "&gt;")
                chars = chars.replace('"', "&quot;")
                chars = chars.replace("'", "&apos;")
                chars = chars.rstrip()
            if self.in_table:
                chars = chars.strip() # Remove newlines / trailing spaces from table cells, list items
            if self.mode in ["strong", "em", "i", "b"]:
                chars = f"<{self.mode}>{chars}</{self.mode}>"

            self.buffer += chars

    # Parse the HTML
    sax.parseString(u"<doc>%s</doc>" % html, Handler())

    return elements

This a snippet of my HTML source where I have an ordered list:

<ol>
 <li>
 Step 1
 </li>
 <li>
 Step 2
 <ol>
  <li>
  Multi level step 1
  </li>
  <li>
  Multi level step 2
  </li>
 </ol>
 </li>
 <li>
 Step 3
 </li>
 <li>
 Step 4
 <ol>
  <li>
  Multi level step 1
  </li>
 </ol>
 </li>
 <li>
 Finish
 </li>
</ol>

In the output, I am missing steps 2 and 4, as seen below:

1. Step 1
    1. Multi level step 1
    2. Multi level step 2
3. Step 3
    1. Multi level step 1
5. Finish

enter image description here

0

There are 0 best solutions below