PDF Sharp - HTML to Formatted PDF

43 Views Asked by At

I am trying to export HTML text content to a PDF.

Below is an example of my code

class Program
{
    static void Main(string[] args)
    {
        // Create a new PDF document
        PdfDocument document = new PdfDocument();
        PdfPage page = document.AddPage();
        XGraphics gfx = XGraphics.FromPdfPage(page);

        double x = 40;
        double y = 40;
        double lineHeight = 15;

        string html =  "   <p>This is sample text.</p><ol><li>I need to normalise</li><li>I also need to normalise this too</li></ol><p>this will be an unordered list</p><ul><li>one</li><li>two</li></ul><p><b>This is bold</b></p><p><i>This is italic</i>  <  /p><p><u>This is <b>underlined</b></u></p>";

        var elements = ParseHtmlContent(html);
        
         RenderElements(gfx, elements, ref x, ref y, lineHeight);

        string directoryPath = @"C:\temp";
        if (!System.IO.Directory.Exists(directoryPath))
        {
            System.IO.Directory.CreateDirectory(directoryPath);
        }

        string filename = System.IO.Path.Combine(directoryPath, "FormattedText.pdf");
        document.Save(filename);
        Console.WriteLine($"PDF saved to {filename}");
    }

    static List<HtmlElement> ParseHtmlContent(string html)
    {
        var elements = new List<HtmlElement>();
        var regex = new Regex(@"<(?<tag>\w+)(?:\s*[^>]*)?>(?<content>.*?)<\/\k<tag>>", RegexOptions.Singleline);

        var matches = regex.Matches(html);

        foreach (Match match in matches)
        {
            string tag = match.Groups["tag"].Value;
            string content = match.Groups["content"].Value.Trim();

            if (tag == "ol" || tag == "ul")
            {
                var listItems = ParseListItems(content, tag);
                elements.AddRange(listItems);
            }
            else if (tag == "p")
            {
                // Check for nested tags within the paragraph
                var nestedMatches = regex.Matches(content);
                if (nestedMatches.Count > 0)
                {
                    // If nested tags exist, parse the nested content
                    var nestedElements = ParseHtmlContent(content);
                    elements.Add(new HtmlElement { Tag = tag, Children = nestedElements });
                }
                else
                {
                    // If no nested tags, add the paragraph content directly
                    elements.Add(new HtmlElement { Tag = tag, Value = content });
                }
            }
            else if (tag == "b" || tag == "i" || tag == "u")
            {
                // Directly add bold or italic tags and their content
                elements.Add(new HtmlElement { Tag = tag, Value = content });
            }
            // Other tags like <li> are handled by the list parsing logic
        }

        return elements;
    }


    static List<HtmlElement> ParseListItems(string listHtml, string listType)
    {
        var items = new List<HtmlElement>();
        var regex = new Regex(@"<li>(?<content>.*?)<\/li>", RegexOptions.Singleline);
        var matches = regex.Matches(listHtml);
        int counter = 1;

        foreach (Match match in matches)
        {
            string content = match.Groups["content"].Value.Trim();
            string formattedContent = listType == "ol" ? $"{counter}. {content}" : $"• {content}";

            items.Add(new HtmlElement { Tag = "li", Value = formattedContent });

            if (listType == "ol") counter++;
        }

        return items;
    }

    static void RenderElements(XGraphics gfx, List<HtmlElement> elements, ref double x, ref double y, double lineHeight)
    {
        foreach (var element in elements)
        {
            XFont font = element.Tag switch
            {
                "p" => new XFont("Verdana", 10, XFontStyle.Regular),
                "b" => new XFont("Verdana", 10, XFontStyle.Bold),
                "i" => new XFont("Verdana", 10, XFontStyle.Italic),
                "u" => new XFont("Verdana", 10, XFontStyle.Underline),
                "li" => new XFont("Verdana", 10, XFontStyle.Regular),
                _ => new XFont("Verdana", 10, XFontStyle.Regular)
            };

            if (element.Tag == "li" && element.Value.StartsWith("•"))
            {
                // Adjust for bullet points
                gfx.DrawString(element.Value, font, XBrushes.Black, x + 20, y);
            }
            else if (element.Tag == "li")
            {
                // Adjust for numbered list items
                gfx.DrawString(element.Value, font, XBrushes.Black, x + 20, y);
            }
            else
            {
                gfx.DrawString(element.Value ?? string.Empty, font, XBrushes.Black, x, y); // Use an empty string if Value is null
            }

            // If the element has children, render them recursively
            if (element.Children.Any())
            {
                RenderElements(gfx, element.Children, ref x, ref y, lineHeight);
            }

            y += lineHeight;
        }
    }
}


class HtmlElement
{
    public string Tag { get; set; }
    public string Value { get; set; }
    public List<HtmlElement> Children { get; set; } = new List<HtmlElement>();
}

The challenge that I am facing is when I have a further nested tag like in the html object it is written like this

This is <b>underlined</b>

My question here is how do I the word(s) \ phrase to be bold in the middle of the line as it would have been in the browser

0

There are 0 best solutions below