I am trying to export HTML text content to a PDF.
Below is an example of my code
class Program
{
static void Main(string[] args)
{
// Create a new PDF document
PdfDocument document = new PdfDocument();
PdfPage page = document.AddPage();
XGraphics gfx = XGraphics.FromPdfPage(page);
double x = 40;
double y = 40;
double lineHeight = 15;
string html = " <p>This is sample text.</p><ol><li>I need to normalise</li><li>I also need to normalise this too</li></ol><p>this will be an unordered list</p><ul><li>one</li><li>two</li></ul><p><b>This is bold</b></p><p><i>This is italic</i> < /p><p><u>This is <b>underlined</b></u></p>";
var elements = ParseHtmlContent(html);
RenderElements(gfx, elements, ref x, ref y, lineHeight);
string directoryPath = @"C:\temp";
if (!System.IO.Directory.Exists(directoryPath))
{
System.IO.Directory.CreateDirectory(directoryPath);
}
string filename = System.IO.Path.Combine(directoryPath, "FormattedText.pdf");
document.Save(filename);
Console.WriteLine($"PDF saved to {filename}");
}
static List<HtmlElement> ParseHtmlContent(string html)
{
var elements = new List<HtmlElement>();
var regex = new Regex(@"<(?<tag>\w+)(?:\s*[^>]*)?>(?<content>.*?)<\/\k<tag>>", RegexOptions.Singleline);
var matches = regex.Matches(html);
foreach (Match match in matches)
{
string tag = match.Groups["tag"].Value;
string content = match.Groups["content"].Value.Trim();
if (tag == "ol" || tag == "ul")
{
var listItems = ParseListItems(content, tag);
elements.AddRange(listItems);
}
else if (tag == "p")
{
// Check for nested tags within the paragraph
var nestedMatches = regex.Matches(content);
if (nestedMatches.Count > 0)
{
// If nested tags exist, parse the nested content
var nestedElements = ParseHtmlContent(content);
elements.Add(new HtmlElement { Tag = tag, Children = nestedElements });
}
else
{
// If no nested tags, add the paragraph content directly
elements.Add(new HtmlElement { Tag = tag, Value = content });
}
}
else if (tag == "b" || tag == "i" || tag == "u")
{
// Directly add bold or italic tags and their content
elements.Add(new HtmlElement { Tag = tag, Value = content });
}
// Other tags like <li> are handled by the list parsing logic
}
return elements;
}
static List<HtmlElement> ParseListItems(string listHtml, string listType)
{
var items = new List<HtmlElement>();
var regex = new Regex(@"<li>(?<content>.*?)<\/li>", RegexOptions.Singleline);
var matches = regex.Matches(listHtml);
int counter = 1;
foreach (Match match in matches)
{
string content = match.Groups["content"].Value.Trim();
string formattedContent = listType == "ol" ? $"{counter}. {content}" : $"• {content}";
items.Add(new HtmlElement { Tag = "li", Value = formattedContent });
if (listType == "ol") counter++;
}
return items;
}
static void RenderElements(XGraphics gfx, List<HtmlElement> elements, ref double x, ref double y, double lineHeight)
{
foreach (var element in elements)
{
XFont font = element.Tag switch
{
"p" => new XFont("Verdana", 10, XFontStyle.Regular),
"b" => new XFont("Verdana", 10, XFontStyle.Bold),
"i" => new XFont("Verdana", 10, XFontStyle.Italic),
"u" => new XFont("Verdana", 10, XFontStyle.Underline),
"li" => new XFont("Verdana", 10, XFontStyle.Regular),
_ => new XFont("Verdana", 10, XFontStyle.Regular)
};
if (element.Tag == "li" && element.Value.StartsWith("•"))
{
// Adjust for bullet points
gfx.DrawString(element.Value, font, XBrushes.Black, x + 20, y);
}
else if (element.Tag == "li")
{
// Adjust for numbered list items
gfx.DrawString(element.Value, font, XBrushes.Black, x + 20, y);
}
else
{
gfx.DrawString(element.Value ?? string.Empty, font, XBrushes.Black, x, y); // Use an empty string if Value is null
}
// If the element has children, render them recursively
if (element.Children.Any())
{
RenderElements(gfx, element.Children, ref x, ref y, lineHeight);
}
y += lineHeight;
}
}
}
class HtmlElement
{
public string Tag { get; set; }
public string Value { get; set; }
public List<HtmlElement> Children { get; set; } = new List<HtmlElement>();
}
The challenge that I am facing is when I have a further nested tag like in the html object it is written like this
This is <b>underlined</b>
My question here is how do I the word(s) \ phrase to be bold in the middle of the line as it would have been in the browser