Parse custom HTML list tags in C# or Java

1.2k Views Asked by At

I have some text like this:

This is a simple line
[olist]
    [#]This is line 1
    [#]This is line 2
        [olist]
            [#]This is line 2.1
            [#]This is line 2.2
            [#]This is line 2.3
    and it continues here
        [/olist]
    [#]This is line 3
[/olist]
Another line

How can I parse it in C# into HTML like the following?

This is a simple line
<ol>
    <li>This is line 1</li>
    <li>This is line 2
        <ol>
            <li>This is line 2.1</li>
            <li>This is line 2.2</li>
            <li>This is line 2.3
    and it continues here</li>
        </ol>
    </li>
    <li>This is line 3</li>
</ol>
Another line

I am currently splitting the text and concatenating the pieces, but nested sub-lists are not handled properly.

UPDATE: - Sample Code

This is what I am currently doing.

// Convert every [olist]…[/olist] section of the custom markup into an HTML <ol> list.
string html = ReplaceList(customHtml, "olist", "ol");

/// <summary>
/// Converts every <c>[key]…[/key]</c> list in <paramref name="text"/> into an HTML
/// <c>&lt;tag&gt;…&lt;/tag&gt;</c> list, wrapping each <c>[#]</c>-prefixed entry in
/// <c>&lt;li&gt;…&lt;/li&gt;</c>. Nested lists are converted innermost-first, so a
/// sub-list ends up inside the <c>&lt;li&gt;</c> of the item that contains it.
/// </summary>
/// <param name="text">The custom markup to convert; may be null or empty.</param>
/// <param name="key">The custom list tag name without brackets, e.g. <c>olist</c>.</param>
/// <param name="tag">The HTML element name to emit, e.g. <c>ol</c>.</param>
/// <returns>
/// The converted text. Text outside lists is left untouched, and an opening tag
/// without a matching closing tag is left as-is.
/// </returns>
private static string ReplaceList(string text, string key, string tag)
{
    if (string.IsNullOrEmpty(text))
        return text;

    var openTag = "[" + key + "]";
    var closeTag = "[/" + key + "]";
    var itemMarker = new[] { "[#]" };

    while (true)
    {
        var firstOpen = text.IndexOf(openTag, StringComparison.Ordinal);
        if (firstOpen < 0)
            break;

        // The first closing tag after an opening tag always ends the innermost list.
        var close = text.IndexOf(closeTag, firstOpen + openTag.Length, StringComparison.Ordinal);
        if (close < 0)
            break;

        // Walk back to the nearest opening tag before that closing tag: the span between
        // them contains no other list tags, so it can be converted safely. Pairing the
        // FIRST opening tag with the first closing tag would cut a nested list in half.
        var open = text.LastIndexOf(openTag, close, StringComparison.Ordinal);
        var bodyStart = open + openTag.Length;
        var body = text.Substring(bodyStart, close - bodyStart);

        var items = new StringBuilder();
        foreach (var entry in body.Split(itemMarker, StringSplitOptions.RemoveEmptyEntries))
        {
            // Whitespace-only pieces are just the indentation between tags and markers.
            if (!string.IsNullOrWhiteSpace(entry))
                items.Append("<li>").Append(entry.Trim()).Append("</li>");
        }

        // An already converted sub-list no longer contains "[#]", so on a later pass it
        // stays attached to the parent item it followed.
        text = text.Substring(0, open)
               + "<" + tag + ">" + items + "</" + tag + ">"
               + text.Substring(close + closeTag.Length);
    }

    return text;
}

/// <summary>
/// Returns the text between the first <c>[key]</c> tag and the first <c>[/key]</c>
/// tag that follows it, or <c>null</c> when no such pair exists.
/// </summary>
/// <param name="text">The text to search.</param>
/// <param name="key">The tag name without brackets.</param>
private static string GetListEntry(string text, string key)
{
    var opening = "[" + key + "]";
    var closing = "[/" + key + "]";

    var openAt = text.IndexOf(opening, StringComparison.Ordinal);
    if (openAt < 0)
        return null;

    // Only closing tags located after the opening tag are considered.
    var closeAt = text.IndexOf(closing, openAt, StringComparison.Ordinal);
    if (closeAt <= openAt)
        return null;

    var bodyStart = openAt + opening.Length;
    return text.Substring(bodyStart, closeAt - bodyStart);
}

Note that some list items span multiple lines and may also include line breaks.

2

There are 2 best solutions below

1
On

You have to parse the text into an abstract tree first, and then compose the result from that tree. For example:

/// <summary>
/// A node of the parsed list tree.
/// </summary>
public interface IElement
{
  /// <summary>Gets the node this node is attached to.</summary>
  IElement Parent { get; }

  /// <summary>Attaches <paramref name="element"/> to this node as a child.</summary>
  void AddElement(IElement element);
}

/// <summary>
/// An ordered list node; its children are <see cref="LiElement"/> items and it
/// renders as an <c>&lt;ol&gt;</c> element.
/// </summary>
class OlElement : IElement
{
  public OlElement(IElement parent)
  {
    Elements = new List<LiElement>();
    Parent = parent;
  }

  /// <summary>The node this list is attached to.</summary>
  public IElement Parent { get; set; }

  /// <summary>The items of this list, in the order they were added.</summary>
  public IList<LiElement> Elements { get; set; }

  /// <summary>
  /// Appends <paramref name="element"/> to <see cref="Elements"/>; the cast throws
  /// when it is not a <see cref="LiElement"/>.
  /// </summary>
  public void AddElement(IElement element)
  {
    var item = (LiElement)element;
    Elements.Add(item);
  }

  /// <summary>
  /// Renders the opening tag, each item followed by a line break, and the closing tag,
  /// every part terminated by a line break.
  /// </summary>
  public override string ToString()
  {
    var html = new StringBuilder();
    html.Append("<ol>").AppendLine();
    for (var i = 0; i < Elements.Count; i++)
    {
      html.Append(Elements[i].ToString()).AppendLine();
    }
    html.Append("</ol>").AppendLine();
    return html.ToString();
  }
}

/// <summary>
/// A list item node: its own text followed by any nested <see cref="OlElement"/>
/// lists, rendered as an <c>&lt;li&gt;</c> element.
/// </summary>
class LiElement : IElement
{
  public LiElement(IElement parent, string text)
  {
    Text = text;
    Parent = parent;
    Elements = new List<OlElement>();
  }

  /// <summary>The node this item is attached to.</summary>
  public IElement Parent { get; set; }

  /// <summary>The item's own text, rendered right after the opening tag.</summary>
  public string Text { get; set; }

  /// <summary>The lists nested inside this item, in the order they were added.</summary>
  public IList<OlElement> Elements { get; set; }

  /// <summary>
  /// Appends <paramref name="element"/> to <see cref="Elements"/>; the cast throws
  /// when it is not an <see cref="OlElement"/>.
  /// </summary>
  public void AddElement(IElement element)
  {
    var subList = (OlElement)element;
    Elements.Add(subList);
  }

  /// <summary>
  /// Renders the opening tag, the text, each nested list followed by a line break,
  /// and the closing tag followed by a line break.
  /// </summary>
  public override string ToString()
  {
    var html = new StringBuilder("<li>");
    html.Append(Text);
    for (var i = 0; i < Elements.Count; i++)
    {
      html.Append(Elements[i].ToString()).AppendLine();
    }
    html.Append("</li>").AppendLine();
    return html.ToString();
  }
}

Getting the result:

const string text = @"[olist]
[#]This is line 1
[#]This is line 2
    [olist]
        [#]This is line 2.1
        [#]This is line 2.2
        [#]This is line 2.3
    [/olist]
[#]This is line 3
[/olist]";
// A tag line is optional indentation followed by [tag]; the rest of the line is
// captured as that tag's text.
var regex = new Regex(@"^\s*\[(?<tag>[^\]]+)\](?<text>.*)$");
var root = new OlElement(null);
// The node new content attaches to: a list while waiting for its next item,
// or the item that was opened last.
var currentElement = (IElement)root;
using (var reader = new StringReader(text))
{
  string line;
  while ((line = reader.ReadLine()) != null)
  {
    var match = regex.Match(line);
    // A null tag name (no match) falls through to the default case below.
    var tagName = match.Success ? match.Groups["tag"].Value : null;
    switch (tagName)
    {
      case "#":
        if (currentElement is OlElement)
        {
          // First item of the current list.
          var child = new LiElement(currentElement, match.Groups["text"].Value);
          currentElement.AddElement(child);
          currentElement = child;
        }
        else if (currentElement is LiElement)
        {
          // Sibling item: attach it to the list that owns the current item.
          var child = new LiElement(currentElement.Parent, match.Groups["text"].Value);
          currentElement.Parent.AddElement(child);
          currentElement = child;
        }
        break;
      case "olist":
        // The outermost [olist] is already represented by root.
        if (currentElement == root)
        {
          break;
        }
        if (currentElement is LiElement)
        {
          var child = new OlElement(currentElement);
          currentElement.AddElement(child);
          currentElement = child;
        }
        break;
      case "/olist":
        if (currentElement is LiElement)
        {
          // Leave the open item and the list that owns it.
          currentElement = currentElement.Parent.Parent;
        }
        else if (currentElement is OlElement)
        {
          currentElement = currentElement.Parent;
        }
        break;
      default:
        {
          // Plain text (or an unrecognised tag) continues the item that is currently
          // open, so multi-line items keep all of their lines instead of losing them.
          // NOTE: text that follows a nested list's [/olist] is appended to the parent
          // item's Text, which LiElement renders before its nested lists.
          var openItem = currentElement as LiElement;
          if (openItem != null)
          {
            openItem.Text += System.Environment.NewLine + line;
          }
          break;
        }
    }
  }
}
var result = root.ToString();
2
On

Consider the following approach (note that it is "quick and dirty" in how it detects tags).

It is pretty straightforward: it reads your text line by line and converts each line, using a one-line look-ahead and a counter that tracks the nesting depth of sub-lists.

string src = @"[olist]
    [#]This is line 1
    [#]This is line 2
        [olist]
            [#]This is line 2.1
                [olist]
                    [#]This is line 2.1.1
                    [#]This is line 2.1.2
                [/olist]
            [#]This is line 2.2
            [#]This is line 2.3
        [/olist]
    [#]This is line 3
[/olist]";


var sb = new StringBuilder();
// Split on both newline conventions: the line endings inside a verbatim literal come
// from the source file and need not match Environment.NewLine on the running platform.
var lines = src.Split(new[] { "\r\n", "\n" }, StringSplitOptions.RemoveEmptyEntries);
// Number of list items that are still open because a sub-list was started inside them.
int innerListsCount = 0;

// NOTE: lines that are neither a list tag nor a [#] item (e.g. the continuation of a
// multi-line item) are not handled by this quick approach and are skipped.
for (int i = 0; i < lines.Length; i++)
{
    string line = lines[i];
    if (line.EndsWith("[olist]", StringComparison.Ordinal))
    {
        sb.AppendLine(line.Replace("[olist]", "<ol>"));
    }
    else if (line.EndsWith("[/olist]", StringComparison.Ordinal))
    {
        sb.AppendLine(line.Replace("[/olist]", "</ol>"));
        if (innerListsCount > 0)
        {
            // Close the item that contained the sub-list which just ended.
            for (int j = 0; j <= innerListsCount; j++)
                sb.Append("    ");

            sb.AppendLine("</li>");

            // Only count down while an item is actually open; decrementing at the
            // outermost level would leave the counter negative for any later list.
            innerListsCount--;
        }
    }
    else if (line.Trim().StartsWith("[#]", StringComparison.Ordinal))
    {
        sb.Append(line.Replace("[#]", "<li>"));

        // Look ahead one line, but only when a next line exists, so an item on the
        // last line cannot index past the end of the array.
        if (i + 1 < lines.Length && lines[i + 1].EndsWith("[olist]", StringComparison.Ordinal))
        {
            // A sub-list follows: keep this item open until that list is closed.
            innerListsCount++;
            sb.AppendLine();
        }
        else
        {
            sb.AppendLine("</li>");
        }
    }
}

Console.WriteLine(sb.ToString());

The output matches what you want, except that the `</li>` closing the "line 2" item is indented one level too deep:

<ol>
    <li>This is line 1</li>
    <li>This is line 2
        <ol>
            <li>This is line 2.1
                <ol>
                    <li>This is line 2.1.1</li>
                    <li>This is line 2.1.2</li>
                </ol>
            </li>
            <li>This is line 2.2</li>
            <li>This is line 2.3</li>
        </ol>
        </li>
    <li>This is line 3</li>
</ol>