Regex for HTML outside markdown code block

534 Views Asked by At

I have a textarea containing some markdown. I do not want users to post html in it, unless it is inside the markdown code block like

``` someLanguageCode
<span>some html inside markdown code block</span>
```

I do not want to allow any html outside the markdown code block. So this would be illegal:

<span>some html tag outside code block</span>
<div>some more multiline html code outside
</div>
``` someLanguageCode
<span>some html inside markdown code block</span>
```

I was able to come up with a regex that matches single-line HTML tags: <([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>(.*?)<\/\1>

I am unable to

  1. get a regex that supports multi line html tags and
  2. check whether that HTML is outside a markdown code block.

I've made a jsfiddle to play around with this problem which shows what should match or should be rejected.

I'm doing this as an attempt to avoid obvious XSS injections.

1

There is 1 best solution below

1
On BEST ANSWER

As was already mentioned in a comment, you shouldn't try to parse the whole HTML with a regex. I think you just want to strip the tags in the end and mark the input as not valid. I created a jsfiddle with some code that parses the structure and lets you apply your own handling either inside the markdown code blocks or outside them:

// Sample inputs for check_html_in_markdown. The fragments are concatenated
// without newline separators, so every sample is a single-line string.

// HTML appears only between an opening and a closing ``` fence.
var valid = '``` someLanguageCode'+
'<span>some html inside markdown code block</span>'+
'```'; // Valid string
// The opening ``` fence is never closed.
var broken = '``` someLanguageCode'+
'<span>some html inside markdown code block</span>'; //Markdown not closed (broken string)
// A <span> precedes the fenced block, i.e. HTML outside of any code block.
var not_valid = '<span>Me is outside.</span>'+
'``` someLanguageCode'+
'<span>some html inside markdown code block</span>'+
'```'; // Not valid string

var s = not_valid; //Change this to test

// Use textContent rather than innerHTML: the checked string still contains the
// raw HTML from inside the code fences (and is returned completely unfiltered
// when the markdown is broken), so it must be shown as text, not parsed as markup.
document.getElementById('code').textContent = check_html_in_markdown(s);

/**
 * Replaces HTML tags that appear outside of markdown code fences (```).
 *
 * The input is split on the ``` delimiter. Segments at even positions lie
 * outside a fenced block and have every HTML-like tag replaced by a
 * placeholder; segments at odd positions lie inside a fenced block and are
 * left untouched.
 *
 * If the number of ``` delimiters is odd (a fence was opened but never
 * closed), an alert is shown and the input is returned unchanged.
 *
 * @param {string} s - Markdown text to check.
 * @returns {string} The text with tags outside code fences replaced, or the
 *   original text when the fences are unbalanced.
 */
function check_html_in_markdown(s){
  var FENCE = '```';
  var REPLACEMENT = ' **Your replacement** ';
  var segments = s.split(FENCE);

  // An even number of segments means an odd number of ``` delimiters,
  // i.e. a code block was opened but never closed.
  if(segments.length % 2 === 0){
    alert('Markdown is broken');
    return s;
  }

  var checked = segments.map(function(segment, index){
    var in_markdown = index % 2 === 1;
    if(in_markdown){
      // Here you can do nothing or check with an HTML parser whether the HTML is valid
      return segment;
    }
    // Code to find HTML tags and replace them. The `i` flag is required so
    // that upper- or mixed-case tags such as <SCRIPT> are caught as well;
    // [^>]* also matches newlines, so tags spanning several lines are covered.
    return segment.replace(/<[a-z\/][^>]*>/gi, REPLACEMENT);
  });

  return checked.join(FENCE);
}