THIS IS NOT Regular expression to remove HTML tags! Read the question carefuly!
I created a script to convert Cyrillic to Latin and Latin to Cyrillic.
Converting Latin to Cyrillic creates a lot of problems with HTML because the script also converts HTML elements. I created an algorithm that converts Cyrillic HTML to Latin and keeps everything within tags in Cyrillic.
The script works almost well, but either I have a memory problem or the while loop starts to spin indefinitely. Basically, the problem is up to the speed of the script.
public function html_tags() {
$tags = explode(',', '!DOCTYPE,a,abbr,acronym,address,applet,area,article,aside,audio,b,base,basefont,bdi,bdo,big,blockquote,body,br,button,canvas,caption,center,cite,code,col,colgroup,data,details,dd,del,details,dfn,dialog,dir,div,dl,dt,em,embed,fieldset,figcaption,figure,font,footer,form,frame,frameset,h1,h2,h3,h4,h5,h6,head,header,hr,html,i,iframe,img,input,ins,kbd,label,legend,li,link,main,map,mark,meta,master,nav,noframes,noscript,object,ol,optgroup,option,output,p,param,picture,pre,progress,q,rp,rt,ruby,s,samp,script,section,select,small,source,span,strike,strong,style,sub,summary,sup,svg,table,tbody,td,template,textarea,tfoot,th,thead,time,title,tr,track,tt,u,ul,var,video,wbr');
$tags = array_map('trim', $tags);
$tags = array_filter($tags);
return apply_filters('serbian_transliteration_html_tags', $tags);
}
public function fix_cyr_html($content){
$content = htmlspecialchars_decode($content);
$tags = $this->html_tags();
$tags_cyr = $tags_lat = array();
foreach($tags as $tag){
$tags_cyr[]='<' . str_replace($this->lat(), $this->cyr(), $tag);
$tags_cyr[]='</' . str_replace($this->lat(), $this->cyr(), $tag) . '>';
$tags_lat[]= '<' . $tag;
$tags_lat[]= '</' . $tag . '>';
}
$tags_cyr = array_merge($tags_cyr, array('&нбсп;','&лт;','&гт;','&ндасх;','&мдасх;','хреф','срц','&лдqуо;','&бдqуо;','&лсqуо;','&рсqуо;','&сцарон;','&Сцарон;','&тилде;'));
$tags_lat = array_merge($tags_lat, array(' ','<','>','–','—','href','src','“','„','‘','’','ш','Ш','˜'));
$content = str_replace($tags_cyr, $tags_lat, $content);
$lastPos = 0;
$positions = [];
while (($lastPos = mb_strpos($content, '<', $lastPos, 'UTF-8')) !== false) {
$positions[] = $lastPos;
$lastPos = $lastPos + mb_strlen('<', 'UTF-8');
}
foreach ($positions as $position) {
if(mb_strpos($content, '>', 0, 'UTF-8') !== false) {
$end = mb_strpos($content, ">", $position, 'UTF-8') - $position;
$tag = mb_substr($content, $position, $end, 'UTF-8');
$tag_lat = $this->cyr_to_lat($tag);
$content = str_replace($tag, $tag_lat, $content);
}
}
// Fix open tags
$content = preg_replace_callback ('/(<[\x{0400}-\x{04FF}0-9a-zA-Z\/\=\"\'_\-\s\.\;\,\!\?\*\:\#\$\%\&\(\)\[\]\+\@\€]+>)/iu', function($m){
return $this->cyr_to_lat($m[1]);
}, $content);
// FIx closed tags
$content = preg_replace_callback ('/(<\/[\x{0400}-\x{04FF}0-9a-zA-Z]+>)/iu', function($m){
return $this->cyr_to_lat($m[1]);
}, $content);
// Fix HTML entities
$content = preg_replace_callback ('/\&([\x{0400}-\x{04FF}0-9]+)\;/iu', function($m){
return '&' . $this->cyr_to_lat($m[1]) . ';';
}, $content);
// Fix JavaScript
$content = preg_replace_callback('/(?=<script(.*?)>)(.*?)(?<=<\/script>)/s', function($matches) {
return $this->cyr_to_lat($m[2]);
}, $content);
// Fix CSS
$content = preg_replace_callback('/(?=<style(.*?)>)(.*?)(?<=<\/style>)/s', function($matches) {
return $this->cyr_to_lat($m[2]);
}, $content);
// Fix email
$content = preg_replace_callback ('/(([\x{0400}-\x{04FF}0-9\_\-\.]+)@([\x{0400}-\x{04FF}0-9\_\-\.]+)\.([\x{0400}-\x{04FF}0-9]{3,10}))/iu', function($m){
return $this->cyr_to_lat($m[1]);
}, $content);
// Fix URL
$content = preg_replace_callback ('/(([\x{0400}-\x{04FF}]{4,5}):\/{2}([\x{0400}-\x{04FF}0-9\_\-\.]+)\.([\x{0400}-\x{04FF}0-9]{3,10})(.*?)($|\n|\s|\r|\"\'\.\;\,\:\)\]\>))/iu', function($m){
return $this->cyr_to_lat($m[1]);
}, $content);
// Fix attributes with doublequote
$content = preg_replace_callback ('/(title|alt|data-(title|alt))\s?=\s?"(.*?)"/iu', function($m){
return sprintf('%1$s="%2$s"', $m[1], esc_attr($this->lat_to_cyr($m[3])));
}, $content);
// Fix attributes with single quote
$content = preg_replace_callback ('/(title|alt|data-(title|alt))\s?=\s?\'(.*?)\'/iu', function($m){
return sprintf('%1$s=\'%2$s\'', $m[1], esc_attr($this->lat_to_cyr($m[3])));
}, $content);
return $content;
}
The main question is whether and how can I convert bad HTML tags (Cyrillic HTML tags and attributes) to Latin and save all other text in Cyrillic?
<див цласс="цонтент">Мама воли бебу</див> ---> <div class="content">Мама воли бебу</div>
Can it be achieved with regex or is there a faster / better solution?