Replacing redundant HTML string using Jsoup, Pattern class and StringBuilder

43 Views Asked by At

I have a big html string in form of byte[] which size is about 59kb and I need to decrease it as much as I can. I manage to remove some redundant string and hit size of 36kb after parsing. However, I think I'm creating too many Strings on the heap and I want to avoid that with StringBuilder if possible.

Here is the code version that minifies the content (this should be optimized with Pattern class and StringBuilder):

public static void main(String[] args) {
    Content emailMessage = contentRepository.findById(emailId).get();
    log.info("before compression size is: {}", emailMessage.getEmailBody().length); //59kb
    long start = System.currentTimeMillis();
    String content = new String(emailMessage.getEmailBody(), StandardCharsets.UTF_8);
    Document doc = Jsoup.parse(content);
    doc.outputSettings().prettyPrint(false); //make it single line
    removeCommentsWithinTextNodesAndStyleTags(doc); //parsing recursively
    String trimmed = doc.outerHtml();
    log.info("Duration of trimming: {}", System.currentTimeMillis()-start); //193ms

    log.info("TRIMMED EMAIL HTML CONTENT START :::");
    log.info(trimmed);
    log.info("TRIMMED EMAIL HTML CONTENT END :::");
    log.info("Size of trimmed html: {}", trimmed.getBytes(StandardCharsets.UTF_8).length); //36kb
}

private static void removeCommentsWithinTextNodesAndStyleTags(Node node) {
    node.childNodes().stream().filter(n -> "#comment".equals(n.nodeName())).forEach(Node::remove);
    if (node instanceof TextNode) {
        TextNode textNode = (TextNode) node;
        String text = textNode.text();
        text = text.replaceAll("\\s{3,}", " "); //replace anything that has two or more whitespace with single whitespace

        // Remove comments in the /* ... */ format within text
        text = text.replaceAll("/\\*+[^*]*\\*+(?:[^/*][^*]*\\*+)*/", "");

         // Remove comments 
        text = text.replaceAll("<!--.*?-->", "");
        textNode.text(text);
    } else if (node instanceof Element) {
        Element element = (Element) node;

        // Check if the element is a <style> tag
        if ("style".equalsIgnoreCase(element.tagName())) {
            String styleText = element.data();
            styleText = styleText.replaceAll("\\s{3,}", " ");
            styleText = styleText.replaceAll("/\\*+[^*]*\\*+(?:[^/*][^*]*\\*+)*/", "");
            styleText = styleText.replaceAll("\\n", "").replaceAll("\\r", "");
            styleText = styleText.replaceAll("<!--.*?-->", "");
            // Set the content of the <style> tag using .html()
            element.html(styleText);
        }
    }

    for (Node child : node.childNodes()) {
        removeCommentsWithinTextNodesAndStyleTags(child);
    }
}

As I said this works fairly quickly, but I'm not sure what happens with string objects and if there is a way to optimize that if possible. Here's my attempt with Pattern and StringBuilder which causes JavaHeap OutOfMemory (yeah, I got an exception which I want to avoid by trying to optimize code):

public static final Pattern p1 = Pattern.compile("\\s{3,}");
public static final Pattern p2 = Pattern.compile("/\\*+[^*]*\\*+(?:[^/*][^*]*\\*+)*/");
public static final Pattern p3 = Pattern.compile("<!--.*?-->");
private static void removeCommentsWithinTextNodesAndStyleTags22(Node node) {
    node.childNodes().stream().filter(n -> "#comment".equals(n.nodeName())).forEach(Node::remove);
    if (node instanceof TextNode) {
        TextNode textNode = (TextNode) node;

        StringBuilder stringBuilder = new StringBuilder(textNode.text());

        Matcher m1 = p1.matcher(stringBuilder);
        stringBuilder.replace(0, stringBuilder.length(), m1.replaceAll(stringBuilder.toString()));

        Matcher m2 = p2.matcher(stringBuilder);
        stringBuilder.replace(0, stringBuilder.length(), m2.replaceAll(stringBuilder.toString()));

        Matcher m3 = p3.matcher(stringBuilder);
        stringBuilder.replace(0, stringBuilder.length(), m3.replaceAll(stringBuilder.toString()));

        textNode.text(stringBuilder.toString());
    } else if (node instanceof Element) {
        Element element = (Element) node;

        // Check if the element is a <style> tag
        if ("style".equalsIgnoreCase(element.tagName())) {


            StringBuilder stringBuilder = new StringBuilder(element.data());

            Matcher m1 = p1.matcher(stringBuilder);
            stringBuilder.replace(0, stringBuilder.length(), m1.replaceAll(stringBuilder.toString()));

            Matcher m2 = p2.matcher(stringBuilder);
            stringBuilder.replace(0, stringBuilder.length(), m2.replaceAll(stringBuilder.toString()));

            Matcher m3 = p3.matcher(stringBuilder);
            stringBuilder.replace(0, stringBuilder.length(), m3.replaceAll(stringBuilder.toString()));

            element.html(stringBuilder.toString());
        }
    }

    for (Node child : node.childNodes()) {
        removeCommentsWithinTextNodesAndStyleTags22(child);
    }
}
0

There are 0 best solutions below