Java: How to split an XML stream into small XML documents that keep their parent nodes (VTD-XML)

I need to read a large XML file with VTD-XML and XPath and split the result into multiple documents, one per matched node. I found some solutions here, but they split out the nodes without their parent information.

What I'm looking for:

XPath string: /CATALOG/MAIN/CD. The document should be split based on this XPath.

1) Initial document:

<CATALOG> <MAIN id="1"> <CD> <TITLE>Empire Burlesque</TITLE> <ARTIST>Bob Dylan</ARTIST> </CD> <CD> <TITLE>Empire Dummy</TITLE> <ARTIST>John Doe</ARTIST> </CD> <USEFUL>Useful node</USEFUL> </MAIN> <MAIN id="2"> <CD> <TITLE>Still got the blues</TITLE> <ARTIST>Gary More</ARTIST> </CD> </MAIN> <IGNORED>Ignored node</IGNORED> </CATALOG>

2) Results: Document 1:

<CATALOG> <MAIN id="1"> <CD> <TITLE>Empire Burlesque</TITLE> <ARTIST>Bob Dylan</ARTIST> </CD> <USEFUL>Useful node</USEFUL> </MAIN> </CATALOG>

Document 2:

<CATALOG> <MAIN id="1"> <CD> <TITLE>Empire Dummy</TITLE> <ARTIST>John Doe</ARTIST> </CD> <USEFUL>Useful node</USEFUL> </MAIN> </CATALOG>

Document 3:

<CATALOG> <MAIN id="2"> <CD> <TITLE>Still got the blues</TITLE> <ARTIST>Gary More</ARTIST> </CD> </MAIN> </CATALOG>

Thanks for your time and suggestions.

Best regards!


Here's my approach...

Use an XML parsing library, e.g. javax.xml.parsers.DocumentBuilderFactory, and create a DOM for your input XML file. For each matching node encountered, create a new output file DocumentN.xml (e.g. Document1.xml) and add the child node under its parent.

For sample Java code that parses XML with the javax.xml.parsers.* package, look for the loadQuestions example.


I resolved my problem. Here is my approach based on standard SAX parsing.

1) Created custom SaxHandler:

` public class CustomSAXHandler extends DefaultHandler {

private Stack<XmlNodeInfo> nodeStack = new Stack<XmlNodeInfo>();
private List<String> xPaths;
private XmlNodeInfo rootNode;
private final NamespaceContext namespaceContext;
private List<XmlNodeInfo> resultNodes;

/**
 * Creates a handler that collects the elements addressed by {@code xpath}.
 *
 * @param xpath slash-separated element path, e.g. {@code /CATALOG/MAIN/CD}
 * @param rootNode initial root node; presumably the synthetic node named
 *        {@link XmlNodeInfo#ROOT_NODE_NAME} that parents the document element
 * @param namespaceContext used to resolve prefixes when matching element names
 * @throws IllegalArgumentException if {@code xpath} contains no path steps
 */
public CustomSAXHandler(String xpath, XmlNodeInfo rootNode, NamespaceContext namespaceContext) {
    this.rootNode = rootNode;
    this.namespaceContext = namespaceContext;
    this.resultNodes = new ArrayList<XmlNodeInfo>();
    this.xPaths = splitXpaths(xpath);
    // Fail fast: every callback dereferences xPaths, so a blank path would
    // otherwise surface later as an unexplained NullPointerException.
    if (this.xPaths == null || this.xPaths.isEmpty()) {
        throw new IllegalArgumentException("XPath must contain at least one step: " + xpath);
    }
}

/**
 * SAX callback invoked for every opening tag.
 *
 * NOTE(review): this copy of the method is truncated -- the closing braces and the
 * statements inside the branches below are missing, so only the visible steps are
 * described. Recover the original source before relying on it.
 */
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
    // Rebuild the literal opening tag text, attributes included.
    String element = "<" + qName + getAttributes(atts) + ">";
    if (!nodeStack.empty()) {
        // The node currently on top of the stack replaces the rootNode field.
        rootNode = nodeStack.peek();

    // The element name is one of the configured XPath steps: a fresh node is created for it.
    if (matchDefinedXpath(qName)) {
        XmlNodeInfo newNode = new XmlNodeInfo(qName);

    // Otherwise, handled only when some node is already open.
    // NOTE(review): presumably the tag text is appended to that node's body -- confirm.
    } else {
        if (!nodeStack.empty()) {

public void characters(char[] ch, int start, int length) throws SAXException {
    XmlNodeInfo currentNode = nodeStack.empty() ? null : nodeStack.peek();
    if (currentNode != null) {
        currentNode.getBody().append(new String(ch, start, length));

/**
 * SAX callback invoked for every closing tag.
 *
 * NOTE(review): this copy of the method is truncated -- the branch bodies and the
 * closing braces are missing, so the comments below describe only the visible
 * conditions. Recover the original source before relying on it.
 */
public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
    // Last step of the configured XPath, i.e. the element the document is split on.
    String finalElement = xPaths.get(xPaths.size() - 1);
    // Literal closing tag text for this element.
    String element = "</" + qName + ">";
    XmlNodeInfo currentNode = nodeStack.empty() ? null : nodeStack.peek();
    if (currentNode != null) {
        // Closing the split element at exactly the depth of the XPath.
        // NOTE(review): presumably the node is completed and recorded in resultNodes here -- confirm.
        if (qName.equals(finalElement) && nodeStack.size() == xPaths.size()) {
        } else {
            // Distinguishes the closing tag of the tracked node itself from the
            // closing tag of some other element nested inside it.
            if (currentNode.getName().equals(qName)) {
            } else {

public List<String> getResults() {
    List<String> results = new ArrayList<String>();

    for (XmlNodeInfo node : resultNodes) {
        buildDocument(node, null, results);

    return results;

private void buildDocument(XmlNodeInfo node, String childContent, List<String> results) {
    String body = node.getBody().toString();
    if (childContent != null) {
        body = body + childContent;
    if (node.getParent() != null && !node.getParent().getName().equals(XmlNodeInfo.ROOT_NODE_NAME)) {
        String xmlContent = String.valueOf(node.getHeader()) + body + node.getFooter();
        buildDocument(node.getParent(), xmlContent, results);
    } else if (node.getParent() != null && node.getParent().getName().equals(XmlNodeInfo.ROOT_NODE_NAME)) {
        String finalContent = String.valueOf(node.getHeader()) + body + node.getFooter();

private String getAttributes(Attributes atts) {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < atts.getLength(); i++) {
        String qName = atts.getQName(i);
        String value = atts.getValue(qName);
        builder.append(" ").append(qName).append("=").append("\"").append(value).append("\"");
    return builder.toString();

private boolean matchDefinedXpath(String nodeName) {
    String[] splitWords = nodeName.split(":");
    if (splitWords.length == 2) {
        String namespacePrefix = splitWords[0];
        String namespaceURI = namespaceContext.getNamespaceURI(namespacePrefix);
        Iterator prefixes = namespaceContext.getPrefixes(namespaceURI);
        while (prefixes.hasNext()) {
            String prefix = (String);
            String elementName = prefix + ":" + splitWords[1];
            if (xPaths.contains(elementName)) {
                return true;
    } else {
        return xPaths.contains(nodeName);
    return false;

private List<String> splitXpaths(String xPath) {
    if (StringUtils.isNotBlank(xPath)) {
        String[] splitWords = xPath.split("/");
        if (splitWords.length > 0) {
            List<String> results = new ArrayList<String>();
            for (String splitWord : splitWords) {
            return results;
    return null;



2) Create a bean to store node data:


/**
 * Mutable holder for one XML element while a document is being rebuilt: its name,
 * the text of its opening tag (header), its content (body), its closing tag
 * (footer), plus links to its parent and children.
 */
public class XmlNodeInfo {

    /** Name given to the synthetic node that sits above the document element. */
    public static final String ROOT_NODE_NAME = "ROOT";

    private final String name;
    private final StringBuilder header;
    private final StringBuilder body;
    private final StringBuilder footer;
    private final List<XmlNodeInfo> children;
    private XmlNodeInfo parent;

    /**
     * @param name element name of this node
     */
    public XmlNodeInfo(String name) {
        this.name = name;
        header = new StringBuilder();
        body = new StringBuilder();
        footer = new StringBuilder();
        children = new ArrayList<XmlNodeInfo>();
    }

    /** @return the live buffer holding the opening tag text */
    public StringBuilder getHeader() {
        return header;
    }

    /** @return the live buffer holding the element content */
    public StringBuilder getBody() {
        return body;
    }

    /** @return the live buffer holding the closing tag text */
    public StringBuilder getFooter() {
        return footer;
    }

    /** @return the live list of child nodes */
    public List<XmlNodeInfo> getChildren() {
        return children;
    }

    /**
     * Appends a child node. The child's parent link is not touched; callers set
     * it with {@link #setParent(XmlNodeInfo)}.
     *
     * @param xmlNodeInfo child to add
     */
    public void addChild(XmlNodeInfo xmlNodeInfo) {
        // NOTE(review): the body was missing in the recovered source; adding to
        // the children list is the evident intent -- confirm.
        children.add(xmlNodeInfo);
    }

    /** @return the element name */
    public String getName() {
        return name;
    }

    /** @return the parent node, or {@code null} if none has been set */
    public XmlNodeInfo getParent() {
        return parent;
    }

    /**
     * @param parent node to record as this node's parent
     */
    public void setParent(XmlNodeInfo parent) {
        this.parent = parent;
    }
}



3) Run program:

` public class MainApp {

public static void main(String[] args) throws Exception {
    SAXParserFactory factory = SAXParserFactory.newInstance();
    SAXParser saxParser = factory.newSAXParser();

    NamespaceContext namespaceContext = new XmlNamespaceResolver();
    String xPath = "/CATALOG/MAIN/CD";

    InputStream in = MainApp.class.getClassLoader().getResourceAsStream("test.xml");
    XmlNodeInfo rootNode = new XmlNodeInfo(XmlNodeInfo.ROOT_NODE_NAME);
    CustomSAXHandler customSAXHandler = new CustomSAXHandler(xPath, rootNode, namespaceContext);
    saxParser.parse(in, customSAXHandler);
    List<String> results = customSAXHandler.getResults(); // result strings



Maybe it is not the best solution, but it resolves my problem. Thanks, all, for your time and suggestions.


Below is code that does what you described using VTD-XML. Let me know if you have any questions.

import com.ximpleware.*;

public class splitTest {

    public static void  main(String[] a) throws VTDException,{
        VTDGen vg = new VTDGen();
        if (vg.parseFile("C:\\Users\\Jimmy Zhang\\workspace\\ximple-dev\\DOMTest\\test111.xml", false)){
            VTDNav vn = vg.getNav();
            AutoPilot ap = new AutoPilot(vn);
            byte[] header = "<CATALOG>".getBytes();
            byte[] tail = "</CATALOG>".getBytes();
            int i = -1,j=0;
                long l = vn.getElementFragment();
                FileOutputStream fops = new FileOutputStream("c:\\xml\\output"+j+".xml");
                fops.write(vn.getXML().getBytes(), (int)l, ((int)(l>>32)));
