Parsing HTML in Swift

4.5k Views Asked by At

Can anyone help me out with this one:

I have an HTML page formatted this way:

<ul class="ms-spbTree" id="_ul">
    <li id="XXX\$username">Person0
        <ul id="XXX\$username_ul">
            <li id="XXX\$username">Person1
                <ul id="XXX\$username_ul"></ul>
            </li>
            <li id="XXX\$username">Person2
                <ul id="XXX\$username_ul"></ul>
            </li>
            <li id="XXX\$username">Person3
                <ul id="XXX\$username_ul"></ul>
            </li>
            <li id="XXX\$username">Person4
                <ul id="XXX\$username_ul">
                    <li id="XXX\$username">Person5
                        <ul id="XXX\$username_ul"></ul>
                    </li>
                    <li id="XXX\$username">Person6
                        <ul id="XXX\$username_ul"></ul>
                    </li>
                    <li id="XXX\$username">Person7
                        <ul id="XXX\$username_ul"></ul>
                    </li>
                </ul>
            </li>
            <li id="XXX\$username">Person8
                <ul id="XXX\$username_ul"></ul>
            </li>
        </ul>
    </li>
</ul>

My function is:

// Parses the HTML held in `webString` with TFHpple and, for every <li> that is a
// child of a <ul>, prints the content of its first child node followed by the
// value scanned out of its id attribute.
//
// webString: the HTML document as a string; it is encoded as UTF-8 before parsing.
func loadTutorial(webString: NSString)
{
    // Force-unwrapped: a failed UTF-8 conversion traps here.
    var data : NSData = webString.dataUsingEncoding(NSUTF8StringEncoding)!

    var tutorialsParser = TFHpple(HTMLData: data)
    // The leading "//" matches at any depth, so this selects ul/li pairs at every
    // nesting level, not just the outermost list.
    var tutorialsXPathString = "//ul/li"
    var tutorialNodes = tutorialsParser.searchWithXPathQuery(tutorialsXPathString) as NSArray
    if(tutorialNodes.count == 0)
    {
        // The query matched nothing.
        println("empty here")
    }
    else
    {
        for element in tutorialNodes
        {
            var elementTwo: TFHppleElement = element as TFHppleElement
         //   var tutorial = Tutorial()
            // Content of the element's first child node (the person's name in the
            // sample output).
            println("\(elementTwo.firstChild.content)")

       //     println(elementTwo.raw)

            // Recover the id by scanning the element's raw markup as text.
            let userscanner = NSScanner(string:elementTwo.raw)
            var userscanned: NSString?

            // Advance to the first occurrence of `li id="` ...
            if userscanner.scanUpToString("li id=\"", intoString:nil){
                // ... step over that marker ...
                userscanner.scanString("li id=\"", intoString:nil)
                // ... and capture everything up to the closing `">` as the id value.
                if userscanner.scanUpToString("\">", intoString:&userscanned) {
                    let newResult: String = userscanned as String

                    println("NewResultValue: \(newResult)")
                }
            }
        }
    }
}

But what I get is the following list:

Person0
Person1
Person2
Person3
Person4
Person5
Person6
Person7
Person8

I only want to retrieve

Person0
Person1
Person2
Person3
Person4
Person8

Or a list, so that I know that Person5, 6 and 7 have Person4 as their manager.

I tried with

var tutorialsXPathString = "//ul[not(contains(@style, 'style=\"display: none;\"'))/li"

Because all sub-persons (5, 6, 7) have this in their tag, but it did not work :(

Any help would be appreciated.

1

There is 1 best solution below

0
On BEST ANSWER

A couple of thoughts:

  1. The use of // says "find this anywhere in the HTML". If you want to control what level you want to consider, just use / and follow this from the root of the document. For example, to get the second level, but not the first or third levels, you'd do something like:

    let tutorialsParser = TFHpple(HTMLData: data)
    // Absolute path from the document root: only <li> items exactly two list
    // levels below <body> match; the first and third levels are excluded.
    let tutorialsXPathString = "/html/body/ul/li/ul/li"
    if let tutorialNodes = tutorialsParser.searchWithXPathQuery(tutorialsXPathString) as? [TFHppleElement] {
        for element in tutorialNodes {
            // The first child's content carries the surrounding indentation, so trim it.
            let content = element.firstChild.content.stringByTrimmingCharactersInSet(NSCharacterSet.whitespaceAndNewlineCharacterSet())
            // Conditional cast: an <li> without an id attribute yields "" instead
            // of trapping on a forced downcast of nil.
            let identifier = (element.attributes["id"] as? String) ?? ""
            println("id = \(identifier); content = \(content)")
        }
    }
    
  2. Note, I'm not sure why you were using the scanner, but if you want the attributes of an element, you can use the attributes method.

  3. I also defined the tutorialNodes to be an array of TFHppleElement objects, which simplifies the for loop a bit.

  4. If you wanted the top level /ul/li followed by the second level, but not the third level, you could do something like:

    let tutorialsParser = TFHpple(HTMLData: data)
    // Absolute path: only the top-level <li> items directly under the outermost <ul>.
    let tutorialsXPathString = "/html/body/ul/li"
    if let tutorialNodes = tutorialsParser.searchWithXPathQuery(tutorialsXPathString) as? [TFHppleElement] {
        for element in tutorialNodes {
            // The first child's content carries the surrounding indentation, so trim it.
            let content = element.firstChild.content.stringByTrimmingCharactersInSet(NSCharacterSet.whitespaceAndNewlineCharacterSet())
            // Conditional cast: a missing id attribute yields "" instead of trapping.
            let identifier = (element.attributes["id"] as? String) ?? ""
            println("id = \(identifier); content = \(content)")

            // Descend exactly one level: the first nested <ul>, then its direct <li>
            // children. Deeper levels are never visited.
            if let ul = element.childrenWithTagName("ul") as? [TFHppleElement] {
                if let li = ul.first?.childrenWithTagName("li") as? [TFHppleElement] {
                    for child in li {
                        let childContent = child.firstChild.content.stringByTrimmingCharactersInSet(NSCharacterSet.whitespaceAndNewlineCharacterSet())
                        let childIdentifier = (child.attributes["id"] as? String) ?? ""
                        println("  child id = \(childIdentifier); content = \(childContent)")
                    }
                }
            }
        }
    }
    

    Or you could do something like:

    let tutorialsParser = TFHpple(HTMLData: data)
    // Absolute path: only the top-level <li> items directly under the outermost <ul>.
    let tutorialsXPathString = "/html/body/ul/li"
    if let tutorialNodes = tutorialsParser.searchWithXPathQuery(tutorialsXPathString) as? [TFHppleElement] {
        for element in tutorialNodes {
            // The first child's content carries the surrounding indentation, so trim it.
            let content = element.firstChild.content.stringByTrimmingCharactersInSet(NSCharacterSet.whitespaceAndNewlineCharacterSet())
            // Conditional cast: a missing id attribute yields "" instead of trapping.
            let identifier = (element.attributes["id"] as? String) ?? ""
            println("id = \(identifier); content = \(content)")

            // NOTE(review): the element-level search appears to re-parse this element's
            // own markup as a fresh HTML document, which is why the path starts at
            // /html/body and the current <li> is its first step. Confirm against the
            // TFHpple version in use.
            if let children = element.searchWithXPathQuery("/html/body/li/ul/li") as? [TFHppleElement] {
                for child in children {
                    let childContent = child.firstChild.content.stringByTrimmingCharactersInSet(NSCharacterSet.whitespaceAndNewlineCharacterSet())
                    let childIdentifier = (child.attributes["id"] as? String) ?? ""
                    println("  child id = \(childIdentifier); content = \(childContent)")
                }
            }
        }
    }