The reading loop of QXmlStreamReader in PyQt5 does not return the expected data

411 Views Asked by At

I'd like to make a QAbstractItemModel that gets its data from a series of XML files, all situated in the same directory. Since PyQt5 no longer supports QDomDocument (or at least I couldn't find a way to make it work), I've had to resort to a QXmlStreamReader. I'm putting the data itself in a giant Python dictionary (well... not exactly giant by computer science standards) that contains other dictionaries under various keys to create a tree-like structure.

this is my code so far:

class DataModel(QtCore.QAbstractItemModel):
    """Item model that fills ``self.data`` from the ``*.xml`` files of a directory.

    Every file is read token by token with a ``QXmlStreamReader``; the values
    found are written into the single dictionary ``self.data``.
    """

    def __init__(self, settingsDirectory, parent = None):
        """Parse every XML file listed by *settingsDirectory* into ``self.data``.

        settingsDirectory -- object providing setNameFilters(), entryList() and
            absolutePath() (presumably a QDir -- confirm against the caller).
        parent -- forwarded unchanged to the base-class constructor.
        """
        super(DataModel, self).__init__(parent)
        # Restrict the directory listing to XML files before reading it.
        settingsDirectory.setNameFilters(["*.xml"])
        files = settingsDirectory.entryList()
        print(files)

        # One dictionary shared by all files; later files overwrite equal keys.
        # NOTE(review): "data" is also the name of QAbstractItemModel.data();
        # presumably this instance attribute hides that method -- confirm
        # before attaching a view to the model.
        self.data = {}

        for i in range(len(files)):
            filePath = str(files[i])
            file = QtCore.QFile(settingsDirectory.absolutePath() + "/" + str(filePath))
            fileOpens = file.open(file.ReadOnly | file.Text)
            if fileOpens:
                parser = QtCore.QXmlStreamReader(file)
                print("--------Beginning parsing----------")
                print("Reading file: "+str(filePath))
                while not parser.atEnd():
                    parser.readNext()

                    token = parser.tokenType()

                    print("Reading tag: " + str(parser.name()))
                    print("Tag type is: " + str(token))
                    if token == parser.StartDocument:
                        self.data["XML Version"] = str(parser.documentVersion())
                        self.data["XML Encoding"] = str(parser.documentEncoding())
                    if token == parser.StartElement:
                        # Only the most recent start tag is remembered: a nested
                        # start tag overwrites its parent's name, so no
                        # parent/child relation is recorded anywhere.
                        tokenName = parser.name()
                    if parser.tokenType() == parser.Characters:
                        # Every Characters token replaces tokenText; the example
                        # output ('\n\t\t' values) suggests that whitespace-only
                        # runs between tags arrive here as well.
                        tokenText = parser.text()
                        print("This tag has a text value: " + str(tokenText))
                        print("current data: " + str(self.data))
                    if token == parser.EndElement:
                        # The value is always stored at the top level of
                        # self.data, keyed by the last start tag seen -- this is
                        # why the result is flat instead of tree-like.
                        # tokenText is only assigned in the Characters branch,
                        # so this read relies on such a token having been seen.
                        if tokenText != None:
                            self.data[tokenName] = tokenText
                        else:
                            self.data[tokenName] = {}
                        # After the reset, the end tag of an enclosing element
                        # finds tokenName == None and is stored under the key
                        # None unless another start tag came in between.
                        tokenName = None
                        tokenText = None
            else:
                print(self.tr("xml file did not open properly"))
        print(self.data)

While this code doesn't crash or anything, it does have a few issues; I have no idea why they're happening or how to fix them:

1.the tokenName never changes from None for some reason - solved

2. The structure of the self.data dictionary does not turn into a tree-like one, and I have no idea why :|

example data:

<?xml version="1.0" encoding="UTF-8"?>
<tag>
    <description>This is a text</description>
    <types>
        <typesAllowed></typesAllowed>
        <typesEnabled></typesEnabled>
    </types>
</tag>

yields the final result:

{'XML Encoding': 'UTF-8', 'XML Version': '1.0', 'typesAllowed': '\n\t\t', None: '\n', 'typesEnabled': '\n\t\t', 'description': 'This is a text'}

instead of the wanted:

{'XML Encoding': 'UTF-8', 'XML Version': '1.0', 'tag': {'description': 'This is a text', 'typesAllowed': '\n\t\t', 'typesEnabled': '\n\t\t'}}

I know these issues are most likely a result of my poor understanding of how a StreamReader works, so any and all tips would be welcome :)

edit 1:

The tokenName problem was a silly positioning error on my part; the code above reflects the fix.

edit 2:

added an example and example output

1

There are 1 best solutions below

0
On BEST ANSWER

This question is now solved; I took a different approach to the problem.

I basically keep a list to which I append a tuple (name, {}) whenever a StartElement token has the attribute parseAs == "element"; when the attribute is parseAs == "text", the evaluated string (see the parseText function) is put into the last tuple's dictionary instead. When the reader meets an EndElement token, it finds the tuple with name == tokenName, where tokenName is the name of the current token, and puts that tuple's dictionary into the previous tuple's dictionary as an entry with key name.

There are a few more details as to how it works (for example, how it knows when to submit currData to self.data), but I'd probably just overcomplicate my explanation if I included them.

class DataModel(QtCore.QAbstractItemModel):
    """Item model whose contents are loaded from the ``*.xml`` files of a directory.

    Every element is expected to carry a ``parseAs`` attribute:

    * ``parseAs="element"`` -- the element becomes a nested dictionary;
    * ``parseAs="text"`` -- the element's text is converted by ``parseText``
      and stored in the dictionary of the enclosing "element".

    After construction ``self.data`` maps the name of each outermost
    "element" to its nested dictionary, and ``self.parsingLog`` holds the XML
    version and encoding reported for every file.
    """

    def __init__(self, settingsDirectory, parent = None):
        """Parse every XML file listed by *settingsDirectory*.

        settingsDirectory -- object providing setNameFilters(), entryList() and
            absolutePath() (presumably a QDir -- confirm against the caller).
        parent -- forwarded unchanged to the base-class constructor.
        """
        super(DataModel, self).__init__(parent)
        settingsDirectory.setNameFilters(["*.xml"])
        files = settingsDirectory.entryList()
        print(files)

        # NOTE(review): the attribute name is kept for compatibility, but
        # "data" is also the name of QAbstractItemModel.data(); presumably the
        # instance attribute hides that method -- confirm before attaching a
        # view to the model.
        self.data = {}
        self.parsingLog = {}

        for fileName in files:
            filePath = str(fileName)
            file = QtCore.QFile(settingsDirectory.absolutePath() + "/" + filePath)
            if not file.open(file.ReadOnly | file.Text):
                print(self.tr("xml file did not open properly"))
                continue
            try:
                self._parseFile(QtCore.QXmlStreamReader(file), filePath)
            finally:
                # Release the file handle even if parsing bails out.
                file.close()

    def _parseFile(self, parser, filePath):
        """Read one XML document from *parser* and merge it into ``self.data``.

        parser -- QXmlStreamReader positioned at the start of the document.
        filePath -- file name, used for log output and as ``parsingLog`` key.
        """
        # Stack of (name, children) tuples, one per parseAs="element" element
        # that is currently open; the innermost element is last.
        currData = []

        print(self.tr("--------Beginning parsing--------"))
        print(self.tr("Reading file: "+str(filePath)))
        print(self.tr("---------------------------------"))

        while not parser.atEnd():
            parser.readNext()
            token = parser.tokenType()

            print(self.tr("--------------------"))
            print(self.tr("Token type: " + str(self.printTokenType(token))))

            if token == parser.StartElement:
                tokenName = parser.name()
                parseAs = parser.attributes().value("parseAs")

                print(self.tr("Reading StartElement: " + str(tokenName)))
                print(self.tr("parseAs: " + str(parseAs)))

                if parseAs == "text":
                    # readElementText() also consumes the matching end tag, so
                    # no EndElement token is seen for text elements.
                    textValue = self.parseText(parser.readElementText())
                    print(self.tr("Text Value: " + str(textValue)))

                    if not currData:
                        # A text value needs an enclosing "element" to live in.
                        print(self.tr("*******Terminating application*******"))
                        print(self.tr("Reason: currData is empty"))
                        print(self.tr("*******Terminating application*******"))
                        sys.exit()
                    currData[-1][1][tokenName] = textValue
                elif parseAs == "element":
                    currData.append((tokenName, {}))
                else:
                    print(self.tr("******WARNING******"))
                    print(self.tr("parseAs attribute is not given correctly"))
                    print(self.tr("******WARNING******"))

                print(self.tr("--------------------"))

            elif token == parser.EndElement:
                tokenName = parser.name()

                print(self.tr("Reading EndElement: " + str(tokenName)))
                print(self.tr("currData before: " + str(currData)))

                # Only "element" entries are on the stack, so end tags of
                # other elements match nothing and are skipped.
                # NOTE(review): an element without a valid parseAs that has
                # the same name as the innermost open "element" would still
                # close it here.
                if currData and currData[-1][0] == tokenName:
                    print(self.tr("Closing token: " + str(tokenName)))
                    closedName, children = currData.pop()
                    if currData:
                        # Attach the finished element to its parent.
                        currData[-1][1][closedName] = children
                        print(self.tr("currData after: " + str(currData)))
                        print(self.tr("--------------------"))
                    else:
                        # Nothing left open: this was the outermost element.
                        print(self.tr("This is the final token, writing to self.data"), end = "")
                        self.data[closedName] = children
                        print(self.tr(".") * 5, end = "")
                        print(self.tr("done."))
                        print(self.tr("--------------------"))

            elif token == parser.Characters:
                print(self.tr("Characters value: " + str(parser.text())))
                print(self.tr("--------------------"))

            elif token == parser.StartDocument:
                logEntry = {
                    "XML Version": str(parser.documentVersion()),
                    "XML Encoding": str(parser.documentEncoding()),
                }
                self.parsingLog["File: "+str(filePath)] = logEntry
                print(self.tr("File Version: " + str(logEntry["XML Version"])))
                print(self.tr("File Encoding: " + str(logEntry["XML Encoding"])))

            elif token == parser.EndDocument:
                print(self.tr("Cleaning up"), end = "")
                print(self.tr(".") * 5, end = "")
                print(self.tr("done."))
                print(self.tr("self.data: " + str(self.data)))
                print(self.tr("--------------------"))

        # atEnd() also becomes true once an error aborts reading, so the error
        # has to be checked after the loop rather than inside it.
        if parser.hasError():
            print(self.tr("XML file is not well-formatted"))

    def parseText(self, text):
        """Convert element text to the most specific of int, float or str.

        Text made only of the digits 0-9 becomes an int; digits with a decimal
        point become a float. Anything else -- including the empty string and
        digit/dot combinations that are not valid numbers, such as "." or
        "1.2.3" -- is returned unchanged.

        Raises TypeError if *text* is not a str.
        """
        if not isinstance(text, str):
            raise TypeError("parseText() expects a str, got " + type(text).__name__)
        if not text or any(char not in "0123456789." for char in text):
            return text
        if "." not in text:
            return int(text)
        try:
            return float(text)
        except ValueError:
            # Only digits and dots, but not a number (e.g. "." or "1.2.3").
            return text

    def printTokenType(self, token):
        """Return the name of a QXmlStreamReader token type, or None if unknown."""
        reader = QtCore.QXmlStreamReader
        tokenNames = (
            (reader.NoToken, "NoToken"),
            (reader.Invalid, "Invalid"),
            (reader.StartDocument, "StartDocument"),
            (reader.EndDocument, "EndDocument"),
            (reader.StartElement, "StartElement"),
            (reader.EndElement, "EndElement"),
            (reader.Characters, "Characters"),
            (reader.Comment, "Comment"),
            (reader.DTD, "DTD"),
            (reader.EntityReference, "EntityReference"),
            (reader.ProcessingInstruction, "ProcessingInstruction"),
        )
        for tokenValue, tokenLabel in tokenNames:
            if token == tokenValue:
                return tokenLabel
        return None