Decrease the counter of an iterable obtained using 'enumerate' after calling seek

395 Views Asked by At

I am reading a file using Python, and within the file there are sections that are enclosed with the '#' character:

#HEADER1, SOME EXTRA INFO
data first section
1 2
1 233 
...
// THIS IS A COMMENT
#HEADER2, SECOND SECTION
452
134
// ANOTHER COMMENT
...
#HEADER3, THIRD SECTION

Now I wrote code to read the file as follows:

# Print every '#'-headed section of the file, numbering the lines inside it.
with open(filename) as fh:

    # iter(fh.readline, '') keeps calling readline() until it returns ''
    # (end of file); enumerate() numbers the lines it hands out from 1.
    enumerated = enumerate(iter(fh.readline, ''), start=1)

    for lino, line in enumerated:

        # handle special section
        if line.startswith('#'):

            print("="*40)
            print(line)

            # Consume the lines of this section from the same iterator.
            while True:

                # Remember where the upcoming line starts so that it can be
                # read again if it turns out to be the next header.
                start = fh.tell()
                # Raises StopIteration at end of file; nothing here catches it.
                lino, line = next(enumerated)

                if line.startswith('#'):
                    # Rewind the file so the outer loop sees this header again.
                    # Only the file position is restored: enumerate's counter
                    # has already advanced, so the header is counted twice and
                    # every later line number is off.
                    fh.seek(start)
                    break

                print("[{}] {}".format(lino,line))

The output is:

========================================
#HEADER1, SOME EXTRA INFO

[2] data first section

[3] 1 2

[4] 1 233 

[5] ...

[6] // THIS IS A COMMENT

========================================
#HEADER2, SECOND SECTION

[9] 452

[10] 134

[11] // ANOTHER COMMENT

[12] ...

========================================
#HEADER3, THIRD SECTION

Now you see that the line counter lino is no longer valid because I'm using seek. Also, it won't help if I decrease it before breaking the loop, because this counter is increased with each call to next. So is there an elegant way to solve this problem in Python 3.x? Also, is there a better way of handling the StopIteration without putting a pass statement in an except block?

UPDATE

So far I have adopted an implementation based on the suggestion made by @Dunes. I had to change it a bit so I can peek ahead to see if a new section is starting. I don't know if there's a better way to do this, so please jump in with comments:

class EnumeratedFile:
    """Iterate over a seekable text file as ``(lineno, line)`` pairs.

    Unlike ``enumerate(fh)``, the line counter is owned by this object, so
    it can be saved with :meth:`mark` and restored with :meth:`recall`
    together with the file position.

    Parameters:
        fh: an open file-like object supporting ``readline``, ``read``,
            ``tell`` and ``seek``. The comparisons against ``''`` and
            ``'#'`` assume it yields ``str`` (text mode).
        lineno_start: number reported for the first line read.
    """

    def __init__(self, fh, lineno_start=1):
        self.fh = fh
        self.lineno = lineno_start

    def __iter__(self):
        return self

    def __next__(self):
        """Return ``(lineno, line)`` for the next line.

        Raises:
            StopIteration: when ``readline()`` returns ``''`` (end of file).
                The counter is left unchanged in that case.
        """
        line = self.fh.readline()
        if line == '':
            raise StopIteration

        result = self.lineno, line
        self.lineno += 1
        return result

    def mark(self):
        """Record the current line number and file position."""
        self.marked_lineno = self.lineno
        self.marked_file_position = self.fh.tell()

    def recall(self):
        """Return to the state recorded by the last :meth:`mark` call."""
        self.lineno = self.marked_lineno
        self.fh.seek(self.marked_file_position)

    def section(self):
        """Return True if the next line belongs to the current section.

        Peeks at the next character without consuming it. The result is
        False when the next line starts with ``'#'`` (a new header) and
        also at end of file, so ``while e.section(): next(e)`` terminates
        cleanly instead of running into ``StopIteration``.
        """
        pos = self.fh.tell()
        char = self.fh.read(1)
        self.fh.seek(pos)
        # '' means end of file: there is no further line in this section.
        return char not in ('', '#')

And then the file is read and each section is processed as follows:

# create enumerated object
e = EnumeratedFile(fh)

# Dispatch on each header line and consume the lines of its section.
header = ""
for lineno, line in e:

    print("[{}] {}".format(lineno, line))

    header = line.rstrip()

    # HEADER1
    if header.startswith("#HEADER1"):

        # process header 1 lines
        while e.section():

            # get node line; the default keeps an exhausted file from
            # raising StopIteration in the middle of the section
            entry = next(e, None)
            if entry is None:
                break
            lineno, line = entry
            # do whatever needs to be done with the line

    elif header.startswith("#HEADER2"):

        # etc.
        pass
2

There are 2 best solutions below

0
On

You cannot alter the counter of the enumerate() iterable, no.

You don't need to at all here, nor do you need to seek. Instead use a nested loop and buffer the section header:

# Print every '#'-headed section of the file with numbered section lines.
with open(filename) as fh:
    enumerated = enumerate(fh, start=1)

    # seek to first section: skip everything before the first header line
    # (header stays None if the file contains no header at all)
    header = next(
        (line for _, line in enumerated if line.startswith('#')), None)

    # One pass per section. The header found by the inner loop is buffered
    # and handled by the next pass, so no line has to be fetched in between
    # and the first line after a header is never lost.
    while header is not None:
        print("=" * 40)
        print(header.rstrip())

        # Cleared so the outer loop ends if the file ends inside this
        # section; a trailing header without lines is still printed above.
        header = None
        for lineno, line in enumerated:
            if line.startswith('#'):
                # new section
                header = line
                break

            # section line, handle as such
            print("[{}] {}".format(lineno, line.rstrip()))

This buffers the header line only; every time we come across a new header, it is stored and the current section loop is ended.

Demo:

>>> from io import StringIO
>>> demo = StringIO('''\
... #HEADER1, SOME EXTRA INFO
... data first section
... 1 2
... 1 233 
... ...
... // THIS IS A COMMENT
... #HEADER2, SECOND SECTION
... 452
... 134
... // ANOTHER COMMENT
... ...
... #HEADER3, THIRD SECTION
... ''')
>>> enumerated = enumerate(demo, start=1)
>>> header = None
>>> for lineno, line in enumerated:
...     # seek to first section
...     if header is None:
...         if not line.startswith('#'):
...             continue
...         header = line
...     print("=" * 40)
...     print(header.rstrip())
...     for lineno, line in enumerated:
...         if line.startswith('#'):
...             # new section
...             header = line
...             break
...         # section line, handle as such
...         print("[{}] {}".format(lineno, line.rstrip()))
... 
========================================
#HEADER1, SOME EXTRA INFO
[2] data first section
[3] 1 2
[4] 1 233
[5] ...
[6] // THIS IS A COMMENT
========================================
#HEADER2, SECOND SECTION
[9] 134
[10] // ANOTHER COMMENT
[11] ...
>>> header
'#HEADER3, THIRD SECTION\n'

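The third section remains unprocessed because there were no lines in it, but had there been, the header variable has already been set in anticipation. Note also that line 8 (452) is missing from the output of the transcript above: after the inner loop breaks on a header, the outer for statement fetches one more line before the section loop restarts, so the first line following every header after the first one is skipped.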

6
On

You can copy the iterator, and then restore the iterator from that copy. However, you can't copy a file object. You could take a shallow copy of the enumerator and then seek to the respective part of the file when you start using the copied enumerator.

However, the best thing to do would be to write your own iterator class, with a __next__ method to produce line numbers and lines, and mark and recall methods to record and return to a previously recorded state.

class EnumeratedFile:
    """Iterate over a seekable file as ``(lineno, line)`` pairs.

    The line counter lives on the object rather than inside
    ``enumerate``, so :meth:`mark` and :meth:`recall` can save and restore
    it together with the file position.

    Parameters:
        fh: an open file-like object supporting ``readline``, ``tell`` and
            ``seek``.
        lineno_start: number reported for the first line read.
    """

    def __init__(self, fh, lineno_start=1):
        self.fh = fh
        self.lineno = lineno_start

    def __iter__(self):
        return self

    def __next__(self):
        """Return ``(lineno, line)`` for the next line.

        Raises:
            StopIteration: at end of file; the counter is left unchanged.
        """
        # readline() rather than next(self.fh): on text files returned by
        # open(), iterating with next() disables tell(), which would make
        # mark() raise OSError.
        line = self.fh.readline()
        if not line:
            raise StopIteration
        result = self.lineno, line
        self.lineno += 1
        return result

    def mark(self):
        """Record the current line number and file position."""
        self.marked_lineno = self.lineno
        self.marked_file_position = self.fh.tell()

    def recall(self):
        """Return to the state recorded by the last :meth:`mark` call."""
        self.lineno = self.marked_lineno
        self.fh.seek(self.marked_file_position)

You would use it like this:

from io import StringIO

# In-memory stand-in for the sectioned input file.
demo = StringIO('''\
#HEADER1, SOME EXTRA INFO
data first section
1 2
1 233 
...
// THIS IS A COMMENT
#HEADER2, SECOND SECTION
452
134
// ANOTHER COMMENT
...
#HEADER3, THIRD SECTION
''')

e = EnumeratedFile(demo)
seen_header2 = False
for lineno, line in e:
    if not seen_header2:
        if line.startswith("#HEADER1"):
            # Remember the state right after the first header ...
            e.mark()
        elif line.startswith("#HEADER2"):
            # ... and jump back to it once the second header shows up.
            e.recall()
            seen_header2 = True
        continue

    # First line read after the recall: both the text and the line number
    # must be those of the line that followed the first header.
    print(lineno, line)
    assert (lineno, line) == (2, "data first section\n")
    break