Python - comparing sublists within two different lists

2k Views Asked by At

I have two lists, each containing sublists in the form [chromosome, start_position, end_position]:

expos_list = [['1', '10', '30'], ['1', '50', '80'], ['1', '100', '200']]  
pos_list = [['1', '12', '25'], ['1', '90', '98'], ['1', '130', '180'], ['2', '25', '50']]  

I want to compare the sublists in 'pos_list' to the sublists in 'expos_list' and then add a 'pos_list' element to 'expos_list' if it is unique and/or is not contained within another expos_list element. So I would like my final output to be:

expos_list = [['1', '10', '30'], ['1', '50', '80'], ['1', '90', '98'], ['1', '100', '200'], ['2', '25', '50']]  

...as this has only unique position ranges for each particular chromosome (chromosome = sublist[0] in both cases).

I have tried:

for expos_element in expos_list:
    for pos_element in pos_list:
        if pos_element[0] == expos_element[0]:
            if pos_element[1] < expos_element[1]:
                if pos_element[2] < expos_element[1]:
                    print("New")
                elif pos_element[2] < expos_element[2]:
                print("Overlapping at 3'")
                else:
                print("Discard")
            elif expos_element[1] <= pos_element[1] < expos_element[2]:
                if pos_element[2] <= expos_element[2]:
                print("Discard")
                else:
                print("Overlapping at 5'")
            else:
            print("Hit is 3' of current existing element. Move on")
        else:
        print("Different chromosome")

Which obviously doesn't do the append to list bit etc, but is specifying whether there is an overlap of the elements. It does this, but compares all elements all the time, giving this output:

Discard
Hit is 3' of current existing element. Move on
Discard
Different chromosome
New
Hit is 3' of current existing element. Move on
New
Different chromosome
Overlapping at 5'
Hit is 3' of current existing element. Move on
Discard
Different chromosome  

This gives 12 lines of output (rather than the desired one line per sublist in pos_list). I'm really struggling to get this working. I guess my ideal output for the above run would be:

Discard
New
Discard
Different chromosome  

Any help would be greatly appreciated. Thanks!

3

There are 3 best solutions below

8
On BEST ANSWER

If you're not that interested in how each item overlaps, simplify the code to your three cases (discard, new, different):

new_items = []
for item in pos_list:
    if not any(x[0] == item[0] for x in expos_list):
        print("Different chromosome")
        new_items.append(item)
    elif any(x[1] < item[1] < x[2] or x[1] < item[2] < x[2]
             for x in expos_list):
        print("Discard")
    else:
        print("New")
        new_items.append(item)
expos_list.extend(new_items)
print(expos_list)

When I run it, I see:

Discard
New
Discard
Different chromosome
[['1', '10', '30'], ['1', '50', '80'], ['1', '100', '200'], ['1', '90', '98'], ['2', '25', '50']]
3
On

I've separated it out a bit; try

from collections import defaultdict
from bisect import bisect_left

class ChromoSegments:
    def __init__(self, cs=None):
        # A list of [(start, end), (start, end), ...] per chromosome;
        #   each list is kept sorted in ascending order
        self.segments = defaultdict(list)

        # Add segments from parameter list
        if cs is not None:
            for chromo,start,end in cs:
                try:
                    self.add_seg(chromo, start, end)
                except ValueError:
                    pass

    def add_seg(self, chromo, start, end):
        seg = self.segments[chromo]
        val = (start, end)
        ndx = bisect_left(seg, val)
        if (ndx == 0 or seg[ndx - 1][1] < start):
            if (ndx == len(seg) or end < seg[ndx][0]):
                seg.insert(ndx, val)
            else:
                # collision with following element
                nstart, nend = seg[ndx]
                raise ValueError('Discard ({}, {}, {}): collision with ({}, {}, {})'.format(chromo, start, end, chromo, nstart, nend))
        else:
            # collision with preceding element
            nstart, nend = seg[ndx - 1]
            raise ValueError('Discard ({}, {}, {}): collision with ({}, {}, {})'.format(chromo, start, end, chromo, nstart, nend))

    def to_list(self):
        keys = sorted(self.segments.keys())
        return [(k, s, e) for k in keys for s,e in self.segments[k]]

def main():
    expos = ChromoSegments([[1, 10, 30], [1, 50, 80], [1, 100, 200]])
    pos = [[1, 12, 25], [1, 90, 98], [1, 130, 180], [2, 25, 50]]

    target_chromo = 1
    for seg in pos:
        if seg[0] != target_chromo:
            print('Different chromosome')
        else:
            try:
                expos.add_seg(*seg)
                print('New')
            except ValueError, e:
                print(e.message)

    print('\nResult: {}'.format(expos.to_list()))

if __name__ == "__main__":
    main()

which produces

Discard (1, 12, 25): collision with (1, 10, 30)
New
Discard (1, 130, 180): collision with (1, 100, 200)
Different chromosome

Result: [(1, 10, 30), (1, 50, 80), (1, 90, 98), (1, 100, 200)]

Note that I wrote the class to properly deal with multiple chromosomes; the 'different chromosome' warning had to be handled separately in main().

Edit:

If you want to deal with multiple chromosomes, main() can be simplified like so:

def main():
    expos = ChromoSegments([[1, 10, 30], [1, 50, 80], [1, 100, 200]])
    pos = [[1, 12, 25], [1, 90, 98], [1, 130, 180], [2, 25, 50]]

    for seg in pos:
        try:
            expos.add_seg(*seg)
            print('New')
        except ValueError, e:
            print(e.message)

    print('\nResult: {}'.format(expos.to_list()))

and the output becomes

Discard (1, 12, 25): collision with (1, 10, 30)
New
Discard (1, 130, 180): collision with (1, 100, 200)
New

Result: [(1, 10, 30), (1, 50, 80), (1, 90, 98), (1, 100, 200), (2, 25, 50)]
0
On

This one generates a range from the starting and end points of your DNA snippets and simply checks if the positions in a new snippet already exist in the existing ranges. Multiple chromosomes taken care of by dict as suggested by someone else.

class DNASegments():
    def __init__(self, segments):
        segments = self._convert(segments)
        self.segments = {}
        for s in segments:
            self.add_segment(s)

    def add_segment(self, s):
        chromosome = s[0]
        positions = range(s[1], s[2]+1)

        if not chromosome in self.segments.keys():
            self.segments[chromosome] = []

        if not any([p in range(ps[0], ps[1]+1) for ps\
                    in self.segments[chromosome] for p in positions]):
            self.segments[chromosome].append([s[1], s[2]])

        self._sort()

    def add_list(self, segments):
        segments = self._convert(segments)
        for s in segments:
            self.add_segment(s)
        self._sort()

    def _convert(self,segments):
        return [[int(s[0]), int(s[1]), int(s[2])] for s in segments]
    def _sort(self):
        for key in self.segments.keys():
            self.segments[key] = sorted(self.segments[key])

    def __repr__(self):
        return str(self.segments)
    def __str__(self):
        return str(self.segments)
    def __getitem__(self, i):
        return self.segments[i]



expos_list = [['1', '10', '30'], ['1', '50', '80'], ['1', '100', '200']]  
pos_list = [['1', '12', '25'], ['1', '90', '98'], ['1', '130', '180'], 
            ['2', '25', '50']]

dnaseg = DNASegments(expos_list)

dnaseg
>>> {1: [[10, 30], [50, 80], [100, 200]]}

dnaseg.add_list(pos_list)

dnaseg
>>> {1: [[10, 30], [50, 80], [90, 98], [100, 200]], 2: [[25, 50]]}