Count total unique number of records based on columns from multiple VCF files

920 views

I have about 200 files having long header lines that starts with "#" and then records with 4 columns like the following:

file_1.vcf
##some_comments that span many lines
##some_comments that span many lines
#CRHOM    POS    REF    ALT
chr1   111    A       G
chr2   222    G       T
chrY   99999  A       C

file_2.vcf
##some_comments that span many lines
##some_comments that span many lines
#CRHOM    POS    REF    ALT
chr2   444    G       T
chr2   222    G       T
chrY   7473  C       A

For this simple example with only 2 VCF files, the expected total number of records is 5.

Each of the files contains a million or so records, and I am trying to count the total number of records across all files, with the condition that a record duplicated in any other VCF file must be counted only once. How can I accomplish this? I attempted it in Python, but it gets stuck for a very long time:

    import os
    from pathlib import Path
    import vcf

    path_gdc_data = Path(os.getcwd()+"/all_vcfs/")

    # Track every (CHROM, POS, REF, ALT) key seen so far. A set gives O(1)
    # membership tests; the original code tested membership in a list, which
    # is O(n) per record and is why the script appeared to hang on millions
    # of records. It also used `count += 1` before initializing `count` and
    # referenced undefined names (`indels_coord_uniq`, `non_dup_records`).
    seen = set()
    for vcf_file in path_gdc_data.rglob("*.vcf"):
        vcf_reader = vcf.Reader(filename=str(vcf_file))
        for rec in vcf_reader:
            # rec.ALT is a list in PyVCF, so convert it to a hashable tuple
            # before using it as part of a set key.
            seen.add((rec.CHROM, rec.POS, rec.REF, tuple(map(str, rec.ALT))))

    count = len(seen)
    print(f"Total num of non dup in all vcf are {count}")

Note1: I include pandas and Dataframe tags in my question to see if pandas would be more efficient solution for this.

Note2: Each individual file cannot contain duplicate records, and each file is sorted.

Sample vcf file:

##fileformat=VCFv4.0
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=1000GenomesPilot-NCBI36
##phasing=partial
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
##FILTER=<ID=q10,Description="Quality below 10">
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
#CHROM POS     ID        REF ALT    QUAL FILTER INFO                              FORMAT      NA00001        NA00002        NA00003
20     14370   rs6054257 G      A       29   PASS   NS=3;DP=14;AF=0.5;DB;H2           GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
20     17330   .         T      A       3    q10    NS=3;DP=11;AF=0.017               GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3   0/0:41:3
20     1110696 rs6040355 A      G,T     67   PASS   NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2   2/2:35:4
20     1230237 .         T      .       47   PASS   NS=3;DP=13;AA=T                   GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
20     1234567 microsat1 GTCT   G,GTACT 50   PASS   NS=3;DP=9;AA=G                    GT:GQ:DP    0/1:35:4       0/2:17:2       1/1:40:3

Many thanks in advance.

2

There are 2 best solutions below

8
On

Another method using Pandas:

import pandas as pd
import io
from pathlib import Path

COLS = ["CRHOM", "POS", "REF", "ALT"]

def read_vcf(filepath_or_buffer, usecols=None):
    """Read a VCF file into a DataFrame, skipping the ``##`` meta lines.

    The first non-``##`` line (the ``#CHROM``-style header) supplies the
    column names; *usecols* optionally restricts which columns are loaded.
    """
    with open(filepath_or_buffer) as vcf:
        # Skip the meta-information lines; the first line not starting
        # with "##" is the column header (its leading "#" is stripped).
        while True:
            line = vcf.readline()
            if not line.startswith("##"):
                names = line[1:].split()
                break
        # sep=r"\s+" splits on runs of whitespace. The original passed both
        # sep="\t" and delim_whitespace=True, which raises a ValueError in
        # pandas (and delim_whitespace is deprecated since pandas 2.2).
        return pd.read_table(vcf, sep=r"\s+",
                             usecols=usecols, header=None, names=names)


# Locate all .vcf files under ./all_vcfs and seed the result with the
# records of the first one, keyed on the deduplication columns.
vcf_paths = (Path().cwd() / "all_vcfs").glob("*.vcf")
unique_records = read_vcf(next(vcf_paths), usecols=COLS).set_index(COLS).index


# Fold the remaining files in one at a time; Index.union performs a
# set-union, so duplicate rows collapse automatically.
for vcf_path in vcf_paths:
    file_index = read_vcf(vcf_path, usecols=COLS).set_index(COLS).index
    unique_records = unique_records.union(file_index)

print(f"Total num of non dup in all vcf are {len(unique_records)}")

Example:

file_1.vcf
##some_comments that span many lines
##some_comments that span many lines
#CRHOM    POS    REF    ALT
chr1   111    A       G
chr2   222    G       T
chrY   99999  A       C

file_2.vcf
##some_comments that span many lines
##some_comments that span many lines
#CRHOM    POS    REF    ALT
chr2   444    G       T
chr2   222    G       T
chrY   7473  C       A

file_3.vcf
##some_comments that span many lines
##some_comments that span many lines
#CRHOM    POS    REF    ALT
chr2   444    G       T
chr3   333    G       T
chrY   7473  C       T

Apply this modification on the code above and run:

26c26
<     df = df.merge(df1, on=COLS, how="outer")
---
>     df = df.merge(df1, on=COLS, how="outer", indicator=f"{vcf.stem}")
>>> df
  CRHOM    POS REF ALT      file_2      file_3
0  chr1    111   A   G   left_only   left_only
1  chr2    222   G   T        both   left_only
2  chrY  99999   A   C   left_only   left_only
3  chr2    444   G   T  right_only        both
4  chrY   7473   C   A  right_only   left_only
5  chr3    333   G   T         NaN  right_only
6  chrY   7473   C   T         NaN  right_only
10
On

If the vcf files are sorted and there are no duplicates within any single file, you can use heapq.merge to merge all files together and then use itertools.groupby to group them (I'm assuming we want to count uniqueness across all columns). Then count the number of unique groups. Example:

import vcf
from heapq import merge
from itertools import groupby

# Build the list e.g. from glob(); each file must be sorted and
# internally duplicate-free for the merge/groupby trick to work.
filelist = ["file_1.vcf", "file_2.vcf"]
readers = [vcf.Reader(filename=path) for path in filelist]

# heapq.merge lazily interleaves the sorted streams into one sorted
# stream; groupby collapses runs of equal records, so counting the
# groups counts the distinct records.
uniq = 0
for _record, _run in groupby(merge(*readers)):
    uniq += 1

print("Total number of unique records:", uniq)

Prints (in my example case):

Total number of unique records: 7

EDIT: To count uniques only on CHROM, POS, REF, ALT columns:

import vcf
from heapq import merge
from itertools import groupby

filelist = ["file_1.vcf", "file_2.vcf"]


def _record_keys(path):
    # Project each record onto the columns that define uniqueness.
    for row in vcf.Reader(filename=path):
        yield (row.CHROM, row.POS, row.REF, row.ALT)


# One lazy sorted key-stream per file, merged into a single sorted
# stream; groupby then collapses equal neighbours.
vcfs = [_record_keys(path) for path in filelist]
uniq = sum(1 for _ in groupby(merge(*vcfs)))

print("Total number of unique records:", uniq)