How to merge multiple files with two common columns, and name the added col as file name?

212 Views Asked by At

I'm trying to merge multiple .bed files by identifying the first two columns chr and start following this,

Merging multiple files with two common columns, and replace the blank to 0

However, I'm wondering how to make the file name a newly added column name.

$cat combineFWPS_02.sh

    BEGIN {
       for (k=1; k<ARGC; ++k)
          s = s " " 0
    }
    FNR == 1 {
       ++ARGIND
    }
    {
       key=$1 OFS $2
       if (!(key in map))
          map[key] = s
       split(map[key], a)
       a[ARGIND] = $3
       v = ""
       for (k=1; k<ARGC; ++k)
          v = v " " a[k]
       map[key]=v
    }
    END {
       for (k in map)
          print k map[k]
    }

$cat comRwps_02.sh

awkCOM="~/scripts/combineFWPS_02.sh"
## Run the jobs
time awk -f $awkCOM *.xyz.bed | sort -k1 >  13jLiC.xyz.txt 

The input files look like this:

FF85561.xyz.bed:

chr1 111001 234
chr2 22099  108
chr5 463100 219

FF85574.xyz.bed:

chr1 111001 42
chr1 430229 267
chr5 663800 319

FF85631.xyz.bed:

chr1 111001 92
chr3 22099  144
chr5 663800 311

FF85717.xyz.bed:

chr1 111001 129
chr1 157901 79
chr2 22099  442

The expected output file would be

$head 13jLiC.xyz.txt

chr    start    FF85561    FF85574    FF85631    FF85717
chr1   111001    234         42          92         129
chr1   157901      0          0           0          79
chr1   430229      0        267           0           0
chr2    22099    108          0           0         442
chr3    22099      0          0         144           0
chr5   463100    219          0           0           0
chr5   663800      0        319         311           0
4

There are 4 best solutions below

0
On BEST ANSWER

Using any awk:

$ cat tst.awk
BEGIN {
    OFS = "\t"
    vals[++numRows,++numCols] = "chr"
    vals[numRows,++numCols] = "start"
}
FNR == 1 {
    val = FILENAME
    sub(/\..*/,"",val)
    vals[1,++numCols] = val
}
{
    key = $1 FS $2
    if ( !(key in key2rowNr) ) {
        key2rowNr[key] = ++numRows
        vals[numRows,1] = $1
        vals[numRows,2] = $2
    }
    rowNr = key2rowNr[key]
    vals[rowNr,numCols] = $3
}
END {
    for ( rowNr=1; rowNr<=numRows; rowNr++ ) {
        for ( colNr=1; colNr<=numCols; colNr++ ) {
            val = ( (rowNr,colNr) in vals ? vals[rowNr,colNr] : 0 )
            row = ( colNr>1 ? row OFS : "" ) val
        }
        print row
    }
}

$ awk -f tst.awk *.bed
chr     start   FF85561 FF85574 FF85631 FF85717
chr1    111001  234     42      92      129
chr2    22099   108     0       0       442
chr5    463100  219     0       0       0
chr1    430229  0       267     0       0
chr5    663800  0       319     311     0
chr3    22099   0       0       144     0
chr1    157901  0       0       0       79

and if you want the rows sorted then you can apply a Decorate-Sort-Undecorate approach:

$ awk -f tst.awk *.bed | awk -v OFS='\t' '{print (NR>1), $0}' | sort -k1,1n -k2,2 -k3,3n | cut -f2-
chr     start   FF85561 FF85574 FF85631 FF85717
chr1    111001  234     42      92      129
chr1    157901  0       0       0       79
chr1    430229  0       267     0       0
chr2    22099   108     0       0       442
chr3    22099   0       0       144     0
chr5    463100  219     0       0       0
chr5    663800  0       319     311     0

If the chr<number> string can ever have more than 1 digit at the end and you want those sorted both alphabetically and numerically (e.g. so chr2 comes before chr10) then you'd have to change the DSU part to something like:

$ awk -f tst.awk *.bed | awk -v OFS='\t' '{c=$1; sub(/[[:alpha:]]+/,"&" OFS,c); print (NR>1), c, $0}' | sort -k1,1n -k2,2 -k3,3 -k5,5n | cut -f4-
chr     start   FF85561 FF85574 FF85631 FF85717
chr1    111001  234     42      92      129
chr1    157901  0       0       0       79
chr1    430229  0       267     0       0
chr2    22099   108     0       0       442
chr3    22099   0       0       144     0
chr5    463100  219     0       0       0
chr5    663800  0       319     311     0
0
On

With GNU awk (for the multi-dimensional arrays, sorting, BEGINFILE and ARGIND):

awk '
  BEGIN {h="chr" OFS "start"; PROCINFO["sorted_in"]="@ind_str_asc"}
  BEGINFILE {f=FILENAME; sub(/\..*/,"",f); h=h OFS f}
  {a[$1 OFS $2][ARGIND]=$3}
  END {
    print h
    for(i in a) {s=i; for(j=1; j<ARGC; j++) s=s OFS a[i][j]+0; print s}
  }' *.bed | column -t

PROCINFO["sorted_in"]="@ind_str_asc" defines the order of for(i in a) as by indices in ascending order compared as strings. This avoids using sort to obtain the expected output order.

0
On

In GNU awk, with your shown samples and attempts please try following awk solution.

awk '
BEGIN { OFS="\t" }
FNR==1{
  split(FILENAME,fileName,".")
  files[++count]=fileName[1]
}
{
  baseArr[$1 OFS $2]
  arr1[$1 OFS $2]=(arr1[$1 OFS $2]?arr1[$1 OFS $2] OFS:"") $3
  keyPresense[$1 OFS $2 OFS fileName[1]]=$3
}
END{
  printf("%s ","chr start")
  for(i=1;i<=count;i++){
    printf("%s%s",files[i],i==count?ORS:OFS)
  }
  for(j in baseArr){
    for(k=1;k<=count;k++){
      if(j OFS files[k] in keyPresense){
         foundCount++
      }
    val=(val?val OFS:"") (j OFS files[k] in keyPresense? OFS keyPresense[j OFS files[k]]:"0")
    }
    print j,foundCount==length(files)?arr1[j]:val
    val=foundCount=""
  }
}
' *.bed | column -t | sort -sk1
0
On

Either add at the beginning of the END block

    t="chr start"; for(k=1;k<ARGC;++k) t=t OFS ARGV[k]; gsub(/.xyz.bed/,"",t); print t
    # can use " " instead of OFS if you like; I think OFS explains its purpose better

or merge it into the BEGIN block like

    BEGIN { t="chr start"
       for (k=1; k<ARGC; ++k)
          s = s " " 0; t = t OFS ARGV[k] # ditto
       gsub(/.xyz.bed/,"",t); print t
    }

For the values you've shown, your external sort (which wasn't needed with the GNU-awk method in the linked Q because gawk can deliver the data lines already sorted) should leave the header at the beginning. If other values can occur that wo9uldn't do so, either do the sort in the awk script by making (the data part of) the END block

       for (k in map)
          print k map[k] | "sort -k1" # see below

or use this shell hack to exclude the header from the sort

awk ... | { read -r tmp; printf '%s\n' $tmp; sort -k1; } 

and in either case you appear to want to sort on the combination of 'chr' and 'start' so you should have instead -k1,2 .

And finally note in any case, including your original and the gawk version, you need to pipe through column -t to get the visually-aligned columns.