Compute interval overlap percentage between different sources

50 Views Asked by At

I have a data frame containing intervals coming from different sources (A, B, C). I would like to compute the pairwise overlap percentage between the intervals of each pair of sources. I wrote the following code. I was wondering if there is an easier/faster way to do this (my data are bigger than this example) and a way to make it more adaptable if I have a fourth source to include.


# Example input: one row per region, with the Start/Stop coordinates of the
# interval reported by each source (A, B, C). NA means the source has no
# interval for that row.
data <- data.frame(
  StartA = c(134000, 765888, 243634, 576098, 398776, 128598, NA),
  StopA  = c(181654, 842465, 244377, 582626, 399102, 129893, NA),
  StartB = c(134023, 765880, 243634, 576098, NA, 128598, 849356),
  StopB  = c(181654, 842465, 244352, 582626, NA, 129893, 868654),
  StartC = c(132065, NA, NA, 592626, 398776, 128698, 867656),
  StopC  = c(191604, NA, NA, 593391, 399102, 129993, 868654)
)

data
   StartA  StopA StartB  StopB StartC  StopC
1 134000 181654 134023 181654 132065 191604
2 765888 842465 765880 842465     NA     NA
3 243634 244377 243634 244352     NA     NA
4 576098 582626 576098 582626 592626 593391
5 398776 399102     NA     NA 398776 399102
6 128598 129893 128598 129893 128698 129993
7     NA     NA 849356 868654 867656 868654

# For loop to compute each overlap for each row
#################################################
# Reciprocal overlap, in percent, between two sets of intervals, row by row.
#
# For each row the shared length is max(0, min(stops) - max(starts)); it is
# expressed as a percentage of each interval's own length and the smaller of
# the two percentages is returned.
#
# start1, stop1: numeric vectors with the bounds of the first interval.
# start2, stop2: numeric vectors with the bounds of the second interval.
# Returns a numeric vector of the same length. A row is NA as soon as one of
# its four bounds is NA, because pmin()/pmax() propagate NA by default.
reciprocal_overlap <- function(start1, stop1, start2, stop2) {
  shared <- pmax(0, pmin(stop1, stop2) - pmax(start1, start2))
  pmin(
    shared / (stop1 - start1) * 100,
    shared / (stop2 - start2) * 100
  )
}

# Pairs of sources to compare. The order fixes the order in which the
# overlap_reciproq_<X><Y> columns are appended to `data`. To include another
# source, add its pairs here, e.g. c("A", "D").
source_pairs <- list(c("A", "B"), c("B", "C"), c("A", "C"))

# Whole columns are processed at once, so there is no loop over rows and a
# zero-row data frame is handled without special casing.
for (pair in source_pairs) {
  first <- pair[1]
  second <- pair[2]
  data[[paste0("overlap_reciproq_", first, second)]] <- reciprocal_overlap(
    data[[paste0("Start", first)]], data[[paste0("Stop", first)]],
    data[[paste0("Start", second)]], data[[paste0("Stop", second)]]
  )
}


# Output
#############################

  StartA  StopA StartB  StopB StartC  StopC overlap_reciproq_AB overlap_reciproq_BC overlap_reciproq_AC
1 134000 181654 134023 181654 132065 191604            99.95174            79.99966            80.03829
2 765888 842465 765880 842465     NA     NA            99.98955                  NA                  NA
3 243634 244377 243634 244352     NA     NA            96.63526                  NA                  NA
4 576098 582626 576098 582626 592626 593391           100.00000             0.00000             0.00000
5 398776 399102     NA     NA 398776 399102                  NA                  NA           100.00000
6 128598 129893 128598 129893 128698 129993           100.00000            92.27799            92.27799
7     NA     NA 849356 868654 867656 868654                  NA             5.17152                  NA
1

There is 1 best solution below

0
CPB On

One approach which should scale to larger data sets and more sources uses foverlaps from the data.table package. Note the ordering of the overlap columns is alphabetical in the final output.

library(data.table)
library(magrittr)

# Example input: Start/Stop coordinates of the interval reported by each
# source (A, B, C) for every row; NA means the source has no interval there.
data <- data.table(
  StartA = c(134000, 765888, 243634, 576098, 398776, 128598, NA),
  StopA  = c(181654, 842465, 244377, 582626, 399102, 129893, NA),
  StartB = c(134023, 765880, 243634, 576098, NA, 128598, 849356),
  StopB  = c(181654, 842465, 244352, 582626, NA, 129893, 868654),
  StartC = c(132065, NA, NA, 592626, 398776, 128698, 867656),
  StopC  = c(191604, NA, NA, 593391, 399102, 129993, 868654)
)

# Convert to the appropriate format for foverlaps ----
# measure.vars is passed explicitly so melt() does not have to guess which
# columns to stack (guessing raises a warning).
dt <- melt(data, measure.vars = names(data), variable.factor = FALSE)
# Column names are "<Start|Stop><source>". Splitting on the prefix rather than
# on the last character keeps source labels longer than one letter intact.
dt[, event := sub("^(Start|Stop).*$", "\\1", variable)]
dt[, type := sub("^(Start|Stop)", "", variable)]
# Row number of the original table, restarted for every melted column.
dt[, index := seq_len(.N), by = variable]

# One row per (source, original row) that actually has an interval.
dt_wide <- dcast(
  dt[!is.na(value)],
  type + index ~ event,
  value.var = "value"
)
# Key on index first: foverlaps() matches the leading key columns by equality
# and treats only the last two as the interval, so intervals are compared
# within the same original row and never across rows.
setkey(dt_wide, index, Start, Stop)

# Calculate the overlaps ----
olap <- foverlaps(dt_wide, dt_wide)
# Keep each unordered pair of sources once; this also drops self-matches.
olap <- olap[type < i.type]

olap[, overlap := pmin(Stop, i.Stop) - pmax(Start, i.Start)]
olap[, dir := paste0(type, i.type)]
olap[, dir_prop := overlap / (Stop - Start)]
olap[, rev_dir_prop := overlap / (i.Stop - i.Start)]

# Calculate the final percentages ----
olap[, min_overlap := 100 * pmin(dir_prop, rev_dir_prop)]

# Restructure to the output format in the question ----
index_data <- dcast(dt, index ~ type + event, value.var = "value")
overlaps <- dcast(olap, index ~ dir, value.var = "min_overlap") %>%
  setnames(., names(.), paste0("overlap_reciproq_", names(.)))
# Joining onto index_data keeps every original row. Pairs that foverlaps() did
# not report (a missing interval, or two intervals in the same row that do not
# intersect) therefore come out as NA rather than 0.
out <- overlaps[index_data, on = .(overlap_reciproq_index = index)]
setcolorder(out, setdiff(names(index_data), "index"))

# Delete the extra index column.
out[, overlap_reciproq_index := NULL]

   A_Start A_Stop B_Start B_Stop C_Start C_Stop overlap_reciproq_AB overlap_reciproq_AC overlap_reciproq_BC
     <num>  <num>   <num>  <num>   <num>  <num>               <num>               <num>               <num>
1:  134000 181654  134023 181654  132065 191604            99.95174            80.03829            79.99966
2:  765888 842465  765880 842465      NA     NA            99.98955                  NA                  NA
3:  243634 244377  243634 244352      NA     NA            96.63526                  NA                  NA
4:  576098 582626  576098 582626  592626 593391           100.00000                  NA                  NA
5:  398776 399102      NA     NA  398776 399102                  NA           100.00000                  NA
6:  128598 129893  128598 129893  128698 129993           100.00000            92.27799            92.27799
7:      NA     NA  849356 868654  867656 868654                  NA                  NA             5.17152