Pairwise combinations along with counts in data.table

287 Views Asked by At

I have a data.frame d as follows.

d <- structure(list(sno = 1:7, list = c("SD1, SD44, SD384, SD32", 
"SD23, SD1, SD567", "SD42, SD345, SD183", "SD345, SD340, SD387", 
"SD455, SD86, SD39", "SD12, SD315, SD387", "SD32, SD1, SD40")), .Names = c("sno", 
"list"), row.names = c(NA, -7L), class = "data.frame")

d
  sno                   list
1   1 SD1, SD44, SD384, SD32
2   2       SD23, SD1, SD567
3   3     SD42, SD345, SD183
4   4    SD345, SD340, SD387
5   5      SD455, SD86, SD39
6   6     SD12, SD315, SD387
7   7        SD32, SD1, SD40

I want to get the pairwise combinations of all the strings separated by ", " in d$list.

I can get it using lapply as follows.

d2 <- strsplit(d$list, split = ", ")
d2 <- lapply(d2, function(x) as.data.frame(t(combn(x, m=2))))
library(data.table)
d2 <- rbindlist(d2)

I wan't to have the counts of each group in d$list along with the combined list d2 as a new column. How to do this with data.table?

library(stringi)
stri_count_fixed(d$list,", ")

The desired output is be as follows

out <- structure(list(V1 = structure(c(1L, 1L, 1L, 3L, 3L, 2L, 4L, 4L, 
1L, 6L, 6L, 5L, 5L, 5L, 7L, 8L, 8L, 9L, 10L, 10L, 11L, 12L, 12L, 
1L), .Label = c("SD1", "SD384", "SD44", "SD23", "SD345", "SD42", 
"SD340", "SD455", "SD86", "SD12", "SD315", "SD32"), class = "factor"), 
    V2 = structure(c(3L, 2L, 1L, 2L, 1L, 1L, 4L, 5L, 5L, 7L, 
    6L, 6L, 8L, 9L, 9L, 11L, 10L, 10L, 12L, 9L, 9L, 4L, 13L, 
    13L), .Label = c("SD32", "SD384", "SD44", "SD1", "SD567", 
    "SD183", "SD345", "SD340", "SD387", "SD39", "SD86", "SD315", 
    "SD40"), class = "factor"), count = c(4, 4, 4, 4, 4, 4, 3, 
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3)), .Names = c("V1", 
"V2", "count"), row.names = c(NA, -24L), class = "data.frame")


out 
      V1    V2 count
1    SD1  SD44     4
2    SD1 SD384     4
3    SD1  SD32     4
4   SD44 SD384     4
5   SD44  SD32     4
6  SD384  SD32     4
7   SD23   SD1     3
8   SD23 SD567     3
9    SD1 SD567     3
10  SD42 SD345     3
11  SD42 SD183     3
12 SD345 SD183     3
13 SD345 SD340     3
14 SD345 SD387     3
15 SD340 SD387     3
16 SD455  SD86     3
17 SD455  SD39     3
18  SD86  SD39     3
19  SD12 SD315     3
20  SD12 SD387     3
21 SD315 SD387     3
22  SD32   SD1     3
23  SD32  SD40     3
24   SD1  SD40     3
1

There are 1 best solutions below

0
On BEST ANSWER

Using gsub we can remove all characters except the delimiter (,), count the number of characters with nchar, add 1 to get the number of words, and create a new column 'Count' using transform. Using cSplit from splitstackshape, we can split the 'list' column by ,, by specifying the direction as long, we reformat the dataset. Loading splitstackshape will also load data.table, so we can use the data.table methods of aggregating. Grouped by 'sno' and 'Count' (.(sno, Count)), we get the combn of 'list', create two columns ('V1', 'V2') based on alternating values from the combn output, and assign the 'sno' column to NULL (if it is not needed)

library(splitstackshape)
d1 <-  transform(d, Count=nchar(gsub('[^,]', '', list))+1L)
cSplit(d1, 'list', ', ', 'long')[, {
       tmp <- combn(as.character(list), 2)
       list(V1=tmp[c(TRUE, FALSE)], V2= tmp[c(FALSE, TRUE)])
         }, .(sno, Count)][, 
         sno:= NULL]
 #   Count    V1    V2
 #1:     4   SD1  SD44
 #2:     4   SD1 SD384
 #3:     4   SD1  SD32
 #4:     4  SD44 SD384
 #5:     4  SD44  SD32
 #6:     4 SD384  SD32
 #7:     3  SD23   SD1
 #8:     3  SD23 SD567
 #9:     3   SD1 SD567
#10:     3  SD42 SD345
#11:     3  SD42 SD183
#12:     3 SD345 SD183
#13:     3 SD345 SD340
#14:     3 SD345 SD387
#15:     3 SD340 SD387
#16:     3 SD455  SD86
#17:     3 SD455  SD39
#18:     3  SD86  SD39
#19:     3  SD12 SD315
#20:     3  SD12 SD387
#21:     3 SD315 SD387
#22:     3  SD32   SD1
#23:     3  SD32  SD40
#24:     3   SD1  SD40

Or modifying your code, we create the 'Count' column in 'd2' using Map/cbind and as described in the post, do rbindlist to collapse the list to a single 'data.table' object.

library(stringi)
library(data.table)
Count <- stri_count_fixed(d$list,", ")+1
d2 <- strsplit(d$list, split = ", ")
d2 <- lapply(d2, function(x) as.data.frame(t(combn(x, m=2))))
rbindlist(Map(cbind, d2, Count=Count))
#      V1    V2 Count
# 1:   SD1  SD44     4
# 2:   SD1 SD384     4
# 3:   SD1  SD32     4
# 4:  SD44 SD384     4
# 5:  SD44  SD32     4
# 6: SD384  SD32     4
# 7:  SD23   SD1     3
# 8:  SD23 SD567     3
# 9:   SD1 SD567     3
#10:  SD42 SD345     3
#11:  SD42 SD183     3
#12: SD345 SD183     3
#13: SD345 SD340     3
#14: SD345 SD387     3
#15: SD340 SD387     3
#16: SD455  SD86     3
#17: SD455  SD39     3
#18:  SD86  SD39     3
#19:  SD12 SD315     3
#20:  SD12 SD387     3
#21: SD315 SD387     3
#22:  SD32   SD1     3
#23:  SD32  SD40     3
#24:   SD1  SD40     3