Editing Arules Data Frame in R

426 Views Asked by At

Hi there, I have transformed my arules rules into a data frame for further analysis, but the problem is that my data frame looks like this:

# Sample data: one association rule per row, stored as text in the form
# "{lhs items} => {rhs item}".
df <- data.frame(
  rules = c(
    "{45107} => {62557}",
    "{17759} => {60521 }",
    "{53721} => {53720}",
    "{63830} => {17753}",
    "{45413} => {45412}",
    "{3885,59800,17759} => {4749}",
    "{17721,55906} => {9314}"
  )
)

    rules
{45107} => {62557}
{17759} => {60521 }
{53721} => {53720}
{63830} => {17753}
{45413} => {45412}
{3885,59800,17759} => {4749}
{17721,55906} => {9314}

Can you help me change my data frame into this format?

lhs1    lhs2    lhs3    rhs
45107                   62557
17759                   60521
53721                   53720
63830                   17753
45413                   45412
3885    59800   17759   4749
17721   55906           9314
3

There are 3 best solutions below

4
On BEST ANSWER

Using your data.frame df, and putting all the numbers after => in rhs:

# Determine the maximum number of "lhs" items. There are two options:
#   option 1: if there are few rules and the maximum is obvious, hard-code it:
maxlhs <- 3
#   option 2: if there are many rules, compute it from the data.
#   Only the text before "=>" is inspected and the items themselves are
#   counted. Counting commas with length(gregexpr(",", x)[[1]]) is unreliable:
#   gregexpr() returns -1 (a vector of length 1) when there is no match, so a
#   single-item lhs would be reported as 2, and commas in the rhs would be
#   counted as well.
rule_text <- as.character(df$rules)  # also works if the column is a factor
lhs_text <- gsub("[ }{]", "", sub("=>.*$", "", rule_text))
maxlhs <- max(lengths(strsplit(lhs_text, ",", fixed = TRUE)))

# Create the new table by "reformatting" the rules: each rule becomes one
# numeric row holding the lhs items (padded with NA up to maxlhs) followed by
# the rhs. The result is a numeric matrix with one row per rule. A rhs that
# holds more than one item cannot be converted and comes out as NA (with a
# coercion warning).
newdf <- t(vapply(rule_text, function(rule) {
  # strip braces and blanks, then separate the two sides of the rule
  sides <- strsplit(gsub("[ }{]", "", rule), "=>", fixed = TRUE)[[1]]
  lhs <- strsplit(sides[1], ",", fixed = TRUE)[[1]]
  as.numeric(c(lhs, rep(NA, maxlhs - length(lhs)), sides[2]))
}, numeric(maxlhs + 1), USE.NAMES = FALSE))
# Name the columns lhs1..lhsN and rhs.
colnames(newdf) <- c(paste0("lhs", seq_len(maxlhs)), "rhs")

> newdf
      lhs1  lhs2  lhs3   rhs
[1,] 45107    NA    NA 62557
[2,] 17759    NA    NA 60521
[3,] 53721    NA    NA 53720
[4,] 63830    NA    NA 17753
[5,] 45413    NA    NA 45412
[6,]  3885 59800 17759  4749
[7,] 17721 55906    NA  9314

Is that OK, or do you want the new data.frame to look exactly like the one displayed in your question?

0
On
# your data
library(stringr)
# Sample rules, one per row, kept as plain character strings.
data <- data.frame(
  rules = c(
    "{45107} => {62557}",
    "{17759} => {60521 }",
    "{53721} =>     {53720}",
    "{63830} => {17753}",
    "{45413} => {45412}",
    "{3885,59800,17759} => {4749}",
    "{17721,55906} => {9314}"
  ),
  stringsAsFactors = FALSE
)

# Extract every run of digits from each rule; within a rule the lhs items come
# first and the rhs is the last element.
lhs <- str_extract_all(data$rules, "\\d+")
# Largest number of items found in any single rule (lhs items plus the rhs).
mx <- max(lengths(lhs))

# Build one row per rule. Rules shorter than the longest one get NA inserted
# between their lhs items and their final (rhs) element so that the rhs always
# lands in the last column. The result is a character matrix.
do.call(rbind, lapply(lhs, function(nums) {
  n <- length(nums)
  if (n >= mx) {
    return(nums)
  }
  c(nums[-n], rep(NA, mx - n), nums[n])
}))

     [,1]    [,2]    [,3]    [,4]   
[1,] "45107" NA      NA      "62557"
[2,] "17759" NA      NA      "60521"
[3,] "53721" NA      NA      "53720"
[4,] "63830" NA      NA      "17753"
[5,] "45413" NA      NA      "45412"
[6,] "3885"  "59800" "17759" "4749" 
[7,] "17721" "55906" NA      "9314" 
0
On

You could also do something like this, which should be quite efficient.

library(splitstackshape)  ## for cSplit() and loads data.table package

# Strip braces and blanks, then split every rule at "=>" into its two sides.
cleaned <- gsub("[{} ]", "", df$rules)
# Two-column table: V1 holds the comma-separated lhs, V2 holds the rhs.
dt <- data.table(do.call(rbind, strsplit(cleaned, "=>")))
# Spread the comma-separated lhs over one column per item (V1_1, V1_2, ...).
lhs_wide <- cSplit(dt[, .(V1)], splitCols = "V1", sep = ",")
# Put the rhs column back next to the widened lhs columns.
cbind(lhs_wide, dt[, .(V2)])

#     V1_1  V1_2  V1_3    V2
# 1: 45107    NA    NA 62557
# 2: 17759    NA    NA 60521
# 3: 53721    NA    NA 53720
# 4: 63830    NA    NA 17753
# 5: 45413    NA    NA 45412
# 6:  3885 59800 17759  4749
# 7: 17721 55906    NA  9314