R : Mapping mutually exclusive columns to a single column

254 Views Asked by At

I have a training data set with 40 soil-type indicator columns, named soil_type1 to soil_type40 (abbreviated s1 to s40 below).

 Id ..... Elevation s1 s2 s3 s4 s5.........s40
  1 .....       347  0  1  0  0  0           0
  2 .....       354  0  0  0  1  0           0
  3 .....       554  0  0  1  0  0           0

I want to merge these columns s1 to s40 into a single column s like this.

 Id ..... Elevation  s
 1  .....       347 s2
 2  .....       354 s4
 3  .....       554 s3

I can think of doing it like this, but there has got to be a better way in R.

 train$s <- NA  # start with no soil type assigned
    # Naive approach: one assignment per indicator column, labelling the rows
    # where that column holds the 1 with the column's name.
    train$s[train$s1 == 1] <- "s1"
    train$s[train$s2 == 1] <- "s2"
    # ... one such line for each of s3 to s39 ...
    train$s[train$s40 == 1] <- "s40"

Edit: Please note that there are other data columns present.

4

There are 4 best solutions below

0
On

We can subset the 's' columns, get the index of the 1 in each row with max.col, and cbind the result to the original data

# Positions of the soil-type indicator columns (s1, s2, ...) within `train`.
i1 <- grep("^s\\d+", colnames(train))
# max.col() gives, for every row, the position of the largest value among the
# indicator columns, i.e. the column holding the 1; ties.method = "first"
# keeps the result deterministic. Translating the position into the column
# name yields the requested label even when the indicator columns are not
# numbered consecutively (in the sample data position 6 is s40, not s6).
# NOTE: a row without any 1 is reported as the first indicator column.
cbind(
  train,
  s = colnames(train)[i1][max.col(train[i1], ties.method = "first")]
)
#  Id Elevation s1 s2 s3 s4 s5 s40  s
#1  1       347  0  1  0  0  0   0 s2
#2  2       354  0  0  0  1  0   0 s4
#3  3       554  0  0  1  0  0   0 s3

Or another efficient option is

# Indicator columns as a 0/1 matrix (`i1` holds their positions in `train`).
soil_mat <- as.matrix(train[i1])
# Multiplying each row by 1, 2, ..., k sums to the position of its single 1.
# This relies on the columns being mutually exclusive: a row with several 1s
# would sum to an unrelated position.
soil_pos <- drop(soil_mat %*% seq_len(ncol(soil_mat)))
# A sum of 0 means the row has no 1 at all; mark it as missing rather than
# letting it index nothing.
soil_pos[soil_pos == 0] <- NA
# Translate positions into column names so the label stays correct even when
# the columns are not numbered consecutively (position 6 is s40 here).
cbind(train, s = colnames(soil_mat)[soil_pos])
#  Id Elevation s1 s2 s3 s4 s5 s40  s
#1  1       347  0  1  0  0  0   0 s2
#2  2       354  0  0  0  1  0   0 s4
#3  3       554  0  0  1  0  0   0 s3

data

# Sample data: three observations with an Id, an Elevation and six 0/1
# soil-type indicator columns (s1 to s5 and s40), all stored as integers.
train <- data.frame(
  Id = 1:3,
  Elevation = c(347L, 354L, 554L),
  s1 = c(0L, 0L, 0L),
  s2 = c(1L, 0L, 0L),
  s3 = c(0L, 0L, 1L),
  s4 = c(0L, 1L, 0L),
  s5 = c(0L, 0L, 0L),
  s40 = c(0L, 0L, 0L)
)
1
On

Here we iterate through the rows and, for each one, find which column holds the 1

# Names of the indicator columns; selecting them by pattern keeps other data
# columns (Id, Elevation, ...) out of the search for the 1.
soil_cols <- grep("^s\\d+$", names(df), value = TRUE)
soil_mat <- as.matrix(df[soil_cols])
# Walk over the rows and look up which indicator column holds the 1.
# vapply() guarantees a character vector with one entry per row.
df$s <- vapply(
  seq_len(nrow(soil_mat)),
  function(i) {
    hit <- which(soil_mat[i, ] == 1)
    # Exactly one flag is expected per row; anything else is marked missing.
    if (length(hit) == 1L) soil_cols[hit] else NA_character_
  },
  character(1)
)
# df
#  Id s1 s2 s3 s4 s5 s40  s
#1  1  0  1  0  0  0   0 s2
#2  2  0  0  0  1  0   0 s4
#3  3  0  0  1  0  0   0 s3
2
On

I have a solution with reshape2 and dplyr:

# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later.
library(reshape2)
library(dplyr)

# Toy data: ten rows and seven indicator columns (s0 to s6), all zero to start.
df <- data.frame(
  ID = seq_len(10),
  s0 = rep(0, 10),
  s1 = rep(0, 10),
  s2 = rep(0, 10),
  s3 = rep(0, 10),
  s4 = rep(0, 10),
  s5 = rep(0, 10),
  s6 = rep(0, 10)
)
# Flag one soil type in each of the first six rows; rows 7 to 10 keep none.
df$s0[1] <- 1
df$s1[2] <- 1
df$s2[3] <- 1
df$s3[4] <- 1
df$s4[5] <- 1
df$s5[6] <- 1
# Go from wide to long (one row per ID/column pair, with the column name
# stored in `s`), then keep only the flagged pairs and drop the 0/1 helper
# column. IDs without any flag do not appear in the result.
df <- melt(df, id.vars = "ID", variable.name = "s") %>%
  filter(value == 1) %>%
  select(-value)
0
On

Another base R option using which with the arr.ind = TRUE argument to return the column within the s variables where each row is 1.

# Names of the indicator columns (s1, s2, ..., s40).
soil_cols <- grep("^s\\d+$", names(df), value = TRUE)
# One (row, col) pair per cell equal to 1. which() reports the pairs in
# column-major order, so each pair is written back to its own row explicitly
# instead of assuming the pairs already come in row order.
ones <- which(df[soil_cols] == 1, arr.ind = TRUE)
# Rows without any 1 stay NA rather than shortening the result.
soil_type <- rep(NA_character_, nrow(df))
soil_type[ones[, "row"]] <- soil_cols[ones[, "col"]]
# Keep every non-indicator column and append the merged label.
dfNew <- cbind(df[setdiff(names(df), soil_cols)], s = soil_type)

# dfNew
#   Id Elevation  s
# 1  1       347 s2
# 2  2       354 s4
# 3  3       554 s3

data

# Sample data parsed from an inline, whitespace-separated table whose first
# line supplies the column names.
df <- read.table(
  text = "Id  Elevation s1 s2 s3 s4 s5 s40
1 347  0  1  0  0  0 0
2 354  0  0  0  1  0 0
3  554  0  0  1  0  0 0",
  header = TRUE
)