In R, how to transform row values into new columns

Question

In R, how to transform row values into new columns

121 Views Asked by Jeff Johnson At 17 December 2014 at 21:34

I have a dataframe "mydf" created with the following:

# Build a 100-row example: constant x, random y, and a four-value label z
# (25 rows each of "A", "B", "C", "D", in that order).
x <- rep(1, 100)
y <- rnorm(100, 1)
z <- rep(c("A", "B", "C", "D"), each = 25)
# data.frame() keeps x and y numeric. cbind() would first build a character
# matrix (because z is character) and silently turn every column into text.
# z is stored explicitly as a factor so levels(mydf$z) works on any R version.
mydf <- data.frame(x, y, z = factor(z))

As you'll notice, z has four distinct values (A, B, C and D). Now what I want to do is create four new fields named z_A, z_B, z_C and z_D, each populated with a 0 or 1. If mydf$z == "A", then z_A should be 1, otherwise 0; if mydf$z == "B", then z_B should be 1, otherwise 0; and so on for each of the values.

While I only have four distinct values in mydf$z now, I'm trying to get to a general purpose function I can use to binarize any column which can have any number of distinct values.

I've looked at another example on StackOverflow such as:

# "0 +" drops the intercept (same as "- 1"), so every value of z gets its own
# indicator column instead of the first one being absorbed as the baseline.
a <- model.matrix(~ 0 + z, data = mydf)

which works great when the column has only character values.

> a
    zA zB zC zD
1    1  0  0  0
2    1  0  0  0
3    1  0  0  0
4    1  0  0  0
5    1  0  0  0
6    1  0  0  0
7    1  0  0  0
8    1  0  0  0
9    1  0  0  0
10   1  0  0  0

When I try it with column z as a numeric variable:

# Same layout as the first example, but z now holds the numeric codes 1-4
# (25 rows each, in order).
x <- rep(1, times = 100)
y <- rnorm(n = 100, mean = 1)
z <- rep(c(1, 2, 3, 4), each = 25)
mydf <- as.data.frame(cbind(x, y, z))
# z is numeric here, so it enters the design matrix as one ordinary column
# rather than being expanded into indicator columns.
c <- model.matrix(~ z - 1, data = mydf)
c

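I get a single numeric column named z (holding the original values 1 to 4) instead of one 0/1 column per distinct value.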

If I try setting the desired variable as character, I get the following which is fine except the field names are all screwy:

# Converting z to character makes it categorical, so each distinct value gets
# an indicator column; the column names are the term text plus the value.
c <- model.matrix(~ 0 + as.character(z), data = mydf)
> c
    as.character(z)1 as.character(z)2 as.character(z)3 as.character(z)4
1                  1                0                0                0
2                  1                0                0                0
3                  1                0                0                0
4                  1                0                0                0
5                  1                0                0                0
6                  1                0                0                0
7                  1                0                0                0
8                  1                0                0                0
9                  1                0                0                0
10                 1                0                0                0

Please let me know if there's additional info that would be helpful to solve this.

Thank you!

EDIT: Here's an example of some actual data per Howard's feedback.

dput(head(df,100))

structure(list(datetime = structure(c(14975, 14975, 14975, 14975, 
14975, 14975, 14975, 14975, 14975, 14975, 14975, 14975, 14975, 
14975, 14975, 14975, 14975, 14975, 14975, 14975, 14975, 14975, 
14975, 14975, 14976, 14976, 14976, 14976, 14976, 14976, 14976, 
14976, 14976, 14976, 14976, 14976, 14976, 14976, 14976, 14976, 
14976, 14976, 14976, 14976, 14976, 14976, 14976, 14977, 14977, 
14977, 14977, 14977, 14977, 14977, 14977, 14977, 14977, 14977, 
14977, 14977, 14977, 14977, 14977, 14977, 14977, 14977, 14977, 
14977, 14977, 14978, 14978, 14978, 14978, 14978, 14978, 14978, 
14978, 14978, 14978, 14978, 14978, 14978, 14978, 14978, 14978, 
14978, 14978, 14978, 14978, 14978, 14978, 14978, 14979, 14979, 
14979, 14979, 14979, 14979, 14979, 14979), class = "Date"), season = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), holiday = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), workingday = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), weather = c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), temp = c(9.84, 9.02, 
9.02, 9.84, 9.84, 9.84, 9.02, 8.2, 9.84, 13.12, 15.58, 14.76, 
17.22, 18.86, 18.86, 18.04, 17.22, 18.04, 17.22, 17.22, 16.4, 
16.4, 16.4, 18.86, 18.86, 18.04, 17.22, 18.86, 18.86, 17.22, 
16.4, 16.4, 15.58, 14.76, 14.76, 14.76, 14.76, 14.76, 13.94, 
13.94, 13.94, 14.76, 13.12, 12.3, 10.66, 9.84, 9.02, 9.02, 8.2, 
6.56, 6.56, 5.74, 5.74, 5.74, 6.56, 7.38, 8.2, 9.02, 9.84, 10.66, 
10.66, 10.66, 9.84, 9.84, 8.2, 8.2, 7.38, 5.74, 7.38, 6.56, 6.56, 
5.74, 5.74, 4.92, 4.92, 4.92, 5.74, 6.56, 6.56, 9.02, 9.02, 9.84, 
10.66, 11.48, 12.3, 11.48, 10.66, 9.84, 9.84, 9.02, 9.02, 8.2, 
8.2, 6.56, 6.56, 9.84, 9.02, 8.2, 7.38, 8.2), atemp = c(14.395, 
13.635, 13.635, 14.395, 14.395, 12.88, 13.635, 12.88, 14.395, 
17.425, 19.695, 16.665, 21.21, 22.725, 22.725, 21.97, 21.21, 
21.97, 21.21, 21.21, 20.455, 20.455, 20.455, 22.725, 22.725, 
21.97, 21.21, 22.725, 22.725, 21.21, 20.455, 20.455, 19.695, 
17.425, 16.665, 16.665, 17.425, 17.425, 16.665, 16.665, 16.665, 
16.665, 14.395, 13.635, 11.365, 10.605, 11.365, 9.85, 8.335, 
6.82, 6.82, 5.305, 6.82, 6.06, 6.82, 8.335, 9.09, 10.605, 10.605, 
12.12, 12.12, 12.12, 11.365, 12.88, 12.88, 11.365, 9.85, 7.575, 
10.605, 9.09, 9.09, 7.575, 9.09, 7.575, 7.575, 7.575, 7.575, 
7.575, 6.82, 10.605, 11.365, 11.365, 12.88, 13.635, 14.395, 13.635, 
12.88, 12.88, 12.88, 13.635, 12.88, 11.365, 12.88, 9.85, 9.85, 
11.365, 11.365, 9.85, 9.09, 9.09), humidity = c(81L, 80L, 80L, 
75L, 75L, 75L, 80L, 86L, 75L, 76L, 76L, 81L, 77L, 72L, 72L, 77L, 
82L, 82L, 88L, 88L, 87L, 87L, 94L, 88L, 88L, 94L, 100L, 94L, 
94L, 77L, 76L, 71L, 76L, 81L, 71L, 66L, 66L, 76L, 81L, 71L, 57L, 
46L, 42L, 39L, 44L, 44L, 47L, 44L, 44L, 47L, 47L, 50L, 50L, 50L, 
43L, 43L, 40L, 35L, 35L, 30L, 30L, 30L, 30L, 32L, 47L, 47L, 64L, 
69L, 55L, 55L, 59L, 63L, 63L, 68L, 74L, 74L, 69L, 64L, 69L, 51L, 
51L, 56L, 52L, 52L, 49L, 48L, 48L, 48L, 48L, 64L, 64L, 69L, 64L, 
74L, 74L, 48L, 47L, 47L, 43L, 40L), windspeed = c(0, 0, 0, 0, 
0, 6.0032, 0, 0, 0, 0, 16.9979, 19.0012, 19.0012, 19.9995, 19.0012, 
19.9995, 19.9995, 19.0012, 16.9979, 16.9979, 16.9979, 12.998, 
15.0013, 19.9995, 19.9995, 16.9979, 19.0012, 12.998, 12.998, 
19.9995, 12.998, 15.0013, 15.0013, 15.0013, 16.9979, 19.9995, 
8.9981, 12.998, 11.0014, 11.0014, 12.998, 22.0028, 30.0026, 23.9994, 
22.0028, 19.9995, 11.0014, 23.9994, 27.9993, 26.0027, 19.0012, 
26.0027, 12.998, 19.0012, 26.0027, 16.9979, 22.0028, 19.9995, 
19.0012, 19.0012, 16.9979, 16.9979, 15.0013, 7.0015, 0, 7.0015, 
8.9981, 8.9981, 7.0015, 7.0015, 7.0015, 8.9981, 6.0032, 7.0015, 
7.0015, 8.9981, 11.0014, 15.0013, 22.0028, 19.9995, 11.0014, 
12.998, 15.0013, 16.9979, 16.9979, 15.0013, 12.998, 7.0015, 7.0015, 
0, 6.0032, 6.0032, 0, 6.0032, 6.0032, 15.0013, 11.0014, 15.0013, 
12.998, 19.9995), casual = c(3L, 8L, 5L, 3L, 0L, 0L, 2L, 1L, 
1L, 8L, 12L, 26L, 29L, 47L, 35L, 40L, 41L, 15L, 9L, 6L, 11L, 
3L, 11L, 15L, 4L, 1L, 1L, 2L, 2L, 0L, 0L, 0L, 1L, 7L, 16L, 20L, 
11L, 4L, 19L, 9L, 7L, 10L, 1L, 5L, 11L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 1L, 1L, 7L, 11L, 10L, 13L, 8L, 11L, 14L, 9L, 11L, 9L, 8L, 
3L, 3L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 2L, 2L, 5L, 7L, 12L, 
18L, 9L, 17L, 15L, 10L, 3L, 2L, 1L, 0L, 1L, 2L, 0L, 0L, 0L, 0L, 
0L, 0L, 1L, 3L), registered = c(13L, 32L, 27L, 10L, 1L, 1L, 0L, 
2L, 7L, 6L, 24L, 30L, 55L, 47L, 71L, 70L, 52L, 52L, 26L, 31L, 
25L, 31L, 17L, 24L, 13L, 16L, 8L, 4L, 1L, 2L, 1L, 8L, 19L, 46L, 
54L, 73L, 64L, 55L, 55L, 67L, 58L, 43L, 29L, 17L, 20L, 9L, 8L, 
5L, 2L, 1L, 3L, 30L, 63L, 153L, 81L, 33L, 41L, 48L, 53L, 66L, 
58L, 67L, 146L, 148L, 102L, 49L, 49L, 20L, 11L, 5L, 2L, 1L, 2L, 
4L, 36L, 92L, 177L, 98L, 37L, 50L, 66L, 79L, 54L, 48L, 68L, 202L, 
179L, 110L, 53L, 48L, 34L, 9L, 6L, 6L, 2L, 2L, 3L, 33L, 87L, 
192L), count = c(16L, 40L, 32L, 13L, 1L, 1L, 2L, 3L, 8L, 14L, 
36L, 56L, 84L, 94L, 106L, 110L, 93L, 67L, 35L, 37L, 36L, 34L, 
28L, 39L, 17L, 17L, 9L, 6L, 3L, 2L, 1L, 8L, 20L, 53L, 70L, 93L, 
75L, 59L, 74L, 76L, 65L, 53L, 30L, 22L, 31L, 9L, 8L, 5L, 2L, 
1L, 3L, 30L, 64L, 154L, 88L, 44L, 51L, 61L, 61L, 77L, 72L, 76L, 
157L, 157L, 110L, 52L, 52L, 20L, 12L, 5L, 2L, 1L, 2L, 4L, 36L, 
94L, 179L, 100L, 42L, 57L, 78L, 97L, 63L, 65L, 83L, 212L, 182L, 
112L, 54L, 48L, 35L, 11L, 6L, 6L, 2L, 2L, 3L, 33L, 88L, 195L)), .Names = c("datetime", 
"season", "holiday", "workingday", "weather", "temp", "atemp", 
"humidity", "windspeed", "casual", "registered", "count"), row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", 
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", 
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", 
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", 
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57", 
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68", 
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79", 
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90", 
"91", "92", "93", "94", "95", "96", "97", "98", "99", "100"), class = "data.frame")

In this case I want to create df$season1, df$season2, df$season3 and df$season4. My questions here are: (1) do I have to type out every column in the dataset (see the dcast call below) just to add the season1, season2, season3 and season4 fields? And (2) is there a way to name the new columns season1, season2, season3, season4 rather than 1, 2, 3, 4?

 library(reshape2)
 # Every column except season is listed on the left of the formula; season's
 # values become the new columns, filled with row counts via length().
 result <- dcast(
   df,
   datetime + holiday + workingday + weather + temp + atemp +
     humidity + windspeed + casual + registered + count ~ season,
   fun.aggregate = length
 )

Original Q&A

There are 2 best solutions below

**jlhoward** · Answer 1 · 2014-12-17T21:52:15.497000

Like this?

library(reshape2)
# Spread z into one column per value; length() counts the rows that fall in
# each (x, y) combination.
result <- dcast(mydf, x + y ~ z, fun.aggregate = length)
head(result)
#   x                     y A B C D
# 1 1 -0.000774423151445491 0 0 1 0
# 2 1   -0.0206607799566461 1 0 0 0
# 3 1   -0.0374524778732928 1 0 0 0
# 4 1    -0.155943108886233 0 0 1 0
# 5 1    -0.174550152789981 0 0 0 1
# 6 1    -0.183201143770777 0 0 1 0

This does sort by x and y, which may not be desirable.

**nicola** · Answer 2 · 2014-12-17T21:52:28.557000

You can try:

  # Build one 0/1 indicator column per distinct value of z, named z_<value>.
  # levels() is NULL for non-factor columns (numeric codes, or character
  # columns on R >= 4.0), so fall back to the sorted unique values there.
  z_values <- if (is.factor(mydf$z)) levels(mydf$z) else sort(unique(mydf$z))
  # vapply() pins the result shape: one numeric vector of nrow(mydf) per value,
  # returned as a matrix with one column per value.
  indicators <- vapply(
    z_values,
    function(v) as.numeric(mydf$z == v),
    numeric(nrow(mydf))
  )
  colnames(indicators) <- paste0("z_", z_values)
  cbind(mydf, indicators)
  #    x                   y z z_A z_B z_C z_D
  #1   1    1.68876859502969 A   1   0   0   0
  #2   1   0.408336901490438 A   1   0   0   0
  #3   1  -0.115258225333019 A   1   0   0   0
  #4   1    1.64286711975319 A   1   0   0   0
  #5   1   0.393874649038152 A   1   0   0   0
  #6   1    1.55492320680662 A   1   0   0   0

In R, how to transform row values into new columns

There are 2 best solutions below

Related Questions in R

Related Questions in MODEL.MATRIX

Trending Questions

Popular # Hashtags

Popular Questions