Means within each group with more than 1 column of group indices

56 Views Asked by At

I have a variable and I would like to obtain the means within each group where the group is listed for each observation in a column and I have many such columns. I would then like to associate the group means to the appropriate observation so that if I start with a matrix of m obs x n different groupings I obtain an m x n matrix of means. For example:

> var <- round(runif(10),digits=2)

> var
[1] 0.47 0.21 0.80 0.65 0.32 0.72 0.29 0.93 0.77 0.64
> groupings <- cbind(sample(c(1,2,3), 10, replace=TRUE),
           sample(c(1,2,3), 10, replace=TRUE),
           sample(c(1,2,3,5), 10, replace=TRUE))
> groupings
      [,1] [,2] [,3]
 [1,]    3    1    5
 [2,]    1    1    5
 [3,]    2    1    5
 [4,]    3    2    3
 [5,]    2    3    1
 [6,]    1    1    1
 [7,]    2    3    1
 [8,]    1    2    1
 [9,]    3    1    5
[10,]    1    3    2

I can obtain the means within each group separately with the following (for example)

> means.1 <- sapply(split(var, groupings[,1]), function(x) mean(x))
> means.2 <- sapply(split(var, groupings[,2]), function(x) mean(x))
> means.3 <- sapply(split(var, groupings[,3]), function(x) mean(x))

> means.1
    1     2     3 
0.625 0.470 0.630 
> means.2
    1         2         3 
0.5940000 0.7900000 0.4166667 
> means.3
    1      2      3      5 
0.5650 0.6400 0.6500 0.5625 

But not only are these separate calls inefficient, they still don't get me what I want, which is the following

       [,1]      [,2]   [,3]
[1,]  0.630 0.5940000 0.5625
[2,]  0.625 0.5940000 0.5625
[3,]  0.470 0.5940000 0.5625
[4,]  0.630 0.7900000 0.6500
[5,]  0.470 0.4166667 0.5650
[6,]  0.625 0.5940000 0.5650
[7,]  0.470 0.4166667 0.5650
[8,]  0.625 0.7900000 0.5650
[9,]  0.630 0.5940000 0.5625
[10,] 0.625 0.4166667 0.6400
3

There are 3 best solutions below

2
On BEST ANSWER

Another option, you can use apply(because you already have a matrix) to loop through columns( with Margin set to 2) and pass the column to ave function as group variable, you can either explicitly specify FUN parameter to be mean or not specify it as mean is the default function used:

apply(groupings, 2, ave, x = var)  # pass the var as a named parameter since it is the 
                                   # parameter at the first position of ave function, if not
                                   # ave will treat the column as the first position parameter
                                   # which you don't want to

 #      [,1]      [,2]   [,3]
 #[1,] 0.630 0.5940000 0.5625
 #[2,] 0.625 0.5940000 0.5625
 #[3,] 0.470 0.5940000 0.5625
 #[4,] 0.630 0.7900000 0.6500
 #[5,] 0.470 0.4166667 0.5650
 #[6,] 0.625 0.5940000 0.5650
 #[7,] 0.470 0.4166667 0.5650
 #[8,] 0.625 0.7900000 0.5650
 #[9,] 0.630 0.5940000 0.5625
#[10,] 0.625 0.4166667 0.6400

Or with dplyr, you can use the mutate_all() function:

library(dplyr)
mutate_all(as.data.frame(groupings), funs(ave(var, .)))

#      V1        V2     V3
#1  0.630 0.5940000 0.5625
#2  0.625 0.5940000 0.5625
#3  0.470 0.5940000 0.5625
#4  0.630 0.7900000 0.6500
#5  0.470 0.4166667 0.5650
#6  0.625 0.5940000 0.5650
#7  0.470 0.4166667 0.5650
#8  0.625 0.7900000 0.5650
#9  0.630 0.5940000 0.5625
#10 0.625 0.4166667 0.6400
1
On

Here is one method using ave along with split and sapply

sapply(split(groupings, rep(seq_len(ncol(groupings)), each=nrow(groupings))),
       function(x) ave(var, x, FUN=mean))
              1         2         3
 [1,] 0.4566667 0.5550000 0.3925000
 [2,] 0.6200000 0.5550000 0.3925000
 [3,] 0.4816667 0.5550000 0.3925000
 [4,] 0.4566667 0.5550000 0.6200000
 [5,] 0.4816667 0.5550000 0.4350000
 [6,] 0.4566667 0.5133333 0.6066667
 [7,] 0.4816667 0.0100000 0.4350000
 [8,] 0.4816667 0.5133333 0.3925000
 [9,] 0.4816667 0.5133333 0.6066667
[10,] 0.4816667 0.5550000 0.6066667

The entry to sapply, split(groupings, rep(seq_len(ncol(groupings)), each=nrow(groupings))) produces a list where each list item is a column of the matrix groupings. Each element of this list is fed to sapply which applies ave using the vector to group the results.

data

set.seed(1234)
var <- round(runif(10),digits=2)
groupings <- cbind(sample(c(1,2,3), 10, replace=TRUE),
                   sample(c(1,2,3), 10, replace=TRUE),
                   sample(c(1,2,3,5), 10, replace=TRUE))
4
On
library(dplyr)
set.seed(1000)
var <- round(runif(10),digits=2)

groupings <- cbind(sample(c(1,2,3), 10, replace=TRUE),
                     sample(c(1,2,3), 10, replace=TRUE),
                     sample(c(1,2,3,5), 10, replace=TRUE), var)

df = data.frame(groupings)
df %>% 
  group_by(V1)%>% mutate(x1 =mean(var))%>% ungroup(V1) %>% 
  group_by(V2) %>% mutate(x2=mean(var)) %>% ungroup(V2) %>% 
  group_by(V3) %>% mutate(x3=mean(var)) %>% ungroup(V3)

#      V1    V2    V3   var        x1    x2    x3
#   <dbl> <dbl> <dbl> <dbl>     <dbl> <dbl> <dbl>
#1      2     1     3  0.33 0.4775000 0.322 0.250
#2      3     3     1  0.76 0.6566667 0.470 0.750
#3      1     1     3  0.11 0.1333333 0.322 0.250
#4      3     1     5  0.69 0.6566667 0.322 0.635
#5      3     2     3  0.52 0.6566667 0.630 0.250
#6      1     3     3  0.07 0.1333333 0.470 0.250
#7      2     2     1  0.74 0.4775000 0.630 0.750
#8      2     3     5  0.58 0.4775000 0.470 0.635
#9      1     1     3  0.22 0.1333333 0.322 0.250
#10     2     1     2  0.26 0.4775000 0.322 0.260

# you can simply subset the columns