Mutate a new variable across multiple variables

132 Views Asked by At

I have this following data.frame. Each value in CP variables are in the same format -> hueX:Y-Z.

hueX is always the same in a row.

I would like to create another variable with value equal to

  • the value of CP_A if TCK_A!="Yes",
  • otherwise I would like the output to be equal to "hueX:mean(Y)-mean(Z)" across CP_B, CP_C and CP_D
data_trial = data.frame(hue=c(2,8,3,2,5),
          CP_A=c("hue2:6789-99987", "hue8:7854-98743","hue3:60987-123423","hue2:7658-873457","hue5:45658-676549"),
          CP_B=c("hue2:6782-99987", "hue8:7859-98734","hue3:60989-123407","","hue5:45697-676598"),
          CP_C=c("hue2:6785-99989", "hue8:6797-99980","hue3:60995-123434","hue2:7657-8734509","hue5:45667-676500"),
          CP_D=c("", "hue8:6756-99987","hue3:60942-123412","hue2:7650-87345065","hue5:45699-676565"),
          TCK_A=c("Yes", "", "Yes", "Yes", "Yes"))

> data_trial
  hue              CP_A              CP_B              CP_C               CP_D TCK_A
1   2   hue2:6789-99987   hue2:6782-99987   hue2:6785-99989                      Yes
2   8   hue8:7854-98743   hue8:7859-98734   hue8:6797-99980    hue8:6756-99987      
3   3 hue3:60987-123423 hue3:60989-123407 hue3:60995-123434  hue3:60942-123412   Yes
4   2  hue2:7658-873457                   hue2:7657-8734509 hue2:7650-87345065   Yes
5   5 hue5:45658-676549 hue5:45697-676598 hue5:45667-676500  hue5:45699-676565   Yes


output

  hue              CP_A              CP_B              CP_C               CP_D TCK_A             output
1   2   hue2:6789-99987   hue2:6782-99987   hue2:6785-99989                      Yes    hue2:6784-99988
2   8   hue8:7854-98743   hue8:7859-98734   hue8:6797-99980    hue8:6756-99987          hue8:7854-98743
3   3 hue3:60987-123423 hue3:60989-123407 hue3:60995-123434  hue3:60942-123412   Yes  hue3:60975-123418
4   2  hue2:7658-873457                   hue2:7657-8734509 hue2:7650-87345065   Yes hue2:7654-48039787
5   5 hue5:45658-676549 hue5:45697-676598 hue5:45667-676500  hue5:45699-676565   Yes  hue5:45688-676554

What I have tried :


  data_trial %>% 
  separate(CP_B, into=c("hueX_B","Y_BxZ_B"), sep=":") %>%
  separate(CP_C, into=c("hueX_C","Y_CxZ_C"), sep=":") %>%
  separate(CP_D, into=c("hueX_D","Y_DxZ_D"), sep=":") %>%
  separate(Y_BxZ_B, into=c("Y_B", "Z_B"), sep="-") %>%
  separate(Y_CxZ_C, into=c("Y_C", "Z_C"), sep="-") %>%
  separate(Y_DxZ_D, into=c("Y_D", "Z_D"), sep="-") %>%
  mutate(Y_B=as.numeric(Y_B)) %>%
  mutate(Y_C=as.numeric(Y_C)) %>%
  mutate(Y_D=as.numeric(Y_D)) %>%
  mutate(Z_B=as.numeric(Z_B)) %>%
  mutate(Z_C=as.numeric(Z_C)) %>%
  mutate(Z_D=as.numeric(Z_D)) %>%
  rowwise %>%
  mutate(CP_output=ifelse(TCK_A=="Yes", paste0("hue", hue, ":", mean(across(c(Y_B, Y_C, Y_D)), na.rm=TRUE), "-", mean(across(c(Z_B,Z_C,Z_D)), na.rm=TRUE)), CP_A))

# A tibble: 5 × 13
# Rowwise: 
    hue CP_A              hueX_B   Y_B    Z_B hueX_C   Y_C     Z_C hueX_D   Y_D      Z_D TCK_A CP_output      
  <dbl> <chr>             <chr>  <dbl>  <dbl> <chr>  <dbl>   <dbl> <chr>  <dbl>    <dbl> <chr> <chr>          
1     2 hue2:6789-99987   "hue2"  6782  99987 hue2    6785   99989 ""        NA       NA "Yes" hue2:NA-NA     
2     8 hue8:7854-98743   "hue8"  7859  98734 hue8    6797   99980 "hue8"  6756    99987 ""    hue8:7854-98743
3     3 hue3:60987-123423 "hue3" 60989 123407 hue3   60995  123434 "hue3" 60942   123412 "Yes" hue3:NA-NA     
4     2 hue2:7658-873457  ""        NA     NA hue2    7657 8734509 "hue2"  7650 87345065 "Yes" hue2:NA-NA     
5     5 hue5:45658-676549 "hue5" 45697 676598 hue5   45667  676500 "hue5" 45699   676565 "Yes" hue5:NA-NA 
  

The first steps are working but might be simplified. The last step is not working. I have correct result only in the FALSE condition.

I still have difficulties to work accross multiple columns and repeat some identical operations on multiples columns. Any ideas to help me.

EDIT

I have also tried this following code and I get correct means but uncorrect hue value (the contrary compared to my first attempt)


data_trials_2 = data_trial %>% 
  separate(CP_B, into=c("hueX_B","Y_BxZ_B"), sep=":") %>%
  separate(CP_C, into=c("hueX_C","Y_CxZ_C"), sep=":") %>%
  separate(CP_D, into=c("hueX_D","Y_DxZ_D"), sep=":") %>%
  separate(Y_BxZ_B, into=c("Y_B", "Z_B"), sep="-") %>%
  separate(Y_CxZ_C, into=c("Y_C", "Z_C"), sep="-") %>%
  separate(Y_DxZ_D, into=c("Y_D", "Z_D"), sep="-") %>%
  mutate(Y_B=as.numeric(Y_B)) %>%
  mutate(Y_C=as.numeric(Y_C)) %>%
  mutate(Y_D=as.numeric(Y_D)) %>%
  mutate(Z_B=as.numeric(Z_B)) %>%
  mutate(Z_C=as.numeric(Z_C)) %>%
  mutate(Z_D=as.numeric(Z_D))

  data_trials_2$CP_output= paste0("hue", rowwise(data_trials_2[,1]), ":", round(rowMeans(data_trials_2[,c(4,7,10)], na.rm=TRUE)) , "-",    round(rowMeans(data_trials_2[,c(5,8,11)], na.rm=TRUE)))

data_trials_2$CP_output
[1] "huec(2, 8, 3, 2, 5):6783.5-99988"                     
[2] "huec(2, 8, 3, 2, 5):7137.33333333333-99567"           
[3] "huec(2, 8, 3, 2, 5):60975.3333333333-123417.666666667"
[4] "huec(2, 8, 3, 2, 5):7653.5-48039787"                  
[5] "huec(2, 8, 3, 2, 5):45687.6666666667-676554.333333333"
4

There are 4 best solutions below

6
On BEST ANSWER

separate from tidyr has been superseded, so I will use separate_wider_delim.

Once columns are separated, you end up with convenient column names to use tidy-select functions like contains, allowing you to mutate across in place of multiple mutate calls.

This example doesn't keep the original column data, but you can by setting cols_remove = FALSE in separate_wider_delim.

Edit Updated to replace rowwise operations with rowMeans.

library(dplyr)
library(tidyr)
data_trial %>%
  separate_wider_delim(starts_with('CP'), delim = ':', names = c('hue', 'values'),
                       names_sep = '_', too_few = 'align_end') %>%
  separate_wider_delim(ends_with('_values'), delim = '-', names = c('Y', 'Z'),
                       names_sep = '_', too_few = 'align_end') %>%
  mutate(across(contains('values'), as.numeric)) %>%
  mutate(mean_Y = rowMeans(across(matches("[^A]_values_Y")), na.rm = T),
         mean_Z = rowMeans(across(matches("[^A]_values_Z")), na.rm = T)) %>%
  mutate(output = if_else(TCK_A != 'Yes',
                          paste0('hue',hue,':',CP_A_values_Y,'-',CP_A_values_Z),
                          paste0('hue',hue,':', mean_Y, '-', mean_Z))) %>%
  glimpse()
#> Rows: 5
#> Columns: 17
#> $ hue           <dbl> 2, 8, 3, 2, 5
#> $ CP_A_hue      <chr> "hue2", "hue8", "hue3", "hue2", "hue5"
#> $ CP_A_values_Y <dbl> 6789, 7854, 60987, 7658, 45658
#> $ CP_A_values_Z <dbl> 99987, 98743, 123423, 873457, 676549
#> $ CP_B_hue      <chr> "hue2", "hue8", "hue3", NA, "hue5"
#> $ CP_B_values_Y <dbl> 6782, 7859, 60989, NA, 45697
#> $ CP_B_values_Z <dbl> 99987, 98734, 123407, NA, 676598
#> $ CP_C_hue      <chr> "hue2", "hue8", "hue3", "hue2", "hue5"
#> $ CP_C_values_Y <dbl> 6785, 6797, 60995, 7657, 45667
#> $ CP_C_values_Z <dbl> 99989, 99980, 123434, 8734509, 676500
#> $ CP_D_hue      <chr> NA, "hue8", "hue3", "hue2", "hue5"
#> $ CP_D_values_Y <dbl> NA, 6756, 60942, 7650, 45699
#> $ CP_D_values_Z <dbl> NA, 99987, 123412, 87345065, 676565
#> $ TCK_A         <chr> "Yes", "", "Yes", "Yes", "Yes"
#> $ mean_Y        <dbl> 6783.500, 7137.333, 60975.333, 7653.500, 45687.667
#> $ mean_Z        <dbl> 99988.0, 99567.0, 123417.7, 48039787.0, 676554.3
#> $ output        <chr> "hue2:6783.5-99988", "hue8:7854-98743", "hue3:60975.3333…

Data

data_trial <- data.frame(hue=c(2,8,3,2,5),
                        CP_A=c("hue2:6789-99987", "hue8:7854-98743","hue3:60987-123423","hue2:7658-873457","hue5:45658-676549"),
                        CP_B=c("hue2:6782-99987", "hue8:7859-98734","hue3:60989-123407","","hue5:45697-676598"),
                        CP_C=c("hue2:6785-99989", "hue8:6797-99980","hue3:60995-123434","hue2:7657-8734509","hue5:45667-676500"),
                        CP_D=c("", "hue8:6756-99987","hue3:60942-123412","hue2:7650-87345065","hue5:45699-676565"),
                        TCK_A=c("Yes", "", "Yes", "Yes", "Yes"))
0
On

Here's another, relatively concise way, which uses pmap(), a short function and glue:

f <- \(x, y) str_extract(x, y) |> as.numeric() |> mean(na.rm = T) |> round()
data_trial |> mutate(output = ifelse(TCK_A != "Yes", CP_A, pmap(across(CP_B:CP_D), ~ glue::glue("{str_extract(c(...), '.*:')[1]}{f(c(...), '(?<=:)(\\\\d+)(?=-)')}-{f(c(...), '(?<=-)(\\\\d+)')}"))))

Output:

  hue              CP_A              CP_B              CP_C               CP_D
1   2   hue2:6789-99987   hue2:6782-99987   hue2:6785-99989                   
2   8   hue8:7854-98743   hue8:7859-98734   hue8:6797-99980    hue8:6756-99987
3   3 hue3:60987-123423 hue3:60989-123407 hue3:60995-123434  hue3:60942-123412
4   2  hue2:7658-873457                   hue2:7657-8734509 hue2:7650-87345065
5   5 hue5:45658-676549 hue5:45697-676598 hue5:45667-676500  hue5:45699-676565
  TCK_A            output
1   Yes   hue2:6784-99988
2         hue8:7854-98743
3   Yes hue3:60975-123418
4   Yes   NA7654-48039787
5   Yes hue5:45688-676554
2
On

otherwise I would like the output to be equal to "hueX:mean(Y)-mean(Z)" across CP_B, CP_C and CP_D

Here a first version of an apporach in base R. It seems like I have correctly anticipated that you want row-wise means.

fst = round(colMeans(
  apply(data_trial[, c("CP_B", "CP_C", "CP_D")], 1L,
        \(x) as.numeric(gsub(".*hue[0-9]\\:(.+)\\-.*", "\\1", x))), 
  na.rm = TRUE), 0L)
snd = round(colMeans(
  apply(data_trial[, c("CP_B", "CP_C", "CP_D")], 1L,
        \(x) as.numeric(gsub(".*\\-", "\\1", x))), 
  na.rm = TRUE), 0L)

data_trial$another_var = 
  ifelse(data_trial$TCK_A != "Yes", 
         data_trial$CP_A,
         paste0(gsub("\\:.*", "\\1", data_trial$CP_A), ":", fst, "-", snd))

data_trial
#>   hue              CP_A              CP_B              CP_C               CP_D
#> 1   2   hue2:6789-99987   hue2:6782-99987   hue2:6785-99989                   
#> 2   8   hue8:7854-98743   hue8:7859-98734   hue8:6797-99980    hue8:6756-99987
#> 3   3 hue3:60987-123423 hue3:60989-123407 hue3:60995-123434  hue3:60942-123412
#> 4   2  hue2:7658-873457                   hue2:7657-8734509 hue2:7650-87345065
#> 5   5 hue5:45658-676549 hue5:45697-676598 hue5:45667-676500  hue5:45699-676565
#>   TCK_A        another_var
#> 1   Yes    hue2:6784-99988
#> 2          hue8:7854-98743
#> 3   Yes  hue3:60975-123418
#> 4   Yes hue2:7654-48039787
#> 5   Yes  hue5:45688-676554

Created on 2023-12-07 with reprex v2.0.2

Data

data_trial = data.frame(hue=c(2,8,3,2,5),
                        CP_A=c("hue2:6789-99987", "hue8:7854-98743","hue3:60987-123423","hue2:7658-873457","hue5:45658-676549"),
                        CP_B=c("hue2:6782-99987", "hue8:7859-98734","hue3:60989-123407","","hue5:45697-676598"),
                        CP_C=c("hue2:6785-99989", "hue8:6797-99980","hue3:60995-123434","hue2:7657-8734509","hue5:45667-676500"),
                        CP_D=c("", "hue8:6756-99987","hue3:60942-123412","hue2:7650-87345065","hue5:45699-676565"),
                        TCK_A=c("Yes", "", "Yes", "Yes", "Yes"))

2
On

Edited. Comment on the code you have. I keep 3 decimal places, assuming you don't want them endlessly long, but you can change as you wish.

data_trial %>% 
  separate(CP_B, into=c("hueX_B","Y_BxZ_B"), sep=":") %>%
  separate(CP_C, into=c("hueX_C","Y_CxZ_C"), sep=":") %>%
  separate(CP_D, into=c("hueX_D","Y_DxZ_D"), sep=":") %>%
  separate(Y_BxZ_B, into=c("Y_B", "Z_B"), sep="-") %>%
  separate(Y_CxZ_C, into=c("Y_C", "Z_C"), sep="-") %>%
  separate(Y_DxZ_D, into=c("Y_D", "Z_D"), sep="-") %>%
  mutate(Y_B=as.numeric(Y_B)) %>%
  mutate(Y_C=as.numeric(Y_C)) %>%
  mutate(Y_D=as.numeric(Y_D)) %>%
  mutate(Z_B=as.numeric(Z_B)) %>%
  mutate(Z_C=as.numeric(Z_C)) %>%
  mutate(Z_D=as.numeric(Z_D)) %>%
  rowwise %>%
  mutate(CP_output=ifelse(TCK_A=="Yes", paste0("hue", hue, ":", mean(c(Y_B, Y_C, Y_D),na.rm=TRUE)%>%round(3), "-", mean(c(Z_B,Z_C,Z_D), na.rm=TRUE))%>%round(3) , CP_A%>%as.character()))
#it gives
# A tibble: 5 x 13
# Rowwise: 
    hue CP_A              hueX_B   Y_B    Z_B hueX_C   Y_C     Z_C hueX_D   Y_D      Z_D TCK_A CP_output                
  <dbl> <fct>             <chr>  <dbl>  <dbl> <chr>  <dbl>   <dbl> <chr>  <dbl>    <dbl> <fct> <chr>                    
1     2 hue2:6789-99987   "hue2"  6782  99987 hue2    6785   99989 ""        NA       NA "Yes" hue2:6783.5-99988        
2     8 hue8:7854-98743   "hue8"  7859  98734 hue8    6797   99980 "hue8"  6756    99987 ""    hue8:7854-98743          
3     3 hue3:60987-123423 "hue3" 60989 123407 hue3   60995  123434 "hue3" 60942   123412 "Yes" hue3:60975.333-123417.667
4     2 hue2:7658-873457  ""        NA     NA hue2    7657 8734509 "hue2"  7650 87345065 "Yes" hue2:7653.5-48039787     
5     5 hue5:45658-676549 "hue5" 45697 676598 hue5   45667  676500 "hue5" 45699   676565 "Yes" hue5:45687.667-676554.333

Is this what you are trying to achieve? The key issue you got is the incorrect use of across(), and potentially CP_A being character. Just use mean() function to get the mean from multiple column is fine.