I would like someone to suggest a faster way complete the following steps:
- Using the variables in
orig_df, generate unique combination of variables. - Multiply the unique combination of variables from step 1 and
cbindthe new variables toorig_df.
My real data frame contains anywhere from 1,000 to 5,000 variables (columns) so my current code on my real world data frames is quite slow as the combination of variables grows significantly as the number of original variables increases. For the sake of simplicity, I have included a reprex data frame with 24 variables.
library(tidyverse)
library(data.table)
## reprex data
orig_df <- structure(list(name1 = c("a", "a", "a", "a", "a", "a", "a", "a",
"a", "a"), name2 = c("b", "b", "b", "b", "b", "b", "b", "b",
"b", "b"), a1 = c(2.02914670105014, 3.4823452794778, 2.10650327936948,
2.60655830350524, 1.96007832755073, 2.20319900226866, 2.2695046408281,
2.21424994202857, 2.28331831552798, 2.7281186408642), a2 = c(2.34996652741376,
3.70095020360926, 2.23917726097417, 1.78633957970163, 2.2069085425937,
2.50808324747805, 2.10149739588418, 2.40052085287649, 2.15312734529292,
2.26983254343561), a3 = c(2.29236552982738, 3.89205794602908,
2.00332958161127, 2.24914520112217, 2.53007733770606, 2.71754551346492,
2.0114333932435, 2.86503488517146, 2.24698418468691, 2.01899695076691
), a4 = c(3.41316237098706, 4.74677617124502, 2.16724250299177,
2.69085298208423, 2.92152132542834, 2.77166460573895, 2.11240604408321,
3.69400496418272, 2.60149397515834, 2.55143073028433), a5 = c(181,
153, 157, 164, 186, 160, 189, 184, 156, 144), a6 = c(109.505743217992,
124.993679689496, 118.331551171713, 121.149943117191, 117.845938184937,
116.653468860762, 115.497212844565, 119.598619239394, 118.71099689067,
119.980353824938), a7 = c(97.9342963550266, 101.648325143684,
95.6506939872352, 99.3679362781565, 96.8558088142306, 97.807547332176,
95.8501286576493, 97.0383412710708, 97.2285616820599, 96.4837667978641
), a8 = c(82.9441585205545, 141.582901736505, 141.917096237054,
133.01571492792, 139.3097949682, 159.139671804723, 135.367408753504,
146.93593535641, 142.989812366132, 153.648429482203), a9 = c(207.440039573019,
226.64200483318, 213.982245158948, 220.517879395348, 214.701746999167,
214.461016192938, 211.347341502214, 216.636960510465, 215.939558572729,
216.464120622803), a10 = c(42, 52.475, 52.475, 49.475, 52.475,
46.475, 52.475, 49.475, 49.475, 52.475), a11 = c(86.47, 98.01,
93.23, 97.06, 94.23, 89.82, 92.05, 92.34, 97.02, 94.53), a12 = c(161.545,
179.1775, 166.61, 170.9975, 166.35, 164.6125, 161.65, 171.7475,
169.29, 170.7975), a13 = c(299.982, 317.99, 300.538, 311.906,
305.2085, 306.263, 299.855, 305.34, 304.7145, 304.223), a14 = c(44.09,
88.66, 57.84, 78.75, 73.51, 64.51, 88.59, 74.5, 77.84, 84.08),
a15 = c(79.08097, 90.1388, 85.24364, 84.78847, 84.34771,
84.42352, 83.96158, 85.82465, 83.3913, 86.80556), a16 = c(26.6,
38.35, 35.075, 34.075, 34.7125, 30.075, 35.35, 32.7125, 33.7125,
37.35), a17 = c(61.3, 73.15, 70.45, 69.45, 69, 68.45, 70.15,
68, 68, 72.15), a18 = c(120.55, 130.45, 121.25, 123, 122.5,
122, 122.45, 124.5, 122.5, 126.45), a19 = c(156.35, 166.7,
157.4, 157.4, 158, 158.4, 156.7, 159, 158, 157.7), a20 = c(211,
225.35, 212.425, 214.425, 218.15, 216.425, 211, 220.15, 215.15,
214), a21 = c(292.221, 304.44, 284.18, 288.88, 293.07, 289.45,
284.98, 297.26, 289.5, 289.45), a22 = c(113.40921, 128.19039,
120.84558, 120.76845, 119.74005, 120.69014, 120.43093, 121.66397,
118.77983, 123.26633)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"))
rankBy <- names(orig_df)[-c(1:2)]
## STEP 1:
## variable combination step (need to use "rankBy" vector because real df contains variables that are not used in the combination step)
colsToMultiply <- orig_df %>%
dplyr::select(all_of(rankBy)) %>%
names() %>%
data.frame()
originalCols <- colsToMultiply ## vector of original column names
colsToMultiply <- expand.grid(colsToMultiply$., colsToMultiply$.)
colsToMultiply <- colsToMultiply %>%
dplyr::mutate_all(~as.character(.))
colsToMultiply$dups <- apply(colsToMultiply[,1:ncol(colsToMultiply)], 1, function(i) any(duplicated(i, incomparables = NA)))
colsToMultiply <- colsToMultiply[!colsToMultiply$dups, ]
colsToMultiply <- colsToMultiply[,-c(ncol(colsToMultiply))]
colsToMultiply <- colsToMultiply[!duplicated(t(apply(colsToMultiply[,c(1:ncol(colsToMultiply))], 1, sort))), ] # remove: a,b ; b,a duplicates row wise
## split cols into chunks (easier on memory)
chunks <- 100
n <- nrow(colsToMultiply)
r <- rep(1:ceiling(n/chunks),each=chunks)[1:n]
colgridRunList <- split(colsToMultiply, r)
rm(list = setdiff(ls(), c("orig_df","colgridRunList","rankBy")))
gc()
## STEP 2:
## here we ADD or MULTIPLY two variables (columns) and cbind the new variable to original dataframe
orig_df <- setDT(orig_df)
newVarsList <- vector("list", length = length(colgridRunList))
for (x in seq_along(colgridRunList)) {
colgrid <- colgridRunList[[x]]
colList <- vector("list", length = nrow(colgrid))
for (i in 1:nrow(colgrid)) {
colselect1 <- colgrid[i,1]
colselect2 <- colgrid[i,2]
colname <- paste0(colselect1,"*",colselect2)
new_var <- orig_df[,..colselect1] * orig_df[,..colselect2]
colnames(new_var)[1] <- colname
colList[[i]] <- new_var
}
colList <- do.call(cbind, colList)
newVarsList[[x]] <- colList
}
newVarsList <- do.call(cbind,newVarsList)
orig_df <- cbind(orig_df, newVarsList)
rm(list = setdiff(ls(), c("orig_df")))
gc()
Instead of creating extra combinations and then removing those and its mirror copies, it may be easier with
combn-testing
Benchmarks
On the example data itself, it is faster with
combn