Apply mapply to each row of a dataframe

Question

Apply mapply to each row of a dataframe

48 Views Asked by user3516915 At 13 February 2024 at 19:54

In this example I have a table DATASET with multiple rows containing data about company performance. For each row I want to join data about each possible department from another table called LOOKUPTABLE, which contains a string column and two numeric columns indicating value ranges. What complicates this joining, is that the columns of DATASET that should be used for this joining, differ for each department. Also, some columns aren't used for some departments at all. Also, the columns that should be used for joining data for a department, could change over time. That's why a mapping table COLUMN_MAP was created, which describes which columns should be used for mapping data for each department.

I wrote a functioning, but slow code. I tried speeding it up by using mapply. Also, I tried a second approach using outer() and vectorization, but it doesn't seem faster.

How could I speed up the following two working codes?

Solution 1 (using mapply)

LOOKUPTABLE <- data.frame (
  DEPARTMENT_NAME = c("LOGISTICS", "LOGISTICS", "LOGISTICS", "LOGISTICS", "VEHICLES", "VEHICLES", "ACCOUNTING", "ACCOUNTING"),
  RANGEFROM = c(1, 1, 4, 4, 1, 4, NA, NA),
  RANGETILL = c(4, 4, 9, 9, 4, 9, NA, NA),
  STRINGVALUE = c("A", "B", "A", "B", "", "", "A", "B"),
  RESULT = c(11, 12, 13, 14, 15, 16, 17, 18)
)

COLUMN_MAP <- data.frame (
  DEPARTMENT_NAME = c("LOGISTICS", "VEHICLES", "ACCOUNTING"),
  COLUMNNAME_NUMERIC = c("INTERNAL_num", "INTERNAL_num", NA),
  COLUMNNAME_STRING = c("INTERNAL_string", NA, "INTERNAL_string")
)

DATASET <- data.frame (
  INTERNAL_num = rep(1:8, 100),
  INTERNAL_string = rep(c("B", "A", "B", "B"), 200)
)

tic()

Function_result_join      <- function(LOOKUPTABLE_rows, COLUMN_MAP_rows, DATASET) {
  DATASET2<<-DATASET
  RESULT<-list()
  for( i in 1:nrow(DATASET2)) {
    a <<- LOOKUPTABLE_rows
    a <<- if(!is.na(COLUMN_MAP_rows[1,"COLUMNNAME_NUMERIC"])) a[!is.na(a$RANGEFROM) & a$RANGEFROM<=as.numeric(DATASET2[i,which( colnames(DATASET2) == COLUMN_MAP_rows[1,"COLUMNNAME_NUMERIC"])]) & a$RANGETILL>as.numeric(DATASET2[i,which( colnames(DATASET2) == COLUMN_MAP_rows[1,"COLUMNNAME_NUMERIC"])]),] else a
    a <<- if(!is.na(COLUMN_MAP_rows[1,"COLUMNNAME_STRING"])) a[!is.na(a$STRINGVALUE) & a$STRINGVALUE==DATASET2[i,which( colnames(DATASET2) == COLUMN_MAP_rows[1,"COLUMNNAME_STRING"])],] else a
    RESULT[i] <- ifelse((nrow(a)==0 || is.na(sum(a$RESULT))),1,a$RESULT)
  }
  return(as.numeric(RESULT))
}

LOOKUPTABLE_LIST <- split(LOOKUPTABLE,LOOKUPTABLE$DEPARTMENT_NAME)
COLUMN_MAP_LIST <- split(COLUMN_MAP,COLUMN_MAP$DEPARTMENT_NAME)

match_result <- as.data.frame(mapply(Function_result_join, LOOKUPTABLE_LIST, COLUMN_MAP_LIST, MoreArgs = list(DATASET=DATASET)))
final_result_multiplied <- apply(match_result,1,prod)
cbind(DATASET,final_result_multiplied)
toc()

Solution 2 (using outer() and vectorization)

LOOKUPTABLE <- data.frame (
  DEPARTMENT_NAME = c("LOGISTICS", "LOGISTICS", "LOGISTICS", "LOGISTICS", "VEHICLES", "VEHICLES", "ACCOUNTING", "ACCOUNTING"),
  RANGEFROM = c(1, 1, 4, 4, 1, 4, NA, NA),
  RANGETILL = c(4, 4, 9, 9, 4, 9, NA, NA),
  STRINGVALUE = c("A", "B", "A", "B", "", "", "A", "B"),
  RESULT = c(11, 12, 13, 14, 15, 16, 17, 18)
)

COLUMN_MAP <- data.frame (
  DEPARTMENT_NAME = c("LOGISTICS", "VEHICLES", "ACCOUNTING"),
  COLUMNNAME_NUMERIC = c("INTERNAL_num", "INTERNAL_num", NA),
  COLUMNNAME_STRING = c("INTERNAL_string", NA, "INTERNAL_string")
)

DATASET <- data.frame (
  INTERNAL_num = rep(1:8, 100),
  INTERNAL_string = rep(c("B", "A", "B", "B"), 200)
)

tic()
DATASET_stack <- stack(DATASET)
DATASET_stack$rowid <- rep(1:nrow(DATASET))

indextable <- data.frame(rowid=rep(1:nrow(DATASET)), DEPARTMENT_NAME=rep(COLUMN_MAP$DEPARTMENT_NAME,each=nrow(DATASET)), index=1:(nrow(COLUMN_MAP)*nrow(DATASET)))
NUMERIC_columnnames <- data.frame(ind = rep(COLUMN_MAP$COLUMNNAME_NUMERIC,each=nrow(DATASET)), rowid=rep(1:nrow(DATASET)), DEPARTMENT_NAME=rep(COLUMN_MAP$DEPARTMENT_NAME,each=nrow(DATASET)), index=1:(nrow(COLUMN_MAP)*nrow(DATASET)))
STRING_columnnames <- data.frame(ind = rep(COLUMN_MAP$COLUMNNAME_STRING,each=nrow(DATASET)), rowid=rep(1:nrow(DATASET)), DEPARTMENT_NAME=rep(COLUMN_MAP$DEPARTMENT_NAME,each=nrow(DATASET)), index=1:(nrow(COLUMN_MAP)*nrow(DATASET)))

NUMERIC_values <- merge(x = NUMERIC_columnnames, y = DATASET_stack, by = c("ind","rowid"), all.x = TRUE, sort=FALSE)
STRING_values <- merge(x = STRING_columnnames, y = DATASET_stack, by = c("ind","rowid"), all.x = TRUE, sort=FALSE)

NUMERIC_values<-NUMERIC_values[order(NUMERIC_values$index), ]
STRING_values<-STRING_values[order(STRING_values$index), ]

NUMERIC_values$required <- (!is.na(NUMERIC_values$ind))
STRING_values$required <- (!is.na(STRING_values$ind))

FunEqual <- function(lookuptable,lookupvalue) (ifelse(is.na(lookuptable) || lookuptable=="", TRUE, (lookuptable==lookupvalue)))
FunSmaller <- function(lookuptable,lookupvalue) (ifelse(is.na(lookuptable) || lookuptable=="", TRUE, (lookuptable<=lookupvalue)))
FunLarger <- function(lookuptable,lookupvalue) (ifelse(is.na(lookuptable) || lookuptable=="", TRUE, (lookuptable>lookupvalue)))
FunRequired <- function(lookuptable,lookupvalue) (lookupvalue)
FunResult <- function(lookuptable,lookupvalue) (lookuptable)

VecFunEqual <- Vectorize( FunEqual )
VecFunLarger <- Vectorize( FunLarger )
VecFunSmaller <- Vectorize( FunSmaller )
VecFunRequired <- Vectorize( FunRequired )
VecFunResult <- Vectorize( FunResult )


match_STRING_ind_required <- outer(LOOKUPTABLE$DEPARTMENT_NAME, STRING_values$required, VecFunRequired )
match_STRING_DEPARTMENT_NAME <- outer(LOOKUPTABLE$DEPARTMENT_NAME, STRING_values$DEPARTMENT_NAME, VecFunEqual )
match_STRING_value <- outer(LOOKUPTABLE$STRINGVALUE, STRING_values$values, VecFunEqual )

match_STRING_filter <- ifelse(!match_STRING_ind_required,TRUE,match_STRING_DEPARTMENT_NAME*match_STRING_value)

match_NUMERIC_ind_required <- outer(LOOKUPTABLE$DEPARTMENT_NAME, NUMERIC_values$required, VecFunRequired )
match_NUMERIC_DEPARTMENT_NAME <- outer(LOOKUPTABLE$DEPARTMENT_NAME, NUMERIC_values$DEPARTMENT_NAME, VecFunEqual )
match_NUMERIC_valuefrom <- outer(LOOKUPTABLE$RANGEFROM, NUMERIC_values$values, VecFunSmaller )
match_NUMERIC_valuetill <- outer(LOOKUPTABLE$RANGETILL, NUMERIC_values$values, VecFunLarger )

match_NUMERIC_filter <- ifelse(!match_NUMERIC_ind_required,TRUE,match_NUMERIC_DEPARTMENT_NAME*match_NUMERIC_valuefrom*match_NUMERIC_valuetill)

match_result <- outer(LOOKUPTABLE$RESULT, NUMERIC_values$rowid, VecFunResult) * match_STRING_filter * match_NUMERIC_filter
match_result_table <- data.frame(index = 1:(nrow(COLUMN_MAP)*nrow(DATASET)), RESULT = colSums(match_result))

final_result_long <- merge(x = indextable, y = match_result_table, by = c("index"), all.x = TRUE)
final_result_wide <- unstack(final_result_long, RESULT ~ DEPARTMENT_NAME)
final_result_multiplied <- apply(final_result_wide,1,prod)
cbind(DATASET,final_result_multiplied)
toc()

Original Q&A

Apply mapply to each row of a dataframe

There are 0 best solutions below

Related Questions in R

Related Questions in MAPPLY

Related Questions in NON-EQUI-JOIN

Trending Questions

Popular # Hahtags

Popular Questions