Apply mapply to each row of a dataframe

48 Views Asked by At

In this example I have a table DATASET with multiple rows containing data about company performance. For each row I want to join data about each possible department from another table called LOOKUPTABLE, which contains a string column and two numeric columns indicating value ranges. What complicates this joining, is that the columns of DATASET that should be used for this joining, differ for each department. Also, some columns aren't used for some departments at all. Also, the columns that should be used for joining data for a department, could change over time. That's why a mapping table COLUMN_MAP was created, which describes which columns should be used for mapping data for each department.

I wrote a functioning, but slow code. I tried speeding it up by using mapply. Also, I tried a second approach using outer() and vectorization, but it doesn't seem faster.

How could I speed up the following two working codes?

Solution 1 (using mapply)

LOOKUPTABLE <- data.frame (
  DEPARTMENT_NAME = c("LOGISTICS", "LOGISTICS", "LOGISTICS", "LOGISTICS", "VEHICLES", "VEHICLES", "ACCOUNTING", "ACCOUNTING"),
  RANGEFROM = c(1, 1, 4, 4, 1, 4, NA, NA),
  RANGETILL = c(4, 4, 9, 9, 4, 9, NA, NA),
  STRINGVALUE = c("A", "B", "A", "B", "", "", "A", "B"),
  RESULT = c(11, 12, 13, 14, 15, 16, 17, 18)
)

COLUMN_MAP <- data.frame (
  DEPARTMENT_NAME = c("LOGISTICS", "VEHICLES", "ACCOUNTING"),
  COLUMNNAME_NUMERIC = c("INTERNAL_num", "INTERNAL_num", NA),
  COLUMNNAME_STRING = c("INTERNAL_string", NA, "INTERNAL_string")
)

DATASET <- data.frame (
  INTERNAL_num = rep(1:8, 100),
  INTERNAL_string = rep(c("B", "A", "B", "B"), 200)
)

tic()

Function_result_join      <- function(LOOKUPTABLE_rows, COLUMN_MAP_rows, DATASET) {
  DATASET2<<-DATASET
  RESULT<-list()
  for( i in 1:nrow(DATASET2)) {
    a <<- LOOKUPTABLE_rows
    a <<- if(!is.na(COLUMN_MAP_rows[1,"COLUMNNAME_NUMERIC"])) a[!is.na(a$RANGEFROM) & a$RANGEFROM<=as.numeric(DATASET2[i,which( colnames(DATASET2) == COLUMN_MAP_rows[1,"COLUMNNAME_NUMERIC"])]) & a$RANGETILL>as.numeric(DATASET2[i,which( colnames(DATASET2) == COLUMN_MAP_rows[1,"COLUMNNAME_NUMERIC"])]),] else a
    a <<- if(!is.na(COLUMN_MAP_rows[1,"COLUMNNAME_STRING"])) a[!is.na(a$STRINGVALUE) & a$STRINGVALUE==DATASET2[i,which( colnames(DATASET2) == COLUMN_MAP_rows[1,"COLUMNNAME_STRING"])],] else a
    RESULT[i] <- ifelse((nrow(a)==0 || is.na(sum(a$RESULT))),1,a$RESULT)
  }
  return(as.numeric(RESULT))
}

LOOKUPTABLE_LIST <- split(LOOKUPTABLE,LOOKUPTABLE$DEPARTMENT_NAME)
COLUMN_MAP_LIST <- split(COLUMN_MAP,COLUMN_MAP$DEPARTMENT_NAME)

match_result <- as.data.frame(mapply(Function_result_join, LOOKUPTABLE_LIST, COLUMN_MAP_LIST, MoreArgs = list(DATASET=DATASET)))
final_result_multiplied <- apply(match_result,1,prod)
cbind(DATASET,final_result_multiplied)
toc()

Solution 2 (using outer() and vectorization)

LOOKUPTABLE <- data.frame (
  DEPARTMENT_NAME = c("LOGISTICS", "LOGISTICS", "LOGISTICS", "LOGISTICS", "VEHICLES", "VEHICLES", "ACCOUNTING", "ACCOUNTING"),
  RANGEFROM = c(1, 1, 4, 4, 1, 4, NA, NA),
  RANGETILL = c(4, 4, 9, 9, 4, 9, NA, NA),
  STRINGVALUE = c("A", "B", "A", "B", "", "", "A", "B"),
  RESULT = c(11, 12, 13, 14, 15, 16, 17, 18)
)

COLUMN_MAP <- data.frame (
  DEPARTMENT_NAME = c("LOGISTICS", "VEHICLES", "ACCOUNTING"),
  COLUMNNAME_NUMERIC = c("INTERNAL_num", "INTERNAL_num", NA),
  COLUMNNAME_STRING = c("INTERNAL_string", NA, "INTERNAL_string")
)

DATASET <- data.frame (
  INTERNAL_num = rep(1:8, 100),
  INTERNAL_string = rep(c("B", "A", "B", "B"), 200)
)

tic()
DATASET_stack <- stack(DATASET)
DATASET_stack$rowid <- rep(1:nrow(DATASET))

indextable <- data.frame(rowid=rep(1:nrow(DATASET)), DEPARTMENT_NAME=rep(COLUMN_MAP$DEPARTMENT_NAME,each=nrow(DATASET)), index=1:(nrow(COLUMN_MAP)*nrow(DATASET)))
NUMERIC_columnnames <- data.frame(ind = rep(COLUMN_MAP$COLUMNNAME_NUMERIC,each=nrow(DATASET)), rowid=rep(1:nrow(DATASET)), DEPARTMENT_NAME=rep(COLUMN_MAP$DEPARTMENT_NAME,each=nrow(DATASET)), index=1:(nrow(COLUMN_MAP)*nrow(DATASET)))
STRING_columnnames <- data.frame(ind = rep(COLUMN_MAP$COLUMNNAME_STRING,each=nrow(DATASET)), rowid=rep(1:nrow(DATASET)), DEPARTMENT_NAME=rep(COLUMN_MAP$DEPARTMENT_NAME,each=nrow(DATASET)), index=1:(nrow(COLUMN_MAP)*nrow(DATASET)))

NUMERIC_values <- merge(x = NUMERIC_columnnames, y = DATASET_stack, by = c("ind","rowid"), all.x = TRUE, sort=FALSE)
STRING_values <- merge(x = STRING_columnnames, y = DATASET_stack, by = c("ind","rowid"), all.x = TRUE, sort=FALSE)

NUMERIC_values<-NUMERIC_values[order(NUMERIC_values$index), ]
STRING_values<-STRING_values[order(STRING_values$index), ]

NUMERIC_values$required <- (!is.na(NUMERIC_values$ind))
STRING_values$required <- (!is.na(STRING_values$ind))

FunEqual <- function(lookuptable,lookupvalue) (ifelse(is.na(lookuptable) || lookuptable=="", TRUE, (lookuptable==lookupvalue)))
FunSmaller <- function(lookuptable,lookupvalue) (ifelse(is.na(lookuptable) || lookuptable=="", TRUE, (lookuptable<=lookupvalue)))
FunLarger <- function(lookuptable,lookupvalue) (ifelse(is.na(lookuptable) || lookuptable=="", TRUE, (lookuptable>lookupvalue)))
FunRequired <- function(lookuptable,lookupvalue) (lookupvalue)
FunResult <- function(lookuptable,lookupvalue) (lookuptable)

VecFunEqual <- Vectorize( FunEqual )
VecFunLarger <- Vectorize( FunLarger )
VecFunSmaller <- Vectorize( FunSmaller )
VecFunRequired <- Vectorize( FunRequired )
VecFunResult <- Vectorize( FunResult )


match_STRING_ind_required <- outer(LOOKUPTABLE$DEPARTMENT_NAME, STRING_values$required, VecFunRequired )
match_STRING_DEPARTMENT_NAME <- outer(LOOKUPTABLE$DEPARTMENT_NAME, STRING_values$DEPARTMENT_NAME, VecFunEqual )
match_STRING_value <- outer(LOOKUPTABLE$STRINGVALUE, STRING_values$values, VecFunEqual )

match_STRING_filter <- ifelse(!match_STRING_ind_required,TRUE,match_STRING_DEPARTMENT_NAME*match_STRING_value)

match_NUMERIC_ind_required <- outer(LOOKUPTABLE$DEPARTMENT_NAME, NUMERIC_values$required, VecFunRequired )
match_NUMERIC_DEPARTMENT_NAME <- outer(LOOKUPTABLE$DEPARTMENT_NAME, NUMERIC_values$DEPARTMENT_NAME, VecFunEqual )
match_NUMERIC_valuefrom <- outer(LOOKUPTABLE$RANGEFROM, NUMERIC_values$values, VecFunSmaller )
match_NUMERIC_valuetill <- outer(LOOKUPTABLE$RANGETILL, NUMERIC_values$values, VecFunLarger )

match_NUMERIC_filter <- ifelse(!match_NUMERIC_ind_required,TRUE,match_NUMERIC_DEPARTMENT_NAME*match_NUMERIC_valuefrom*match_NUMERIC_valuetill)

match_result <- outer(LOOKUPTABLE$RESULT, NUMERIC_values$rowid, VecFunResult) * match_STRING_filter * match_NUMERIC_filter
match_result_table <- data.frame(index = 1:(nrow(COLUMN_MAP)*nrow(DATASET)), RESULT = colSums(match_result))

final_result_long <- merge(x = indextable, y = match_result_table, by = c("index"), all.x = TRUE)
final_result_wide <- unstack(final_result_long, RESULT ~ DEPARTMENT_NAME)
final_result_multiplied <- apply(final_result_wide,1,prod)
cbind(DATASET,final_result_multiplied)
toc()
0

There are 0 best solutions below