Add slope and cumsum from last 10 rows as column to table

178 Views Asked by At

I have a large table with timestamps from several nights. Columns are an id for what night, an id for what timestamp within that night and the hearth rate at that timestamp, it looks like this:

allData <- data.table(nightNo=c(1,1,1,1,1,1,2,2,2,2), withinNightNo=c(1,2,3,4,5,6,1,2,3,4), HR=c(1:10))

nightNo withinNightNo HR
   1             1     1
   1             2     2
   1             3     3
   1             4     4
   1             5     5
   1             6     6
   2             1     7
   2             2     8
   2             3     9
   2             4    10

I'd like to add two new columns to the table, the slope and the cumsum of HR from up to the last 10 rows of the same night. I calculate the slope using linear regression and defined cumsum as: CUMSUMn = MAX(CUMSUMn-1, 0) + (valuen - MEAN(value1-n)). The result should look like this:

nightNo withinNightNo  HR HRSlope HRCumsum
    1             1     1     NaN      0.0
    1             2     2       1      0.5
    1             3     3       1      1.5
    1             4     4       1      3.0
    1             5     5       1      5.0
    1             6     6       1      7.5
    2             1     7     NaN      0.0
    2             2     8       1      0.5
    2             3     9       1      1.5
    2             4    10       1      3.0

I've created code for both of these functions using for loops. They work, but my table is so large that it takes a long time to even calculate the slope/cumsum of a single value. My code looks like this:

# Add HRSlope column
allData$HRSlope <- 0

for(i in 1:nrow(allData)){
    # Get points from up to last 10 seconds of the same night
    start <- ifelse(i < 11, 1, (i-10))
    points <- filter(allData[start:i,], nightNo == allData[i,]$nightNo)[, c("withinNightNo", "HR")]

    # Calculate necessary values
    meanX <- mean(points$withinNightNo)
    meanY <- mean(points$HR)
    meanXY <- mean(points$withinNightNo * points$HR)
    meanX2 <- mean(points$withinNightNo^2)

    # Calculate slope and add to table
    allData[i,]$HRSlope <- (meanX * meanY - meanXY) / (meanX^2 - meanX2)

    cat(i, "\n")
}

# Add cumsum column, and add first value to sum
allData$HRCumsum <- 0
Sum <- allData[1,]$HR

for(i in 2:nrow(allData)){
  # Get sum and average of HR in night so far, reset Sum if new night started
  Sum <- allData[i,]$HR + ifelse(allData[i,]$nightNo != allData[i-1,]$nightNo, 0 , Sum )
  Average <- Sum / allData[i,]$withinNightNo

  # Get previous cumsum, if available
  pCumsum <- ifelse(allData[i,]$nightNo != allData[i-1,]$nightNo, 0 , allData[i-1,]$HRCumsum )

  # Calculate current cumsum
  allData[i,]$HRCumsum <- max(pCumsum, 0) + (allData[i,]$HR - Average)

  cat(i, "\n")
}

Is there a more efficient way to do this, presumably without for loops?

EDIT:

I've been able to increase the speed of my slope function somewhat. It however still uses a forloop and it actually puts down a wrong value in a field for 9 times before putting down the correct value. Any thoughts on how to fix these two issues?

getSlope <- function(x, y) {
    # Calculate necessary values
    meanX <- mean(x)
    meanY <- mean(y)
    meanXY <- mean(x * y)
    meanX2 <- mean(x^2)

    # Calculate slope
    return((meanX * meanY - meanXY) / (meanX^2 - meanX2))
}

# Loop back to 1
for(i in max(allData):1){
    # Prevent i<=0
    low <- ifelse(i < 10, 0, i-10)

    # Grab up to last 10 points and calculate slope
    allData[with(allData, withinNightNo > i-10 & withinNightNo <= i), slope := getSlope(withinNightNo, HR), by= nightNo]
}

EDIT2:

I've also been able to improve my cumsum a little, but it suffers from the same things as the slope. Besides that it takes larger chuncks of the table, because it needs to get the average, and needs to loop over all the data twice. Any thoughts on improving this would also be highly be appreciated.

# Calculate part of the cumsum
getCumsumPart <- function(x){
    return(x-mean(x))
}

# Calculate valueN - mean(value1:N)
for(i in max(allData$withinNightNo):1){
   allData[with(allData, withinNightNo <= i), cumsumPart:= 
   getCumsumPart(HR), by=nightNo]
}

# Calculate  + max(cumsumN-1, 0)
for(i in max(allData$withinNightNo):1){
    allData[with(allData, withinNightNo <= i & cumsumPart > 0), cumsum:= sum(cumsumPart), by=nightNo]
}

# Remove part table
allData$cumsumPart <- NULL

# Set NA values to 0
allData[with(allData, is.na(cumsum)), cumsum := 0]
1

There are 1 best solutions below

0
On BEST ANSWER

Try this approach

library(dplyr)
library(caTools)

allData <- data.frame(nightNo=c(1,1,1,1,1,1,2,2,2,2), 
                      withinNightNo=c(1,2,3,4,5,6,1,2,3,4), 
                      HR=c(1:10))

group_fun <- function(grouped_df, window=10L) {
  # slope
  mean_x <- runmean(grouped_df$withinNightNo, window, align="right")
  mean_y <- runmean(grouped_df$HR, window, align="right")
  mean_xy <- runmean(grouped_df$withinNightNo * grouped_df$HR, window, align="right")
  mean_xx <- runmean(grouped_df$withinNightNo * grouped_df$withinNightNo, window, align="right")
  grouped_df$slope <- (mean_x * mean_y - mean_xy) / (mean_x^2 - mean_xx)

  # cumsum
  partial <- grouped_df$HR - mean_y # from above
  # the "loop" is unavoidable here, I think
  cumsum <- 0
  grouped_df$cumsum <- sapply(partial, function(val) {
    cumsum <<- max(cumsum, 0) + val
    cumsum
  })

  grouped_df
}

out <- allData %>%
  group_by(nightNo) %>%
  do(group_fun(., window=3L)) # change window as desired