Extract and plot n rows before and after the onset of each event, centered on zero

122 Views Asked by At

Objective: From a time-series df, make a plot of each occurrence of a particular state (or factor level) with x timepoints before, and y timepoints after, the onset (i.e. first row) of that state. The graph should be centered on zero (on the x-axis), such that the x timepoints before the event are negative values, and the y timepoints after the event are positive values. This is the same principal as a peristimulus time histogram.

The data: I have time-series data where different states can occur for variable amounts of time. First I use run length encoding (rle) to determine the start and stop of each run of each state (not shown). Second, I use a function, similar to the one described here, to return, say one row above and two rows below the onset of a particular state (state "A" in the example below). Here’s what that data looks like.

df <- data.frame(
  state =      c("A","A","A","A","A","B","A","A","X","Y","Z","A","A","A","B","A","A"),
  start =      c("start","NA","NA","NA","NA","NA","start","NA","NA","NA","NA","start","NA","NA","NA","start","NA"),
  rleGroup =   c("1","1","1","1","1","2","3","3","4","5","6","7","7","7","8","9","9"),
  data = runif(17)
)
df <- df %>% tidyr::unite(stateStart, c(state,start), sep = ".", remove = FALSE)

   stateStart state start rleGroup       data
1     A.start     A start        1 0.85118187
2        A.NA     A    NA        1 0.23502147
3        A.NA     A    NA        1 0.97435662
4        A.NA     A    NA        1 0.45669042
5        A.NA     A    NA        1 0.48271803
6        B.NA     B    NA        2 0.80561653
7     A.start     A start        3 0.27228361
8        A.NA     A    NA        3 0.07008506
9        X.NA     X    NA        4 0.44101076
10       Y.NA     Y    NA        5 0.95173954
11       Z.NA     Z    NA        6 0.65693316
12    A.start     A start        7 0.45831802
13       A.NA     A    NA        7 0.83629347
14       A.NA     A    NA        7 0.62107270
15       B.NA     B    NA        8 0.53294588
16    A.start     A start        9 0.08533221
17       A.NA     A    NA        9 0.28805362

extract.with.context <- function(x, colname, rows, after = 0, before = 0) {
  match.idx  <- which(x[[colname]] %in% rows)
  span       <- seq(from = -before, to = after)
  extend.idx <- c(outer(match.idx, span, `+`))
  extend.idx <- Filter(function(i) i > 0 & i <= nrow(x), extend.idx)
  extend.idx <- sort(unique(extend.idx))
  return(x[extend.idx, , drop = FALSE]) 
}
extracted = extract.with.context(x=df, colname="stateStart", rows=c("A.start"), after = 2, before = 1)

   stateStart state start rleGroup       data
1     A.start     A start        1 0.85118187
2        A.NA     A    NA        1 0.23502147
3        A.NA     A    NA        1 0.97435662
6        B.NA     B    NA        2 0.80561653
7     A.start     A start        3 0.27228361
8        A.NA     A    NA        3 0.07008506
9        X.NA     X    NA        4 0.44101076
11       Z.NA     Z    NA        6 0.65693316
12    A.start     A start        7 0.45831802
13       A.NA     A    NA        7 0.83629347
14       A.NA     A    NA        7 0.62107270
15       B.NA     B    NA        8 0.53294588
16    A.start     A start        9 0.08533221
17       A.NA     A    NA        9 0.28805362

The problem: I want to plot each extracted run of state A (i.e., one timepoint before and two timepoints after A.start). One thought is to make a unique identifier for each run grouping of state "A" (Question 1), and then make a time sequence counter that reflects the desired rows before and after the onset of state A (Question 2).

QUESTION NUMBER 1. Create a unique identifier for each "plotGroup", defined as each run of state A (i.e., a row before A.start and two rows after A.start) I tried this, but it's not quite working

extracted %>% mutate(plotGroup = cumsum(lag(state) == "A" & state != "A"))

it should look like this

extracted$plotGroup <- c("0","0","0","1","1","1","1","2","2","2","2","3","3","3")

   stateStart state start rleGroup       data plotGroup
1     A.start     A start        1 0.85118187         0
2        A.NA     A    NA        1 0.23502147         0
3        A.NA     A    NA        1 0.97435662         0
6        B.NA     B    NA        2 0.80561653         1
7     A.start     A start        3 0.27228361         1
8        A.NA     A    NA        3 0.07008506         1
9        X.NA     X    NA        4 0.44101076         1
11       Z.NA     Z    NA        6 0.65693316         2
12    A.start     A start        7 0.45831802         2
13       A.NA     A    NA        7 0.83629347         2
14       A.NA     A    NA        7 0.62107270         2
15       B.NA     B    NA        8 0.53294588         3
16    A.start     A start        9 0.08533221         3
17       A.NA     A    NA        9 0.28805362         3

QUESTION NUMBER 2. Create a "counter," centered on zero, of one row above and two rows after the A.start This I have no idea how to do! But presumably can make use of "span" in the function This is my desired output

extracted$span <- c("0","1","2","-1","0","1","2","-1","0","1","2","-1","0","1")

   stateStart state start rleGroup       data plotGroup span
1     A.start     A start        1 0.85118187         0    0
2        A.NA     A    NA        1 0.23502147         0    1
3        A.NA     A    NA        1 0.97435662         0    2
6        B.NA     B    NA        2 0.80561653         1   -1
7     A.start     A start        3 0.27228361         1    0
8        A.NA     A    NA        3 0.07008506         1    1
9        X.NA     X    NA        4 0.44101076         1    2
11       Z.NA     Z    NA        6 0.65693316         2   -1
12    A.start     A start        7 0.45831802         2    0
13       A.NA     A    NA        7 0.83629347         2    1
14       A.NA     A    NA        7 0.62107270         2    2
15       B.NA     B    NA        8 0.53294588         3   -1
16    A.start     A start        9 0.08533221         3    0
17       A.NA     A    NA        9 0.28805362         3    1

Ultimate objective: plot data by span for each individual plotgroup

ggplot(data=extracted, aes(x=span, y = data, group = plotGroup)) + geom_line()

enter image description here

2

There are 2 best solutions below

6
Jon Spring On

This approach subdivides the data into plotGroups where each group starts one step before each new A (except for the first grp), and the counter is set at zero for each group's first A. The division point prior is determined by the n in lead(), and we could add a filter to limit the points after.

# edit to fix first group counting
df %>%
  mutate(start = state == "A" & lag(state, default = "") != "A") %>%
  mutate(plotGroup = cumsum(lead(start, n = 1, default = FALSE))) %>%
  group_by(plotGroup) %>%
  mutate(counter = row_number() - row_number()[start]) %>%
  ungroup() %>%
  filter(counter <= 2) %>%
  ggplot(aes(counter, data, group = plotGroup)) +
  geom_line()

Result before plotting:

# A tibble: 14 × 6
   state start rleGroup   data plotGroup counter
   <chr> <lgl> <chr>     <dbl>     <int>   <int>
 1 A     TRUE  1        0.0198         0       0
 2 A     FALSE 1        0.338          0       1
 3 A     FALSE 1        0.635          0       2
 4 B     FALSE 2        0.0138         1      -1
 5 A     TRUE  3        0.218          1       0
 6 A     FALSE 3        0.208          1       1
 7 X     FALSE 4        0.0934         1       2
 8 Z     FALSE 6        0.499          2      -1
 9 A     TRUE  7        0.0417         2       0
10 A     FALSE 7        0.934          2       1
11 A     FALSE 7        0.507          2       2
12 B     FALSE 8        0.555          3      -1
13 A     TRUE  9        0.158          3       0
14 A     FALSE 9        0.437          3       1
1
redatoms On
#Define number of rows you want before and after the zero-centered graph
after <- 2
before <- 1

#made up data 
df <- data.frame(
  state =      c("A","A","A","A","A","B","A","A","X","Y","Z","A","A","A","B","A","A"),
  start =      c("start","NA","NA","NA","NA","NA","start","NA","NA","NA","NA","start","NA","NA","NA","start","NA"),
  rleGroup =   c("1","1","1","1","1","2","3","3","4","5","6","7","7","7","8","9","9"),
  data = runif(17)
)
df <- df %>% tidyr::unite(stateStart, c(state,start), sep = ".", remove = FALSE)

#extract the rows before and after the onset of a particular state
extract.with.context <- function(x, colname, rows, after = 0, before = 0) {
  match.idx  <- which(x[[colname]] %in% rows)
  span       <- seq(from = -before, to = after)
  extend.idx <- c(outer(match.idx, span, `+`))
  extend.idx <- Filter(function(i) i > 0 & i <= nrow(x), extend.idx)
  extend.idx <- sort(unique(extend.idx))
  return(x[extend.idx, , drop = FALSE]) 
}

extracted.df = extract.with.context(x=df, colname="stateStart", rows=c("A.start"), after = after, before = before)

# Create plotGroup
# if we go off starting cue = T/F, and start counting when lead (by "before") is T, 
# then we should get correct plotGroup, regardless whether the desired state is in first row or not 
boo <- extracted.df %>%
  dplyr::mutate(start2 = state == "A" & lag(state, default = "") != "A") %>%
  mutate(plotGroup = cumsum(lead(start2, n = before, default = FALSE)))

#create the counter/sequence to zero the graph 
counter <- rep(NA, times = length(boo$start)) # make an empty counter
starts <- which(boo$start == "start") # find the start positions
counter[starts] <- 0

for(i in 1:after){ # for every position after a start, up to "after"
  indexes <- starts + i # index of positions "i" after the start
  indexes_1 <- indexes[which(indexes %in% 1:length(counter))] # indexes can run over the length of the counter - we only want indexes that are within the length of the counter
  counter[indexes_1] <- i # for those indexes, put in the count, i
}
for(i in 1:before){ # same as for "after", but in reverse for "before"
  indexes <- starts - i
  indexes_1 <- indexes[which(indexes %in% 1:length(counter))]
  counter[indexes_1] <- -i
}

boo$span <- counter
boo
   stateStart state start rleGroup       data start2 plotGroup span
1     A.start     A start        1 0.22771277   TRUE         0    0
2        A.NA     A    NA        1 0.39769158  FALSE         0    1
3        A.NA     A    NA        1 0.42416120  FALSE         0    2
6        B.NA     B    NA        2 0.06402964  FALSE         1   -1
7     A.start     A start        3 0.22233942   TRUE         1    0
8        A.NA     A    NA        3 0.77667057  FALSE         1    1
9        X.NA     X    NA        4 0.36675437  FALSE         1    2
11       Z.NA     Z    NA        6 0.49100719  FALSE         2   -1
12    A.start     A start        7 0.26012695   TRUE         2    0
13       A.NA     A    NA        7 0.88900224  FALSE         2    1
14       A.NA     A    NA        7 0.59714172  FALSE         2    2
15       B.NA     B    NA        8 0.15040234  FALSE         3   -1
16    A.start     A start        9 0.85581300   TRUE         3    0
17       A.NA     A    NA        9 0.15780435  FALSE         3    1

# plot 
ggplot(data=boo, aes(x=span, y = data, group = plotGroup)) + 
  geom_line()