reindex and padding R dataframe

1k Views Asked by At

I have an R dataframe that looks like this

1  A   1
2  A   0.9
5  A   0.7
6  A   0.6
8  A   0.5
3  B   0.6
4  B   0.5
5  B   0.4
6  B   0.3

I need to fill all the gaps in the first column, from 1 up to the maximum value per category (second column); i.e. the result I wish to obtain is the following:

1  A  1
2  A  0.9
3  A  0.9
4  A  0.9
5  A  0.7
6  A  0.6
7  A  0.6
8  A  0.5
1  B  0.6
2  B  0.6
3  B  0.6
4  B  0.5
5  B  0.4
6  B  0.3

Basically, I want to pad backwards when data are missing before the first observation of a category, and forwards when data are missing in between observations. What I have done so far is group by category and compute the maximum index for each one:

 # Largest `time` value within each `fit$group` level, via plyr::ddply().
 groupby <- ddply(df, ~ fit$group, summarise, max = max(time))

A  8
B  6

but now I'm stuck on the next steps.

2

There are 2 best solutions below

3
On BEST ANSWER

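We can use data.table together with zoo. Convert the 'data.frame' to a 'data.table' by reference (setDT(df1)), build the complete sequence of 'v1' values from 1 to the group maximum for each 'v2', join that back to the original data on 'v1' and 'v2', and then, grouped by 'v2', fill the resulting NA values in 'v3': na.locf (from zoo) carries the last observation forward, and a second na.locf with fromLast = TRUE back-fills any NAs left at the start of a group.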

library(data.table)
library(zoo)
# Convert `df1` to a data.table in place (by reference, no copy).
setDT(df1)
df1[
  # Full index grid: one row per v1 in 1..max(v1) for every v2 group.
  df1[, .(v1 = seq_len(max(v1))), by = v2],
  on = c("v1", "v2")
][
  # Within each group, carry the last observation forward, then back-fill
  # whatever is still NA at the start of the group.
  , v3 := na.locf(na.locf(v3, na.rm = FALSE), fromLast = TRUE),
  by = v2
][]
#     v1 v2  v3
# 1:  1  A 1.0
# 2:  2  A 0.9
# 3:  3  A 0.9
# 4:  4  A 0.9
# 5:  5  A 0.7
# 6:  6  A 0.6
# 7:  7  A 0.6
# 8:  8  A 0.5
# 9:  1  B 0.6
#10:  2  B 0.6
#11:  3  B 0.6
#12:  4  B 0.5
#13:  5  B 0.4
#14:  6  B 0.3

Or using dplyr/zoo

library(dplyr)
library(zoo)
library(tidyr)
# Pad every `v2` group out to the full 1..max(v1) index range and fill the
# gaps in `v3`.
df1 %>%
  group_by(v2) %>%
  # One row per integer index from 1 up to the group's largest observed v1.
  expand(v1 = seq_len(max(v1))) %>%
  # Name the join keys explicitly: this avoids the "Joining, by = ..." message
  # and keeps the keys fixed even if the tables ever share another column.
  left_join(df1, by = c("v1", "v2")) %>%
  # Carry the last observation forward, then back-fill what is still NA at
  # the start of the group. `na.rm = FALSE` keeps the vector length intact.
  mutate(v3 = na.locf(na.locf(v3, na.rm = FALSE), fromLast = TRUE)) %>%
  # Drop the grouping so later verbs do not silently operate per group.
  ungroup() %>%
  select(v1, v2, v3)
#    v1    v2    v3
#   <int> <chr> <dbl>
#1      1     A   1.0
#2      2     A   0.9
#3      3     A   0.9
#4      4     A   0.9
#5      5     A   0.7
#6      6     A   0.6
#7      7     A   0.6
#8      8     A   0.5
#9      1     B   0.6
#10     2     B   0.6
#11     3     B   0.6
#12     4     B   0.5
#13     5     B   0.4
#14     6     B   0.3

data

# Example input from the question: an integer index (`v1`), a category label
# (`v2`) and the observed value (`v3`). Several indices are missing per group.
df1 <- data.frame(
  v1 = c(1L, 2L, 5L, 6L, 8L, 3L, 4L, 5L, 6L),
  v2 = rep(c("A", "B"), times = c(5L, 4L)),
  v3 = c(1, 0.9, 0.7, 0.6, 0.5, 0.6, 0.5, 0.4, 0.3),
  stringsAsFactors = FALSE
)
0
On
library(dplyr)
library(tidyr)
library(zoo)
# Caveat: complete() only crosses the V1 values already present in `dat`, and
# na.locf() here runs over the whole column rather than per V2 group. The
# output printed after this snippet therefore differs from the result the
# question asks for: A has no row for V1 = 7, B gains a row for V1 = 8, and
# B's rows 1-2 are filled with 0.5 (A's last value) instead of 0.6.
dat %>%
  complete(V2, V1) %>%
  mutate(V3 = na.locf(V3))

results in:

# A tibble: 14 × 3
       V2    V1    V3
   <fctr> <int> <dbl>
1       A     1   1.0
2       A     2   0.9
3       A     3   0.9
4       A     4   0.9
5       A     5   0.7
6       A     6   0.6
7       A     8   0.5
8       B     1   0.5
9       B     2   0.5
10      B     3   0.6
11      B     4   0.5
12      B     5   0.4
13      B     6   0.3
14      B     8   0.3