Calculating count of active outbreaks using start and end dates in R

34 Views Asked by At

I have a dataset that looks at respiratory disease outbreaks in facilities. The outbreaks have a start and end date and indicate whether there is COVID-19, Influenza or RSV present, there can be multiple pathogens present which I will call mixed. Outbreaks are considered active from their notification date to their declaration date. My end goal is to graph the number of active outbreaks by day by pathogen present from the earliest notification date to today. I am having issues with trying to count total active outbreaks per day not just new outbreaks per day.

This is my current code

ari_test <- ari_data %>%
  select(record_id, notification_date, declaration_date, c_cov_present, c_flu_present, c_rsv_present) |> 
  mutate(notification_date = as.Date(notification_date),
         declaration_date = as.Date(declaration_date)) %>%
  filter(!is.na(notification_date) & !is.na(declaration_date)) %>%
  # Generate a sequence of dates from notification_date to declaration_date for each facility
  rowwise() %>%
  mutate(date = list(seq(notification_date, declaration_date, by = "day"))) %>%
  unnest(date) %>%
  select(-notification_date, -declaration_date) %>%
  # Count the number of active outbreaks per day for each pathogen
  group_by(date) %>%
  summarise(active_covid = sum(c_cov_present == 1 & is.na(c_flu_present) & is.na(c_rsv_present)),
            active_influenza = sum(is.na(c_cov_present) & c_flu_present == 1 & is.na(c_rsv_present)),
            active_rsv = sum(is.na(c_cov_present) & is.na(c_flu_present) & c_rsv_present == 1),
            active_mixed = sum(rowSums(cbind(c_cov_present, c_flu_present, c_rsv_present), na.rm = TRUE) >= 2))

But this is only counting the outbreaks once, they should be counted for every day they are active between their notification date and their declaration date.

I also tried this but I get an error saying record_id cannot be found even though record_id is definitely in the dataframe.

ari_test <- ari_data %>%
  mutate(notification_date = as.Date(notification_date),
         declaration_date = as.Date(declaration_date)) %>%
  filter(!is.na(notification_date) & !is.na(declaration_date)) %>%
  mutate(across(starts_with("c_"), ~if_else(is.na(.), 0, 1))) %>%  # Convert NA to 0 for presence/absence
  group_by(record_id) %>%
  mutate(active_covid = +(any(c_cov_present == 1 & is.na(c_flu_present) & is.na(c_rsv_present))),
         active_influenza = +(any(is.na(c_cov_present) & c_flu_present == 1 & is.na(c_rsv_present))),
         active_rsv = +(any(is.na(c_cov_present) & is.na(c_flu_present) & c_rsv_present == 1)),
         active_mixed = +(any(rowSums(select(., starts_with("c_"))) >= 2))) %>%
  complete(record_id, date = seq.Date(min(notification_date), max(declaration_date), by = "day"), fill = list(active_covid = 0, active_influenza = 0, active_rsv = 0, active_mixed = 0)) %>%
  ungroup()

Here is some sample data for you

structure(list(record_id = c(1, 2, 5, 6, 7, 8, 10, 11, 12, 13
), notification_date = structure(c(19523, 19524, 19535, 19535, 
19535, 19535, 19536, 19536, 19542, 19542), class = "Date"), declaration_date = structure(c(19544, 
19537, 19548, 19559, 19542, 19555, 19548, 19549, 19550, 19569
), class = "Date"), c_cov_present = c(1, 1, 1, 1, 0, 1, 1, 1, 
1, 0), c_flu_present = c(1, 0, 0, 0, 
0, 0, 0, 0, 0, 1), 
    c_rsv_present = c(0, 0, 0, 0, 1, 0, 0, 1, 0, 0)), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"))

Would really appreciate some help. Thanks!

1

There are 1 best solutions below

0
Jay Bee On BEST ANSWER
# Libraries and data.

library(tidyverse)

ari_data <- structure(list(record_id = c(1, 2, 5, 6, 7, 8, 10, 11, 12, 13
), notification_date = structure(c(19523, 19524, 19535, 19535, 
19535, 19535, 19536, 19536, 19542, 19542), class = "Date"), declaration_date = structure(c(19544, 
19537, 19548, 19559, 19542, 19555, 19548, 19549, 19550, 19569
), class = "Date"), c_cov_present = c(1, 1, 1, 1, 0, 1, 1, 1, 
1, 0), c_flu_present = c(1, 0, 0, 0, 
0, 0, 0, 0, 0, 1), 
    c_rsv_present = c(0, 0, 0, 0, 1, 0, 0, 1, 0, 0)), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"))

# Get the long data of illness counts for each day.

days_data <- ari_data %>%
  rowwise() %>%
  do(data.frame(record_id = .$record_id, 
                date = seq(.$notification_date, .$declaration_date, by = "days"),
                c_cov_present = .$c_cov_present,
                c_flu_present = .$c_flu_present,
                c_rsv_present = .$c_rsv_present)) %>%
  ungroup()

# Summarise the counts for plotting.

daily_counts <- days_data %>%
  group_by(date) %>%
  summarise(flu_count = sum(c_flu_present),
            covid_count = sum(c_cov_present),
            rsv_count = sum(c_rsv_present))

# Plot counts over time.

ggplot(daily_counts) +
  geom_line(aes(x = date, y = flu_count, colour = "Flu"), size = 1.5) +
  geom_line(aes(x = date, y = covid_count, colour = "COVID"), size = 1.5) +
  geom_line(aes(x = date, y = rsv_count, colour = "RSV"), size = 1.5) +
  theme_bw() +
  labs(x = "Date", 
       y = "Count of Cases")

For output:

enter image description here