Using the dplyr coalesce function for multiple rows from data frame R

81 Views Asked by At

I have several rows of data that I would like to coalesce using the R function. How do I use it on this set of 5 rows of my data? I think the issue is that the coalesce function doesn't recognize each row as a vector. I've tried transforming this into five lists, but it all seems so inefficient.

library(tidyverse)
my_data <- structure(list(Station_Number = c("0001", "0001", "0001", "0001","0001"), 
Station_Name = c("Ithaca 3E", "Ithaca 3E", "Ithaca 3E","Ithaca 3E", "Ithaca 3E"), NWSLI 
= c("ITMN1", "ITMN1", "ITMN1", "ITMN1", "ITMN1"), Station_Legacy_Name = c("Mead", Mead", 
"Mead", "Mead", "Mead"), Lat = c(41.1528611, 41.1528611, 41.1528611, 41.1528611, 
41.1528611), Lon = c(-96.4914444, -96.4914444, -96.4914444, -96.4914444, -96.4914444), 
Elev_m = c("353.7", "353.7", "353.7", "353.7", "353.7"), TIMESTAMP = c("2015-04-16 
10:00:00", "2015-04-16 10:00:00", "2015-04-16 10:00:00", "2015-04-16 10:00:00", "2015- 
04-16 10:00:00"), RECORD = c(0L, 0L, 0L, 0L, 0L), Ms_veg_10cm = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), Ms_veg_50cm = c(NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_), Ms_veg_100cm = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), 
Ms_veg_25cm = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Ta_2m_Avg = 
c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), RH_2m_Avg = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), Ts_bare_10cm_Avg = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_), Solar_2m_Avg = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), 
WS_ms_S_WVT = c(0.698, NA, NA, NA, NA), WS_ms_U_WVT = c(0.586, NA, NA, NA, NA), 
WindDir_DU_WVT = c(212.4, NA, NA, NA, NA), WindDir_SDU_WVT = c(23.94, NA, NA, NA, NA), 
Rain_1m_Tot = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), Pres_2m = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), BattVolts_Min = c(13.28, NA, NA, NA, NA), Ta_C_Avg = c(17.83, 
NA, NA, NA, NA), RH_Avg = c(66.69, NA, NA, NA, NA), Ts_C_Avg = c(11.61, 
NA, NA, NA, NA), Solar_Avg = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_), Theta_10 = c(NA, -4.73, NA, NA, NA), 
Theta_25 = c(NA, NA, -4.848, NA, NA), Theta_50 = c(NA, NA, 
NA, -4.932, NA), Theta_100 = c(NA, NA, NA, NA, -4.949), Rain_mm_Tot = c(0, 
NA, NA, NA, NA), BP_mbar = c(976.4229, NA, NA, NA, NA), source = c("E:\\Data_Collected_withDups/Mead/NE-Mead-CR1000-Table101-dat (2).backup", 
"E:\\Data_Collected_withDups/Mead/NE-Mead-CR1000-Table301-dat", 
"E:\\Data_Collected_withDups/Mead/NE-Mead-CR1000-Table302-dat", 
"E:\\Data_Collected_withDups/Mead/NE-Mead-CR1000-Table303-dat", 
"E:\\Data_Collected_withDups/Mead/NE-Mead-CR1000-Table304-dat"
)), row.names = c(44257L, 74695L, 76522L, 78349L, 80176L), class = "data.frame")

I want one data row that looks like this:

structure(list(Station_Number = "0001", Station_Name = "Ithaca 3E", 
NWSLI = "ITMN1", Station_Legacy_Name = "Mead", Lat = 41.1528611, 
Lon = -96.4914444, Elev_m = "353.7", TIMESTAMP = "2015-04-16 10:00:00", 
RECORD = 0L, Ms_veg_10cm = NA_real_, Ms_veg_50cm = NA_real_, 
Ms_veg_100cm = NA_real_, Ms_veg_25cm = NA_real_, Ta_2m_Avg = NA_real_, 
RH_2m_Avg = NA_real_, Ts_bare_10cm_Avg = NA_real_, Solar_2m_Avg = NA_real_, 
WS_ms_S_WVT = 0.698, WS_ms_U_WVT = 0.586, WindDir_DU_WVT = 212.4, 
WindDir_SDU_WVT = 23.94, Rain_1m_Tot = NA_real_, Pres_2m = NA_real_, 
BattVolts_Min = 13.28, Ta_C_Avg = 17.83, RH_Avg = 66.69, 
Ts_C_Avg = 11.61, Solar_Avg = NA_real_, Theta_10 = -4.73, 
Theta_25 = NA_real_, Theta_50 = NA_real_, Theta_100 = NA_real_, 
Rain_mm_Tot = 0, BP_mbar = 976.4229, source = "E:\\Data_Collected_withDups/Mead/NE-Mead-CR1000-Table101-dat (2).backup"), row.names = 1L, class = "data.frame")

I've tried:

coalesce(unlist(my_data[1:5,]))

and other versions of this, but to no luck. How would I get this to coalesce down to what I want using coalesce, or any other method for that matter?

2

There are 2 best solutions below

1
Jon Spring On BEST ANSWER

This will fill any non-NA's down into any NA's below. Then we can take the last row.

my_data |>
  tidyr::fill(everything()) |>
  tail(1)

Result

      Station_Number Station_Name NWSLI Station_Legacy_Name      Lat       Lon Elev_m           TIMESTAMP RECORD Ms_veg_10cm Ms_veg_50cm Ms_veg_100cm Ms_veg_25cm Ta_2m_Avg RH_2m_Avg Ts_bare_10cm_Avg Solar_2m_Avg WS_ms_S_WVT WS_ms_U_WVT WindDir_DU_WVT WindDir_SDU_WVT Rain_1m_Tot Pres_2m BattVolts_Min Ta_C_Avg RH_Avg Ts_C_Avg Solar_Avg Theta_10 Theta_25 Theta_50 Theta_100 Rain_mm_Tot  BP_mbar
80176           0001    Ithaca 3E ITMN1                Mead 41.15286 -96.49144  353.7 2015-04-16 10:00:00      0          NA          NA           NA          NA        NA        NA               NA           NA       0.698       0.586          212.4           23.94          NA      NA         13.28    17.83  66.69    11.61        NA    -4.73   -4.848   -4.932    -4.949           0 976.4229
0
Nicholas Ray On

This may be difficult to generalize, but this will achieve what you're after.

my_data <- data.frame(
     Station_Number = c("0001", "0001", "0001", "0001","0001"),
     Station_Name = c("Ithaca 3E", "Ithaca 3E", "Ithaca 3E","Ithaca 3E",
                      "Ithaca 3E"),
     NWSLI = c("ITMN1", "ITMN1", "ITMN1", "ITMN1", "ITMN1"),
     Station_Legacy_Name = c("Mead","Mead","Mead", "Mead", "Mead"),
     Lat = c(41.1528611, 41.1528611, 41.1528611, 41.1528611, 41.1528611),
     Lon = c(-96.4914444, -96.4914444, -96.4914444, -96.4914444, -96.4914444),
     Elev_m = c("353.7", "353.7", "353.7", "353.7", "353.7"),
     TIMESTAMP = c("2015-04-16 10:00:00", "2015-04-16 10:00:00",
                   "2015-04-16 10:00:00", "2015-04-16 10:00:00",
                   "2015-04-16 10:00:00"),
     RECORD = c(0L, 0L, 0L, 0L, 0L),
     Ms_veg_10cm = c(NA_real_, NA_real_,NA_real_, NA_real_, NA_real_),
     Ms_veg_50cm = c(NA_real_, NA_real_, NA_real_, NA_real_,NA_real_),
     Ms_veg_100cm = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
     Ms_veg_25cm = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
     Ta_2m_Avg = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
     RH_2m_Avg = c(NA_real_, NA_real_,NA_real_, NA_real_, NA_real_),
     Ts_bare_10cm_Avg = c(NA_real_, NA_real_, NA_real_,NA_real_, NA_real_),
     Solar_2m_Avg = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
     WS_ms_S_WVT = c(0.698, NA, NA, NA, NA),
     WS_ms_U_WVT = c(0.586, NA, NA, NA, NA),
     WindDir_DU_WVT = c(212.4, NA, NA, NA, NA),
     WindDir_SDU_WVT = c(23.94, NA, NA, NA, NA),
     Rain_1m_Tot = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
     Pres_2m = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), 
     BattVolts_Min = c(13.28, NA, NA, NA, NA), 
     Ta_C_Avg = c(17.83,NA, NA, NA, NA),
     RH_Avg = c(66.69, NA, NA, NA, NA),
     Ts_C_Avg = c(11.61,NA, NA, NA, NA),
     Solar_Avg = c(NA_real_, NA_real_, NA_real_,NA_real_, NA_real_),
     Theta_10 = c(NA, -4.73, NA, NA, NA),
     Theta_25 = c(NA, NA, -4.848, NA, NA),
     Theta_50 = c(NA, NA,NA, -4.932, NA),
     Theta_100 = c(NA, NA, NA, NA, -4.949),
     Rain_mm_Tot = c(0,NA, NA, NA, NA),
     BP_mbar = c(976.4229, NA, NA, NA, NA),
     row.names = c(44257L, 74695L, 76522L, 78349L, 80176L)
)
library(zoo)
for (i in 1:ncol(my_data)) {
  my_data[,i]<-ifelse(is.na(my_data[1,i]),na.locf(my_data[,i],fromLast=TRUE),
                      na.locf(my_data[,i]))
}
my_data<-my_data[1,]

This hinges on the fact that the columns you wanted to condense (or coalesce) all only have one observation. You can just fill the other (empty) rows with that one observation and then just select one row from that data frame.