Add data from another dataframe based on nearest spatial location (lon, lat) and datetime

149 Views Asked by At

I have two very large datasets. Dataset 1 has sampling locations with lat, lon and date. Like this:

           datetime longitude latitude
2022-08-10 03:26:08  147.8521 -20.2443
2020-10-02 16:12:52  152.3652 -23.1234

Dataset 2 has temperature data that I have extracted from a netCDF file. This dataframe is much bigger in terms of spatial extent and data records (as it includes data for many irrelevant locations and every day). Like this:

           datetime longitude latitude temp
2022-08-10 12:00:00 147.8601 -20.2423 20.62
2022-08-10 04:30:08 147.8601 -20.2423 21.49
2022-08-11 09:10:23 152.3633 -23.1225 21.55
2020-10-02 16:12:52 152.4213 -23.1562 20.80
2020-10-02 16:12:52 153.4213 -24.1562 21.10
2020-11-01 12:00:00 152.4213 -23.1562 21.33

I would like to find the nearest neighbour from Dataset 2 based on nearest location (lat, lon) first and foremost, then nearest datetime, to populate a new column ('temp') in Dataset 1.

I would like the end result to look like this:

           datetime longitude latitude temp
2022-08-10 03:26:08  147.8521 -20.2443 21.49
2020-10-02 16:12:52  152.3652 -23.1234 20.80

I am competent at matching values from different datasets, however, matching based on nearest neighbours in space and time is beyond my skill level.

After hours of fruitless searching I have come up with no answers. Can anyone help?

2

There are 2 best solutions below

4
Umar On

Here sp::spDists function is used to calculates spatial distances between two datasets (df1 and df2). It then iterates through each point in df1, identifying the nearest neighbor in df2 based on both spatial and temporal proximity. The criteria (max_acceptable_distance and max_acceptable_time_difference) ensure accurate matches.

set.seed(123)
df1 <- data.frame(
  datetime = as.POSIXct(c("2022-08-10 03:26:08", "2020-10-02 16:12:52")),
  longitude = c(147.8521, 152.3652),
  latitude = c(-20.2443, -23.1234)
)

df2 <- data.frame(
  datetime = as.POSIXct(c("2022-08-10 12:00:00", "2022-08-10 04:30:08", "2022-08-11 09:10:23", "2020-10-02 16:12:52", "2020-11-01 12:00:00")),
  longitude = c(147.8601, 147.8601, 152.3633, 152.4213, 152.4213),
  latitude = c(-20.2423, -20.2423, -23.1225, -23.1562, -23.1562),
  temp = c(20.62, 21.49, 21.55, 20.80, 21.33)
)

library(sf)
library(dplyr)

# Convert dataframes to spatial data frames
df1_sf <- st_as_sf(df1, coords = c("longitude", "latitude"), crs = 4326)
df2_sf <- st_as_sf(df2, coords = c("longitude", "latitude"), crs = 4326)

# Compute spatial distances between points
spatial_distances <- sp::spDists(as.matrix(st_coordinates(df1_sf)), as.matrix(st_coordinates(df2_sf)))

# Convert datetime strings to POSIXct
df1$datetime <- as.POSIXct(df1$datetime)
df2$datetime <- as.POSIXct(df2$datetime)

# Initialize an empty vector to store temperature values
df1$temp <- numeric(nrow(df1))

# Find nearest neighbors based on both spatial and temporal distances
for (i in seq_along(df1$temp)) {
  row <- spatial_distances[i, ]
  min_dist <- which.min(row)
  
  # Check if the minimum distance is within acceptable range
  if (row[min_dist] < max_acceptable_distance) {
    time_diff <- abs(difftime(df1$datetime[i], df2$datetime))
    min_time_diff <- which.min(time_diff)
    
    # Check if the minimum time difference is within acceptable range
    if (time_diff[min_time_diff] < max_acceptable_time_difference) {
      df1$temp[i] <- df2$temp[min_time_diff]
    }
  }
}



    print(df1)


             

    datetime longitude latitude  temp
1 2022-08-10 03:26:08  147.8521 -20.2443 21.49
2 2020-10-02 16:12:52  152.3652 -23.1234 20.80

but if i use this code, gives slightly different results

    # Convert dataframes to sf data frames
df1_sf <- st_as_sf(df1, coords = c("longitude", "latitude"), crs = 4326)
df2_sf <- st_as_sf(df2, coords = c("longitude", "latitude"), crs = 4326)

# Initialize an empty vector to store temperature values
df1$temp <- numeric(nrow(df1))
# Calculate distances and ranks
dd <- outer(seq_len(nrow(df1)), seq_len(nrow(df2)), Vectorize(fdist))
dt <- abs(outer(df1$datetime, df2$datetime, difftime))
dmin <- dd == matrixStats::rowMins(dd)
trk <- matrixStats::rowRanks(dt)

# Find the index of the minimum rank
indices <- vapply(seq_len(nrow(df1)), function(i) 
  which.min(trk[i, dmin[i, ]]), numeric(1))

# Assign the temp values from df2 to df1 based on the nearest index
df1$temp <- df2$temp[indices]

# Print the result
print(df1)

             datetime longitude latitude  temp
1 2022-08-10 03:26:08  147.8521 -20.2443 21.49
2 2020-10-02 16:12:52  152.3652 -23.1234 20.62

Another way u can get it by

library(sf)
library(dplyr)

# Convert dataframes to sf data frames
df1_sf <- st_as_sf(df1, coords = c("longitude", "latitude"), crs = 4326)
df2_sf <- st_as_sf(df2, coords = c("longitude", "latitude"), crs = 4326)

# Initialize an empty vector to store temperature values
df1$temp <- numeric(nrow(df1))

# Find the closest match in df2 for each point in df1 based on location and datetime
for (i in seq_along(df1$temp)) {
  df2_sorted <- df2_sf[order(abs(df2_sf$datetime - df1_sf$datetime[i]), st_distance(df2_sf, df1_sf[i, ])), ]
  df1$temp[i] <- df2_sorted$temp[1]  # Take the temperature of the closest match
}

# Extract only the required columns from the result
df1_result <- df1[c("datetime", "longitude", "latitude")]
df1_result$temp <- df1$temp
print(df1_result)
   datetime longitude latitude  temp
1 2022-08-10 03:26:08  147.8521 -20.2443 21.49
2 2020-10-02 16:12:52  152.3652 -23.1234 20.80
2
jay.sf On

We could use geosphere::distHaversine. Put it in outer and do similar with the datetimes using difftime which gives you matrices of differences. Next we look for which rows of distance matrix dd are the rowMins. To break ties we subset the rowRanks of the time distance matrix by dd and look for which.min.

> fdist <- \(x, y) geosphere::distHaversine(df1[x, c('longitude', 'latitude')], 
+                                           df2[y, c('longitude', 'latitude')])
> dd <- outer(seq_len(nrow(df1)), seq_len(nrow(df2)), Vectorize(fdist))
> dt <- abs(outer(df1$datetime, df2$datetime, difftime))
> dmin <- dd == matrixStats::rowMins(dd)
> trk <- matrixStats::rowRanks(dt)
> (slc <- vapply(seq_len(nrow(df1)), \(i) {
+   w <- which(dmin[i, ])
+   w[which.min(trk[i, w])]
+ }, numeric(1)))
[1] 2 3  ## selected rows of df2
> res <- df2$temp[slc]
> cbind(df1, temp=res)
             datetime longitude latitude  temp
1 2022-08-10 03:26:08  147.8521 -20.2443 21.49
2 2020-10-02 16:12:52  152.3652 -23.1234 21.55

Note that I get a different result than you. We can cbind the stuff for verification:

> cbind(df2, t(dd), dt=unclass(t(dt)))  ## haversine and time distances to rows 1 and 2 of df1
             datetime longitude latitude  temp           1           2     dt.1     dt.2
1 2022-08-10 12:00:00  147.8601 -20.2423 20.62    864.7013 565645.9494    30832 58477628
2 2022-08-10 04:30:08  147.8601 -20.2423 21.49    864.7013 565645.9494     3840 58450636
3 2022-08-11 09:10:23  152.3633 -23.1225 21.55 565981.5306    218.8008   107055 58553851
4 2020-10-02 16:12:52  152.4213 -23.1562 20.80 573002.7003   6805.0961 58446796        0
5 2020-10-02 16:12:52  153.4213 -24.1562 21.10 720336.1492 157533.5176 58446796        0
6 2020-11-01 12:00:00  152.4213 -23.1562 21.33 573002.7003   6805.0961 55866368  2580428

Data:

> dput(df1)
structure(list(datetime = structure(c(1660094768, 1601647972), class = c("POSIXct", 
"POSIXt"), tzone = ""), longitude = c(147.8521, 152.3652), latitude = c(-20.2443, 
-23.1234)), class = "data.frame", row.names = c(NA, -2L))
> dput(df2)
structure(list(datetime = structure(c(1660125600, 1660098608, 
1660201823, 1601647972, 1601647972, 1604228400), class = c("POSIXct", 
"POSIXt"), tzone = ""), longitude = c(147.8601, 147.8601, 152.3633, 
152.4213, 153.4213, 152.4213), latitude = c(-20.2423, -20.2423, 
-23.1225, -23.1562, -24.1562, -23.1562), temp = c(20.62, 21.49, 
21.55, 20.8, 21.1, 21.33)), class = "data.frame", row.names = c(NA, 
-6L))