How to select n values spaced as evenly as possible between the minimum and maximum in r?

148 Views Asked by At

Considering groups (gp) of n results (ngp), how to select/subset given numbers of results (nesgp) that are spaced as evenly as possible between the minimum and maximum (both necessarily included) in a new column selec?
Edit: Ideally, unselected results should appear as NA in the new selec column, not duplicated.

> print(dat, n=56)
# A tibble: 56 x 4
   gp    result   ngp nesgp
   <chr>  <dbl> <dbl> <dbl>
 1 CA      1.64    24    15
 2 CA      1.69    24    15
 3 CA      1.71    24    15
 4 CA      1.74    24    15
 5 CA      1.78    24    15
 6 CA      1.82    24    15
 7 CA      1.86    24    15
 8 CA      1.9     24    15
 9 CA      1.94    24    15
10 CA      1.98    24    15
11 CA      2.6     24    15
12 CA      2.65    24    15
13 CA      2.71    24    15
14 CA      2.76    24    15
15 CA      2.83    24    15
16 CA      2.89    24    15
17 CA      2.94    24    15
18 CA      3       24    15
19 CA      3.22    24    15
20 CA      3.42    24    15
21 CA      3.47    24    15
22 CA      3.68    24    15
23 CA      3.85    24    15
24 CA      4.38    24    15
25 ASAT    9       20    12
26 ASAT   11       20    12
27 ASAT   51       20    12
28 ASAT   61       20    12
29 ASAT   69       20    12
30 ASAT   78       20    12
31 ASAT   89       20    12
32 ASAT  102       20    12
33 ASAT  111       20    12
34 ASAT  120       20    12
35 ASAT  146       20    12
36 ASAT  163       20    12
37 ASAT  189       20    12
38 ASAT  208       20    12
39 ASAT  218       20    12
40 ASAT  304       20    12
41 ASAT  332       20    12
42 ASAT  345       20    12
43 ASAT  362       20    12
44 ASAT  402       20    12
45 ORO     0.56    12     8
46 ORO     0.7     12     8
47 ORO     0.77    12     8
48 ORO     0.78    12     8
49 ORO     0.82    12     8
50 ORO     0.82    12     8
51 ORO     0.92    12     8
52 ORO     0.94    12     8
53 ORO     1.16    12     8
54 ORO     1.46    12     8
55 ORO     1.54    12     8
56 ORO     1.77    12     8 

Data

dat <-
structure(list(gp = c("CA", "CA", "CA", "CA", "CA", "CA", "CA", 
"CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", 
"CA", "CA", "CA", "CA", "CA", "CA", "ASAT", "ASAT", "ASAT", "ASAT", 
"ASAT", "ASAT", "ASAT", "ASAT", "ASAT", "ASAT", "ASAT", "ASAT", 
"ASAT", "ASAT", "ASAT", "ASAT", "ASAT", "ASAT", "ASAT", "ASAT", 
"ORO", "ORO", "ORO", "ORO", "ORO", "ORO", "ORO", "ORO", "ORO", 
"ORO", "ORO", "ORO"), result = c(1.64, 1.69, 1.71, 1.74, 1.78, 
1.82, 1.86, 1.9, 1.94, 1.98, 2.6, 2.65, 2.71, 2.76, 2.83, 2.89, 
2.94, 3, 3.22, 3.42, 3.47, 3.68, 3.85, 4.38, 9, 11, 51, 61, 69, 
78, 89, 102, 111, 120, 146, 163, 189, 208, 218, 304, 332, 345, 
362, 402, 0.56, 0.7, 0.77, 0.78, 0.82, 0.82, 0.92, 0.94, 1.16, 
1.46, 1.54, 1.77), ngp = c(24, 24, 24, 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 20, 
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 
20, 20, 20, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), 
    nesgp = c(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 12, 12, 12, 
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 
    12, 12, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -56L)) 

Thanks for help.

2

There are 2 best solutions below

6
On BEST ANSWER

I'm not sure what you mean by "spaced as evenly as possible" but I wrote an example that uses sampling of # of points to minimize the spread between their deltas that could be a good starting point for you:

par(mfrow = c(length(unique(dat$gp)), 1))
dat$selec <- NA
# for each group, 
groups <- unique(dat$gp)
for(gp in groups){
  x <- dat$result[dat$gp == gp]
  minmax_x <- range(x)
  possible_xs <- x[!(x %in% minmax_x)]
  # run a lot of samples of different possible lengths to test
  r <- replicate(20000, sort(c(minmax_x,
                               sample(possible_xs, 
                                      size = sample(3:length(possible_xs),1)
                                      )
                               )
                             )
                 )

  spreads <- sapply(r, function(obj) var(diff(obj)))
  minimized_variance_index <- which.min(spreads)
  dat$selec[which(dat$result %in% r[[minimized_variance_index]])] <- 1
  # visualize
  plot(x, rep(1, length(x)), yaxt = "n", ylab = "", xlab = "result",
       main = paste(gp,", spread =", round(var(diff(r[[minimized_variance_index]])),5)))
  abline(v= r[[minimized_variance_index]])
}

There are not as many points selected in this case as what you seem to be looking for. enter image description here

Updated as per comment to show another approach.

If you would like to first determine an ideal distribution based on an evenly spread number of points, you'll just have to come up with that arbitrary number num_intervals <- length(x)-1

Here are the functions that make the coding a little easier

create_equal_spaced_intervals <- function(x_values, num_intervals){
  intervals <- seq(from = min(x_values), to = max(x_values), length.out = num_intervals)
  names(intervals) <- paste0("interval",1:num_intervals)
  return(intervals)
}

snap_closest_x_to_closest_y <- function(x_values, y_values){
  rowMins <- function(a) apply(a, 1, function(b) which.min(b))
  colMins <- function(a) apply(a, 2, function(b) which.min(b))
  absolute_dist_matrix <- abs(outer(x_values, y_values, "-"))
  snapped_Ys <- unique(rowMins(absolute_dist_matrix))
  snapped_Xs <- colMins(absolute_dist_matrix[,snapped_Ys])
  return(x_values[snapped_Xs]) 
}

corr_of_var_fn <- function(x) round(sd(diff(x))/mean(diff(x)), 4)

And here is how to go about performing the algo

# ANALYSIS BY GP
dat_by_gp <- split(dat, dat$gp, drop= T)
spread_results_by_gp <- vector("list", length(dat_by_gp))
for(i in 1:length(dat_by_gp)){
  subdat <- dat_by_gp[[i]]
  subdat$selec <- NA
  no_dupes <- which(!duplicated(subdat$result))
  vec <- subdat$result[no_dupes]
  n <- length(vec)
  
  spread_results <- rep(NA, n)
  # identify the best interval to use
  # by iterating from 3 to the size
  # can change the 3 though..
  for(num_intervals in 3:n){
    intervals <- create_equal_spaced_intervals(vec, num_intervals)
    selec <- snap_closest_x_to_closest_y(x_values = vec, y_values = intervals)

    # measure result
    spread_results[num_intervals] <- corr_of_var_fn(selec)
  }
  # get the MOST EVEN result
  number_of_intervals <- which.min(spread_results)
  selec <- snap_closest_x_to_closest_y(vec, create_equal_spaced_intervals(vec, number_of_intervals))
  # assign back to the matrix
  index <- which(subdat$result[no_dupes] %in% selec)
  subdat$selec[no_dupes][index] <- 1
  spread_results_by_gp[[i]] <- spread_results
  dat_by_gp[[i]] <- subdat
  
  cat(subdat$gp[1], "Using ", number_of_intervals, 
      " intervals which produces a spread of ", spread_results[which.min(spread_results)], 
      "and ", length(selec), "results\n")
}
# and you could overwrite your dat object by using these values
dat$selec <- do.call(rbind, dat_by_gp)$selec

We can also visualize the results by doing the following

# visualize individually below
plot_individual_interval_comparison <- function(x){
  default_plot_params <- par(no.readonly = TRUE)
  vec <- x[!duplicated(x)]
  n <- length(vec)
  spread_results <- rep(NA, n-2)
  par(mfrow = c(n-2, 1), mar = c(0,6,0,0), oma = c(3,1,1,1), las = 2)
  for(num_intervals in 3:n){
    intervals <- create_equal_spaced_intervals(vec, num_intervals)
    selec <- snap_closest_x_to_closest_y(x_values = vec, y_values = intervals)
    # measure result
    corr_of_var = corr_of_var_fn(selec)
    spread_results[num_intervals] <- corr_of_var
    # visualize
    plot(x, rep(1, length(x)), xaxt = "n", yaxt = "n", xlab = "", ylab = "")
    mtext(paste("intervals=",num_intervals,"\n","spread=",corr_of_var), side = 2,line = 1, cex = .6)
    abline(v = intervals, col = 'gray', lty = 1, lwd = 1)
    abline(v = selec, col = 'blue', lty = 2, lwd = 2)
  }
  par(default_plot_params)
}

# making the plots

plot_individual_interval_comparison(dat$result[dat$gp == "CA"])
plot_individual_interval_comparison(dat$result[dat$gp == "ASAT"])
plot_individual_interval_comparison(dat$result[dat$gp == "ORO"])

par(mfrow= c(1,3))
plot(spread_results_by_gp[[1]], main = "CA", ylab = "spread", type = 'o')
plot(spread_results_by_gp[[2]], main = "ASAT", ylab = "spread", type = 'o')
plot(spread_results_by_gp[[3]], main = "ORO", ylab = "spread", type = 'o')

You'll notice this approach doesn't give you quite the same visual even spread as the previous approach.

enter image description here

1
On

The following solution uses an auxiliary function fun, maybe there are simpler ways of doing the same.
The function creates a sequence of n values and uses findInterval to see where in x those values lie. Then, in the for loop, checks the distances to the extremes of each interval and assigns the smallest to the return value y.

suppressPackageStartupMessages(
  library(tidyverse)
)

fun <- function(x, n, na.rm = FALSE) {
  xmin <- min(x, na.rm = na.rm)
  xmax <- max(x, na.rm = na.rm)
  ref <- seq(xmin, xmax, length.out = n)
  x <- sort(x)
  j <- findInterval(ref, x)
  y <- numeric(n)
  y[1L] <- xmin
  y[n] <- xmax
  for(i in seq_len(n)[-c(1L, n)]) {
    if(abs(ref[i] - x[ j[i] ]) < abs(ref[i] - x[ j[i + 1L] ])) {
      y[i] <- x[ j[i] ]
    } else y[i] <- x[ j[i + 1L] ]
  }
  y
}
dat %>%
  reframe(selec = fun(result, first(ngp)), .by = gp)
#> # A tibble: 56 × 2
#>    gp    selec
#>    <chr> <dbl>
#>  1 CA     1.64
#>  2 CA     1.74
#>  3 CA     1.86
#>  4 CA     1.98
#>  5 CA     1.98
#>  6 CA     1.98
#>  7 CA     1.98
#>  8 CA     1.98
#>  9 CA     2.71
#> 10 CA     2.71
#> # ℹ 46 more rows

Created on 2024-02-04 with reprex v2.0.2