Calculate Probit Estimates for samples via Loop

69 Views Asked by At

I have a sample dataset that looks like this:

library(dplyr)
test_df <- structure(list(test = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L), test2 = c(34.6, 
53.3, 53.9, 71.5, 40.6, 15.3, 34, 62.7, 5.7, 85.2, 21.3, 53.9, 
13.7, 32.5, 62.1, 25.6, 63.5, 48.6, 93.8, 85.7, 37.1, 31.4, 82.8, 
45.2, 31.6, 9.8, 6.5, 68.9, 66.8, 90.4, 30.2, 93.3, 20.2, 79.2, 
22.5, 3.1, 86.2, 68.5, 94.2, 67.6, 84.3, 36.2, 39.2, 56.8, 9.5, 
19.4, 58.8, 75.1, 86.7, 37.2, 79.9, 5.8, 62.3, 35.7, 58.8, 91.4, 
20, 36.9, 67.1, 76.8, 52.2, 82.8, 52.7, 50.2, 42, 36.2, 12.4, 
29.8, 27.7, 77, 77.8, 14.4, 51.6, 59.7, 50.6, 38.6, 42.6, 1.2, 
91.9, 8, 50.7, 82, 59.8, 42.4, 55.9, 78.9, 16.8, 97, 47.4, 93, 
90.1, 75.1, 67.7, 64.8, 7.3, 42.4, 53.1, 94.3, 71.2, 72.4, 34.6, 
53.3, 53.9, 71.5, 40.6, 15.3, 34, 62.7, 5.7, 85.2, 21.3, 53.9, 
13.7, 32.5, 62.1, 25.6, 63.5, 48.6, 93.8, 85.7, 37.1, 31.4, 82.8, 
45.2, 31.6, 9.8, 6.5, 68.9, 66.8, 90.4, 30.2, 93.3, 20.2, 79.2, 
22.5, 3.1, 86.2, 68.5, 94.2, 67.6, 84.3, 36.2, 39.2, 56.8, 9.5, 
19.4, 58.8, 75.1, 86.7, 37.2, 79.9, 5.8, 62.3, 35.7, 58.8, 91.4, 
20, 36.9, 67.1, 76.8, 52.2, 82.8, 52.7, 50.2, 42, 36.2, 12.4, 
29.8, 27.7, 77, 77.8, 14.4, 51.6, 59.7, 50.6, 38.6, 42.6, 1.2, 
91.9, 8, 50.7, 82, 59.8, 42.4, 55.9, 78.9, 16.8, 97, 47.4, 93, 
90.1, 75.1, 67.7, 64.8, 7.3, 42.4, 53.1, 94.3, 71.2, 72.4), test3 = c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L)), class = "data.frame", row.names = c(NA, -200L
))

I am now trying to calculate Probit-Estimates via glm. I want to calculate the same model a few times always choosing a different sample from the dataset based on test3. So far I got

id <- unique(test_df$test3)
for (i in 1:5)
{
  random[i] <- sample(id,3)
  data[i] <- test_df %>% filter(id %in% random[i])
  model <- glm(test~test2+test3,family = binomial(link = "probit"),data = data[i])
  res <- summary(model)
  results[i] <- res$coefficients
}

However, this returns

Error: Problem with `filter()` input `..1`.
x Input `..1` must be of size 200 or 1, not size 10.
ℹ Input `..1` is `id %in% random[i]`.

Does anyone know how to fix this so that in the end I get 5 different coefficients for every variable? Thanks a lot!

2

There are 2 best solutions below

0
On

Normally loops that repeat analytic processes fail because the inner logic does not have indexing on the LHS of assignments. Here it fails because of those indices which are not initialized or might have been but perhaps were indexed improperly: Try this modification:

id <- unique(test_df$test3); results <- list()
for (i in 1:5)
{
    random <- sample(id,3)
    data1 <- test_df %>% filter(test3 %in% random) #note the need to use a column name
    model <- glm(test~test2+test3,family = binomial(link = "probit"),data = data1)
    res <- summary(model)
    results[[i]] <- res$coefficients # note need to use `[[` rather than `[`
}

str(results)
List of 5
 $ : num [1:3, 1:4] 6.85 8.02e-12 5.50e-10 2.18e+04 2.37e+02 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:3] "(Intercept)" "test2" "test3"
  .. ..$ : chr [1:4] "Estimate" "Std. Error" "z value" "Pr(>|z|)"
 $ : num [1:3, 1:4] 3.35e+01 -3.41e-04 -1.34e+01 2.49e+04 1.73e+02 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:3] "(Intercept)" "test2" "test3"
  .. ..$ : chr [1:4] "Estimate" "Std. Error" "z value" "Pr(>|z|)"
 $ : num [1:3, 1:4] -0.414455 -0.000279 -0.000755 0.59573 0.006827 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:3] "(Intercept)" "test2" "test3"
  .. ..$ : chr [1:4] "Estimate" "Std. Error" "z value" "Pr(>|z|)"
 $ : num [1:3, 1:4] -1.54385 -0.00582 0.24211 0.7596 0.00707 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:3] "(Intercept)" "test2" "test3"
  .. ..$ : chr [1:4] "Estimate" "Std. Error" "z value" "Pr(>|z|)"
 $ : num [1:3, 1:4] -0.62463 0.00318 0.23255 0.48835 0.00699 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:3] "(Intercept)" "test2" "test3"
  .. ..$ : chr [1:4] "Estimate" "Std. Error" "z value" "Pr(>|z|)"
0
On

Does this help?

library(tidyverse)
library(broom)


id <- unique(test_df$test3)

results <- tibble(i=NA, results=NA)

for (i in 1:5)
{ 

  random <- sample(id, 3)

  
  data <- test_df %>% filter(test3 %in% random)
  model <- glm(test~test2+test3,family = binomial(link = "probit"), data = data)
  
  res <- tibble(i=i, results=tidy(model))
  
  results <- bind_rows(results, res)
  
}
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

results
#> # A tibble: 16 x 2
#>        i results$term $estimate  $std.error $statistic $p.value
#>    <int> <chr>            <dbl>       <dbl>      <dbl>    <dbl>
#>  1    NA <NA>         NA           NA        NA        NA      
#>  2     1 (Intercept)  -2.95e+ 1 29910.       -9.88e- 4  0.999  
#>  3     1 test2         2.37e- 3   282.        8.41e- 6  1.00   
#>  4     1 test3         4.54e+ 0  4455.        1.02e- 3  0.999  
#>  5     2 (Intercept)  -6.85e+ 0 15834.       -4.32e- 4  1.00   
#>  6     2 test2        -3.83e-17   211.       -1.82e-19  1      
#>  7     2 test3        -5.52e-16  1732.       -3.19e-19  1      
#>  8     3 (Intercept)  -1.34e+ 0     0.610    -2.20e+ 0  0.0277 
#>  9     3 test2        -5.49e- 3     0.00704  -7.80e- 1  0.436  
#> 10     3 test3         2.20e- 1     0.0834    2.64e+ 0  0.00827
#> 11     4 (Intercept)  -6.85e+ 0 15061.       -4.55e- 4  1.00   
#> 12     4 test2        -2.94e-18   224.       -1.32e-20  1      
#> 13     4 test3         4.95e-16  1821.        2.72e-19  1      
#> 14     5 (Intercept)  -3.52e- 1     0.585    -6.01e- 1  0.548  
#> 15     5 test2        -1.68e- 3     0.00678  -2.48e- 1  0.804  
#> 16     5 test3         2.52e- 1     0.114     2.21e+ 0  0.0272

Created on 2020-12-10 by the reprex package (v0.3.0)