R caret's train with poly of a given degree

310 Views Asked by At

I'm looping over the degree of the approximating polynomial while training with caret:

# Candidate polynomial degrees to try, one model fit per degree.
ds <- 1:20
for (i in seq_along(ds)) {
  print(i)
  d <- ds[i]
  # The degree is passed through the variable `d`, not written as a literal.
  fit <- train(
    y ~ poly(x, degree = d),
    data = training,
    method = "lm",
    trControl = fitCtrl
  )
  # other operations
}

running the code gives

Error in `[.data.frame`(data, 0, cols, drop = FALSE) : 
undefined columns selected

Setting d=4 and passing degree=d doesn't work, but hard-coding the degree in the call, i.e. degree=4, works.

Any guess of what's going on here?

Thanks!

EDIT:

library(caret)

# Fix the random seed so the simulated noise is reproducible.
set.seed(1)

# Parameters of the noise-free signal.
omega <- 0.5 * pi
xi <- 0.5
phi <- 0.5 * pi

# Noise-free signal: one minus an exponentially decaying, phase-shifted sine,
# scaled by sin(phi).
f <- function(t) {
  1 - exp(-xi * omega * t) * sin(sqrt(1 - xi^2) * omega * t + phi) / sin(phi)
}

# Simulated training set: train.n equally spaced points on [0, 2*pi] with
# additive Gaussian noise of standard deviation sigma.
sigma <- 0.03
train.n <- 100
x <- seq(0, 2 * pi, by = 2 * pi / (train.n - 1))
y <- f(x) + rnorm(train.n, mean = 0, sd = sigma)
training <- data.frame(x = x, y = y)

# Leave-one-out cross-validation, without per-iteration logging.
fitCtrl <- trainControl(method = "LOOCV", verboseIter = FALSE)

ds <- 1:20
for (i in seq_along(ds)) {
  print(i)
  # `d` is assigned but the call below hard-codes the degree as a literal.
  d <- 4
  fit <- train(
    y ~ poly(x, degree = 4),
    data = training,
    method = "lm",
    trControl = fitCtrl
  )
}
2

There are 2 best solutions below

0
On

The problem here is actually that caret is using all.vars() on your formula under the hood to create the dataframe needed for modeling. As you can see, d is thought to be one of these variables.

# List every symbol the formula refers to; `d` is reported alongside y and x.
degree_formula <- y ~ poly(x, degree = d)
all.vars(degree_formula)
#> [1] "y" "x" "d"

Typically, one could solve these issues with the use of I() in the formula or force() around it, but not with all.vars().

The only way to fix this is to not send in d in your formula, but have it be a number beforehand.

Using as.formula(paste0("y ~ poly(x, degree=", d, ")")) in your loop will achieve this (as also suggested by @akrun).

Here is a working example based on your code:

library(caret)

# Fix the random seed so the simulated noise is reproducible.
set.seed(1)

# Parameters of the noise-free signal.
omega <- 0.5 * pi
xi <- 0.5
phi <- 0.5 * pi

# Noise-free signal: one minus an exponentially decaying, phase-shifted sine,
# scaled by sin(phi).
f <- function(t) {
  1 - exp(-xi * omega * t) * sin(sqrt(1 - xi^2) * omega * t + phi) / sin(phi)
}

# Simulated training set: train.n equally spaced points on [0, 2*pi] with
# additive Gaussian noise of standard deviation sigma.
sigma <- 0.03
train.n <- 100
x <- seq(0, 2 * pi, by = 2 * pi / (train.n - 1))
y <- f(x) + rnorm(train.n, mean = 0, sd = sigma)
training <- data.frame(x = x, y = y)

# Leave-one-out cross-validation, without per-iteration logging.
fitCtrl <- trainControl(method = "LOOCV", verboseIter = FALSE)

# Shortening this to 2 to illustrate that it's working
ds <- 1:2

for (i in seq_along(ds)) {
  print(i)
  d <- ds[i]

  # Paste the numeric degree into the formula text so that the resulting
  # formula contains a literal number and not the symbol `d`.
  fit <- train(
    as.formula(paste0("y ~ poly(x, degree=", d, ")")),
    data = training,
    method = "lm",
    trControl = fitCtrl
  )
}
#> [1] 1
#> [1] 2

# Notice that we have two poly-terms, so it's working
summary(fit)
#> 
#> Call:
#> lm(formula = .outcome ~ ., data = dat)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.42507 -0.11917  0.00392  0.13771  0.27425 
#> 
#> Coefficients:
#>                        Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)             0.94738    0.01655  57.237  < 2e-16 ***
#> `poly(x, degree = 2)1`  1.10450    0.16552   6.673 1.56e-09 ***
#> `poly(x, degree = 2)2` -1.62076    0.16552  -9.792 3.77e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.1655 on 97 degrees of freedom
#> Multiple R-squared:  0.5914, Adjusted R-squared:  0.583 
#> F-statistic: 70.21 on 2 and 97 DF,  p-value: < 2.2e-16

Created on 2022-03-20 by the reprex package (v2.0.1)

1
On

We may use paste to create the formula here

# Substitute the numeric degree into the formula text before calling train().
d <- 4
train(
  as.formula(paste0("y ~ poly(x, degree =", d, ")")),
  data = training,
  method = "lm",
  trControl = fitCtrl
)

-output

Linear Regression 

100 samples
  1 predictor

No pre-processing
Resampling: Leave-One-Out Cross-Validation 
Summary of sample sizes: 99, 99, 99, 99, 99, 99, ... 
Resampling results:

  RMSE        Rsquared   MAE       
  0.03790195  0.9779768  0.02937452

With the loop, we may need to store the output in a list

# Fit one model per degree and keep every fit in a preallocated list.
ds <- 1:20
fitlst <- vector("list", length(ds))
for (i in seq_along(ds)) {
  print(i)
  d <- ds[i]

  # The degree is pasted into the formula text as a literal number.
  fitlst[[i]] <- train(
    as.formula(paste0("y ~ poly(x, degree =", d, ")")),
    data = training,
    method = "lm",
    trControl = fitCtrl
  )
}

-output

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
> fitlst[[4]]
Linear Regression 

100 samples
  1 predictor

No pre-processing
Resampling: Leave-One-Out Cross-Validation 
Summary of sample sizes: 99, 99, 99, 99, 99, 99, ... 
Resampling results:

  RMSE        Rsquared   MAE       
  0.03790195  0.9779768  0.02937452

Tuning parameter 'intercept' was held constant at a value of TRUE