I am new to template model builder (TMB), and I am trying to use it to fit a simple linear regression with errors in variables as a learning exercise. My ultimate goal is to apply this to more complex models. I am using the formulation:
and the followng code to simulate data and generate the analysis:
#
# The following code examines the error in variables case of simple linear
# regression using maximum likelihood.
#
#-------------- Prepare Workspace: clear and load packages --------------------
#
rm(list=ls())
library(MASS) # for mvnorm()
library(TMB) # for Template Model Builder (TMB)
library(tidyverse)
#-------------------- Define model parameters ---------------------------------
#
model_name <-"eiv_regression"
set.seed(42) # random number seed to fix the sequence for replication
b0 <- 10.0 # regression intercept
b1 <- 2.0 # regression slope
sigma_Y <- 5.0 # observation error on Y
lambda <- 1 # observation error ratio: sigma_X/sigma_Y
# sigma_X = lambda * sigma_Y
N <- 1000 # number of observations (use lots to test)
#-------------------- Create Simulated Data Set -------------------------------
#
# simulate observation errors in x and Y
epsilon <- rnorm(N,0,sigma_Y) # errors in Y
delta <- rnorm(N,0,(lambda*sigma_Y)) # errors in X
# Simulate observed values of predictor variable
X_true <- runif(N,20,80)
X_obs <- X_true + delta
# Simulate observations as "true" value + Y error
Y_true <- b0 + b1*X_true
Y_obs <- Y_true + epsilon
# Make sure our estimates look like we think they should
plot(X_obs,Y_obs,pch=19,col="grey")
# Add the line estimated if we ignore errors in variables
lm_model <- lm(Y_obs~X_obs)
abline(b0,b1,lwd=3)
abline(coef(lm_model)[1],coef(lm_model)[2], lwd=3, lty=2)
# Print "naive" parameter estimates
estTable <- as.data.frame(summary(lm_model)$coefficients)
rownames(estTable)[2] <- "X"
estTable$True <- c(b0,b1)
estTable <- estTable %>%
select(True,Estimate,`Std. Error`)
estTable
# autocorrelation in residuals?
acf(resid(lm_model))
#------------- Define Template Model as .CPP file -----------------------------
cat("
/*---------------------- eiv_regression.cpp -- -----------------------------*/
#include <TMB.hpp>
using Eigen::SparseMatrix;
using namespace density;
template<class Type>
Type objective_function<Type>::operator() ()
{
// Data
// Note: Transformations of raw data are performed in R
DATA_VECTOR(Y); // Response
DATA_VECTOR(X); // Predictor
DATA_SCALAR(lambda) // ratio sigma_X/sigma_Y as known
// Book keeping
size_t N = Y.size(); // number of observations
// NOTE: use size_t for array indices
// Parameters
PARAMETER_VECTOR(beta); // coefficients (intercept, slope)
PARAMETER(log_sigma_Y); // log sd for measurement errors of Y
// logged to force positive values
PARAMETER_VECTOR(del); // vector of random observation errors for
// predictor sigma_X
// Transform parameters to desired values for use and reporting
Type sigma_Y = exp(log_sigma_Y); // inverse log
Type sigma_X = sigma_Y * lambda;
// Initialize negative log likelihood (to minimize as optimization goal)
Type nll = 0.;
// X error: likelihood contribution of observation error in X
nll -= sum(dnorm(del, Type(0), sigma_X, true));
// PROCESS: predictions calculated as model (with predictor error)
// Note: TMB array indices start at 0, R arrays start at 1
vector<Type> Y_hat(N);
for (size_t n=0; n<N; n++){ // loop through all obs
Y_hat(n) = beta(0) + beta(1)*(X(n) - del(n));
}
// OBSERVATION: likelihood contribution of observed data calculated as
// predictions + observation errors. e_i ~ N(0,sigma_Y)
for (size_t n=0; n<N; n++){ // loop through all obs
nll -= dnorm(Y(n), Y_hat(n), sigma_Y, true);
}
// Parameters to report to R, including Std. Error estimates based
// on the Hessian and Delta Method
ADREPORT(beta); // regression coefficients
ADREPORT(sigma_Y); // sd estimate Y
ADREPORT(sigma_X); // sd estimate X
ADREPORT(Y_hat); // predicted values and their st.dev.
return nll; // return the negative log likelihood for the
// specific set of parameter values.
}
/*---------------------------------------------------------------------------*/
",file="eiv_regression.cpp")
#------------- Estimate model parameters via TMB ------------------------------
#
# Remove previous object file and DLL if present
if (file.exists(paste0(model_name, ".o"))) {
file.remove(paste0(model_name,".o"))
}
if (file.exists(paste0(model_name, ".dll"))) {
file.remove(paste0(model_name,".dll"))
}
#
# Compile Model
compile(paste0(model_name, ".cpp"))
#
# Load compiled model
dyn.load(dynlib(model_name))
#
# Define data list
Data <- list(Y=as.vector(Y_obs), # Observed Response
X=as.vector(X_obs), # Observed Predictor
lambda=lambda) # sigma_X/sigma_Y assumption
# Define parameter list with initial values
Params <- list(beta=c(0,0), # Regression coefficients
log_sigma_Y=0, # Y Observation Error (log)
del=rep(0,N)) # errors in X
# Construct TMB function for optimization
Obj <- MakeADFun(data=Data,
parameters=Params,
DLL=model_name,
silent=T)
Obj$env$tracemgc <- FALSE
Obj$env$inner.control$trace <- FALSE
# Perform Optimization
Opt <- nlminb(start=Obj$par, objective=Obj$fn, gradient=Obj$gr,
control=list(iter.max=1000, eval.max=500))
if(Opt$convergence != 0){
print("Model had partial or non_convergence!")
}
#
# Likelihood profiles to assess estimation performance
par(mfrow=c(3,1))
nEstPar <- 3
for (i in 1:nEstPar) {
plot(tmbprofile(Obj,i, trace=0))
}
#
# Produce Report
Report <- sdreport(Obj)
ReportTable <- as.data.frame(summary(Report,"report")[1:4,])
rownames(ReportTable)[1:2] <- c("intercept","slope")
# Add CIs
ReportTable$CI95low <- ReportTable[,1] - ReportTable[,2]*1.96
ReportTable$CI95high <- ReportTable[,1] + ReportTable[,2]*1.96
# Add true values
ReportTable$True <- c(b0,b1,sigma_Y,lambda*sigma_Y)
# Add OLS values
OLS <- summary(lm(Y_obs~X_obs))
ReportTable$OLS <- c(OLS$coefficients[1:2,1],OLS$sigma,NA)
options(digits=3)
ReportTable
#
# Housekeeping: Unload model *.dll when done
dyn.unload(dynlib(model_name))
The "true"slope is 2. When I run this code I get an improvement fro the OLS slope estimate of 1.81, but it is still less than 2 (C.I. 95% = 1.92 to 1.98). I am not sure why this bias remains, and thought I would try to ask the community. I have placed my question in Stack Overflow as I suspect it is a coding error. Any suggestions are appreciated.