I keep getting a message that I my data has tons of missing values although I have manually verified this not to be the case! as evident in the script I have been trying all the debugging I know (I am not particularly experienced in R) and I am at my wits end with this script! It's pretty long and ugly, but I kept the debugging comments and logging in case it might be of use. Might be a fun challange for a R-wiz or something, tbh idk, but I would greatly appreciate the help! I will add relevant information in the following order:

  1. R-script
  2. Info on my data
  3. error code output on my last run

Sorry about the absolute mess this is, I just don't know R enough to know what someone might need to understand what on earth is going on here! I greatly appreciate any help and insight into this!

In case anyone feels the immeadiate need to critique the validity of the forecasting method, I know and agree, but I needed absolute automation, besides this (with some checks after the model estimations) is sufficient for my use, it's not going in an paper, it's an alternative to the method they used previously at my job, which was just an average, not rolling average, not even season or trend corrected average, just an average average, say what you will, but this is better than the average average xD

R-script:


library(forecast)
library(tseries)
library(ggplot2)
library(zoo)
library(readxl)
library(openxlsx)


# Define the Excel file path
excel_file <- "hardcoded correct full path using /'s"

# Load the data from the Excel file
data <- read_excel(excel_file)

print(head(data))

# Define variables
data$date_var <- as.Date(paste(data$aar, data$periode, "01", sep = "-"), format = "%Y-%m-%d")

# Debug: Check the date_var format
print("Debug: Head of date_var")
print(head(data$date_var))

# Define which variables are to be forecasted and which are exogenous
forecast_vars <- c("sho_dogn", "sho_dag", "sho_polk", "antall_liggedogn", "totale_sho", "drg_dogn", "drg_dag", "drg_polk", "drg_totalt")
exog_vars <- c("nedstengt", "dato", "trend", "cortrend")

# Data Preprocessing: Insert lags for all forecast_vars
for (var in forecast_vars) {
  for (lag in 1:3) {
    new_var_name <- paste(var, "lag", lag, sep = "_")
    lagged_data <- lag(zoo(data[, var]), -lag)
    lagged_data_filled <- c(rep(NA, lag), coredata(lagged_data))
    data[, new_var_name] <- lagged_data_filled
  }
}

print(head(data))

# Check for missing data
if (any(is.na(data))) {
  print("Warning: Missing data found in the dataset.")
}
print(sum(is.na(data)))

# Data Split: Create train, bench, and future sets
# Training set: From April 2019 to the end of 2022
train <- data[(data$aar < 2023 & data$aar > 2019) | 
              (data$aar == 2019 & data$periode >= 4), ]

# Benching set: From period 1 to 9 in the year 2023
bench <- data[data$aar == 2023 & data$periode < 10, ]

# Future set: From period 10 in 2023 to the end of 2024
future_data <- data[(data$aar == 2023 & data$periode >= 10) | 
                    (data$aar == 2024), ]

# Debug: Check if train, bench, and future sets are created as expected
print("Debug: Head of the train set")
print(head(train))
print("Debug: Head of the bench set")
print(head(bench))
print("Debug: Head of the future set")
print(head(future_data))

log_msg("Starting loop through forecast variables.")
# Loop through each forecast variable
for (var in forecast_vars) {

  # Initialize lists for models and performance metrics
  models <- list()
  performances <- list()
  
  # Create a list of exogenous variables specific to this forecast variable
  # Include only the lags of the current forecast variable and the general exogenous variables
  var_lags <- grep(paste0("^", var, "_lag_"), names(train), value = TRUE)
  specific_exog_vars <- c(var_lags, exog_vars)
  
  # Remove the unnecessary 'dato' variable
  specific_exog_vars <- setdiff(specific_exog_vars, "dato")
  
  log_msg("Running first auto.arima.")
  # First run of auto.arima
  model <- auto.arima(ts(train[, var]), xreg = as.matrix(train[, specific_exog_vars]), 
                      ic = "aic", seasonal = TRUE, stepwise = TRUE, trace = FALSE)
  
  }
  print(models)

  # Remove duplicate models
  unique_models <- unique(sapply(models, function(x) capture.output(print(x))))
  models <- models[match(unique_models, sapply(models, function(x) capture.output(print(x))))]

  # Debug: Print unique models
  print("Debug: Unique models")
  print(models)

  # Continue only if the list of unique models is less than 3
  if (length(unique_models) < 3) {
    while (length(models) < 3) {
      min_p <- min(sapply(models, function(x) x$arma[1]))
      min_q <- min(sapply(models, function(x) x$arma[2]))
      
      log_msg("Starting model_q refinement.")
      for (ic in c("aic", "bic")) {
        model_q <- auto.arima(ts(train[, var]), xreg = as.matrix(train[, specific_exog_vars]), max.q = min_q - 1, 
                              ic = ic, seasonal = TRUE, stepwise = TRUE, trace = FALSE)
        log_msg("Completed model_q refinement.")
        log_msg("Starting model_p refinement.")
        model_p <- auto.arima(ts(train[, var]), xreg = as.matrix(train[, specific_exog_vars]), max.p = min_p - 1, 
                              ic = ic, seasonal = TRUE, stepwise = TRUE, trace = FALSE)
        log_msg("Completed model_p refinement.")

        # Fallback if model fitting fails
        if (is.null(model_q$coef) || is.null(model_p$coef)) {
        log_msg("model fitting failed! if (is.null(model_q$coef) || is.null(model_p$coef))")
          print("Model fitting failed.")
          next
        } else {
          bic_q <- BIC(model_q)
          bic_p <- BIC(model_p)
        }
        
        # Choose the better model based on BIC
        if (bic_q < bic_p) {
          tempmodels <- c(tempmodels, list(model_q))
        } else {
          tempmodels <- c(tempmodels, list(model_p))
        }
      }
      log_msg("Selected model base on BIC.")
      # Update the models list
      models <- c(models, tempmodels)
      
      # Clear tempmodels for the next iteration
      tempmodels <- list()
    }
  }
  
  1. Data enter image description here (there is data back to 01.01.2019, it contains valid data in valid formats for all vars) f_vars are the variables to be forecasted (f_var 5 is just about perfectly correlated with 1-3, same for 6-8 and 9)

  2. interface output:

> library(forecast)
> library(tseries)
> library(ggplot2)
> library(zoo)
> library(readxl)
> library(openxlsx)

> print(head(data))
# A tibble: 6 × 16
  seksjon month   year f_var1 f_var2 f_var3 f_var4  f_var5
  <chr>     <dbl> <dbl>    <dbl>   <dbl>    <dbl>            <dbl>      <dbl>
1 BAR           1  2019      478     681     3249             3506       4408
2 BAR           2  2019      425     530     2852             3458       3807
3 BAR           3  2019      430     614     3204             2731       4248
4 BAR           4  2019      386     458     2932             3061       3776
5 BAR           5  2019      464     627     3154             3228       4245
6 BAR           6  2019      436     511     3009             2634       3956
# ℹ 8 more variables: drg_dogn <dbl>, drg_dag <dbl>, drg_polk <dbl>,
#   drg_totalt <dbl>, nedstengt <dbl>, trend <dbl>, cortrend <dbl>, date <chr>

> print(head(data$date_var))
[1] "2019-01-01" "2019-02-01" "2019-03-01" "2019-04-01" "2019-05-01"
[6] "2019-06-01"
> 
> # Define which variables are to be forecasted and which are exogenous
> forecast_vars <- c("sho_dogn", "sho_dag", "sho_polk", "antall_liggedogn", "totale_sho", "drg_dogn", "drg_dag", "drg_polk", "drg_totalt")
> exog_vars <- c("nedstengt", "dato", "trend", "cortrend")

> print(head(data))
# A tibble: 6 × 44
  seksjon periode   aar sho_dogn sho_dag sho_polk antall_liggedogn totale_sho
  <chr>     <dbl> <dbl>    <dbl>   <dbl>    <dbl>            <dbl>      <dbl>
1 BAR           1  2019      478     681     3249             3506       4408
2 BAR           2  2019      425     530     2852             3458       3807
3 BAR           3  2019      430     614     3204             2731       4248
4 BAR           4  2019      386     458     2932             3061       3776
5 BAR           5  2019      464     627     3154             3228       4245
6 BAR           6  2019      436     511     3009             2634       3956
# ℹ 36 more variables: drg_dogn <dbl>, drg_dag <dbl>, drg_polk <dbl>,
#   drg_totalt <dbl>, nedstengt <dbl>, trend <dbl>, cortrend <dbl>, date <chr>,
#   date_var <date>, sho_dogn_lag_1 <dbl>, sho_dogn_lag_2 <dbl>,
#   sho_dogn_lag_3 <dbl>, sho_dag_lag_1 <dbl>, sho_dag_lag_2 <dbl>,
#   sho_dag_lag_3 <dbl>, sho_polk_lag_1 <dbl>, sho_polk_lag_2 <dbl>,
#   sho_polk_lag_3 <dbl>, antall_liggedogn_lag_1 <dbl>,
#   antall_liggedogn_lag_2 <dbl>, antall_liggedogn_lag_3 <dbl>, …
> 


#|| NOW THIS RIGHT HERE IS WHAT REALLY THROWS ME, WHERE ARE ALL THE MISSING #|| VALUES FROM??


[1] "Warning: Missing data found in the dataset."

> print(sum(is.na(data)))
[1] 540

> print(head(train))
# A tibble: 6 × 44
  seksjon periode   aar sho_dogn sho_dag sho_polk antall_liggedogn totale_sho
  <chr>     <dbl> <dbl>    <dbl>   <dbl>    <dbl>            <dbl>      <dbl>
1 BAR           4  2019      386     458     2932             3061       3776
2 BAR           5  2019      464     627     3154             3228       4245
3 BAR           6  2019      436     511     3009             2634       3956
4 BAR           7  2019      320     259     1133             2756       1712
5 BAR           8  2019      379     528     2475             2779       3382
6 BAR           9  2019      416     570     3122             2974       4108
# ℹ 36 more variables: drg_dogn <dbl>, drg_dag <dbl>, drg_polk <dbl>,
#   drg_totalt <dbl>, nedstengt <dbl>, trend <dbl>, cortrend <dbl>, date <chr>,
#   date_var <date>, sho_dogn_lag_1 <dbl>, sho_dogn_lag_2 <dbl>,
#   sho_dogn_lag_3 <dbl>, sho_dag_lag_1 <dbl>, sho_dag_lag_2 <dbl>,
#   sho_dag_lag_3 <dbl>, sho_polk_lag_1 <dbl>, sho_polk_lag_2 <dbl>,
#   sho_polk_lag_3 <dbl>, antall_liggedogn_lag_1 <dbl>,
#   antall_liggedogn_lag_2 <dbl>, antall_liggedogn_lag_3 <dbl>, …
> print("Debug: Head of the bench set")
[1] "Debug: Head of the bench set"
> print(head(bench))
# A tibble: 6 × 44
  seksjon periode   aar sho_dogn sho_dag sho_polk antall_liggedogn totale_sho
  <chr>     <dbl> <dbl>    <dbl>   <dbl>    <dbl>            <dbl>      <dbl>
1 BAR           1  2023      394     619     3805             2458       4818
2 BAR           2  2023      375     538     3369             2286       4282
3 BAR           3  2023      421     645     4044             3283       5110
4 BAR           4  2023      334     442     2821             2527       3597
5 BAR           5  2023      373     552     3338             2826       4263
6 BAR           6  2023      380     682     3674             3014       4736
# ℹ 36 more variables: drg_dogn <dbl>, drg_dag <dbl>, drg_polk <dbl>,
#   drg_totalt <dbl>, nedstengt <dbl>, trend <dbl>, cortrend <dbl>, date <chr>,
#   date_var <date>, sho_dogn_lag_1 <dbl>, sho_dogn_lag_2 <dbl>,
#   sho_dogn_lag_3 <dbl>, sho_dag_lag_1 <dbl>, sho_dag_lag_2 <dbl>,
#   sho_dag_lag_3 <dbl>, sho_polk_lag_1 <dbl>, sho_polk_lag_2 <dbl>,
#   sho_polk_lag_3 <dbl>, antall_liggedogn_lag_1 <dbl>,
#   antall_liggedogn_lag_2 <dbl>, antall_liggedogn_lag_3 <dbl>, …
> print("Debug: Head of the future set")
[1] "Debug: Head of the future set"
> print(head(future_data))
# A tibble: 6 × 44
  seksjon periode   aar sho_dogn sho_dag sho_polk antall_liggedogn totale_sho
  <chr>     <dbl> <dbl>    <dbl>   <dbl>    <dbl>            <dbl>      <dbl>
1 BAR          10  2023       NA      NA       NA               NA         NA
2 BAR          11  2023       NA      NA       NA               NA         NA
3 BAR          12  2023       NA      NA       NA               NA         NA
4 BAR           1  2024       NA      NA       NA               NA         NA
5 BAR           2  2024       NA      NA       NA               NA         NA
6 BAR           3  2024       NA      NA       NA               NA         NA
# ℹ 36 more variables: drg_dogn <dbl>, drg_dag <dbl>, drg_polk <dbl>,
#   drg_totalt <dbl>, nedstengt <dbl>, trend <dbl>, cortrend <dbl>, date <chr>,
#   date_var <date>, sho_dogn_lag_1 <dbl>, sho_dogn_lag_2 <dbl>,
#   sho_dogn_lag_3 <dbl>, sho_dag_lag_1 <dbl>, sho_dag_lag_2 <dbl>,
#   sho_dag_lag_3 <dbl>, sho_polk_lag_1 <dbl>, sho_polk_lag_2 <dbl>,
#   sho_polk_lag_3 <dbl>, antall_liggedogn_lag_1 <dbl>,
#   antall_liggedogn_lag_2 <dbl>, antall_liggedogn_lag_3 <dbl>, …

>   print(models)
list()

[1] "Debug: Unique models"
>   print(models)
list()

the problem leads to no models being produced and I simply just cannot for the life of me understand why, anyone got any ideas?
if you need anything from me please just ask, I will facilitate to the best of my abilities!

expected excel files with predicted values, got error messages with absurd claims of missing data!

0

There are 0 best solutions below