Plot Spline in R

84 Views Asked by At

I want to plot a spline in R, but I get the error message "'newdata' has 38 rows, but the variables found have 700 rows". I am not sure why the spline has so many rows; it clearly has too many.

My professor told me that I should work with the functions lm() and bs(). That is why I am using them.

I hope for your help :-)

Bonus question: Can I make a 3D plot that shows how all three variables (sales, visits and discount) are related to each other?

--

Here is my code; I hope I am not missing anything. I still do not understand how to turn a spline model (spline_model) into a plotted line. Why do I need to predict the data again with seq and predict (as ChatGPT and some sources on the internet told me)? Why can I not use the spline_model itself for plotting?

library(dplyr) #Datenmanipulation
library(DataExplorer) #DatenExplore
library(skimr) #Datenüberblick
library(lmtest) #Regression
library(olsrr)
library(jtools)
library(moments)
library(highcharter) #Diagramme keine gewerbliche Nutzung
library(ggstatsplot)
library(ggplot2)
library(graphics) 
library(mgcv) #spline
library(ggeffects) #GAM <- Spline
library(DiagrammeR) #Kausalitätsdiagramm
library(splines) #Splines

# Load the sales data from GitHub and keep only the three columns of interest.
data_path <- "https://raw.githubusercontent.com/juanitorduz/website_projects/master/data/sales_dag.csv"
data <- read.csv(data_path)
data %>% select(visits, discount, sales) -> data_clean
# Fit sales on a B-spline basis of visits plus discount, without an intercept.
# NOTE: the formula refers to data_clean$visits / data_clean$discount instead
# of the bare column names; this is what triggers the reported
# "'newdata' has 38 rows, but the variables found have 700 rows" error in the
# predict() call below.
spline_model <- lm(sales ~ bs(data_clean$visits, df = 3, knots = 3)+ data_clean$discount -1, data = data_clean) # spline

visits_range <- range(data_clean$visits) # min and max of the variable
visits_seq <- seq(from=visits_range[1], to=visits_range[2]) # new sequence from min to max
# newdata only supplies a `visits` column; no `discount` value is given.
spline_model_pred<-predict(spline_model, newdata = data.frame(visits=visits_seq),se=T) # recompute the prediction

# Scatter plot of the observed data, then overlay the predicted curve in red.
plot(data_clean$visits, data_clean$sales, main="Beobachtete vs. Vorhergesagte Werte", xlab="Visits (IV)", ylab="Sales (DV)")
lines(visits_seq, spline_model_pred$fit, col = "red", lwd = 3)
3

There are 3 best solutions below

2
tt33tt On BEST ANSWER

Update. Everything solved with your help.

#Not every library is needed
library(dplyr) #Datenmanipulation
library(DataExplorer) #DatenExplore
library(skimr) #Datenüberblick
library(lmtest) #Regression
library(olsrr)
library(jtools)
library(moments)
library(highcharter) #Diagramme keine gewerbliche Nutzung
library(ggstatsplot)
library(ggplot2)
library(graphics) 
library(mgcv) #spline
library(ggeffects) #GAM <- Spline
library(DiagrammeR) #Kausalitätsdiagramm
library(splines) #Splines

# Laden der erforderlichen Bibliotheken für 3D
library(mgcv)
library(plot3D)
library(scatterplot3d)
library("rgl")
    
    # Spline-Modell erstellen
# Fit sales on a B-spline basis of visits plus a linear discount term,
# without an intercept (`- 1`). Columns are referenced by bare name so that
# predict() can find them in `newdata`.
# NOTE(review): per the splines::bs() documentation, `df` is only used when
# `knots` is not supplied, so `knots = 3` determines the basis here -- confirm
# that an interior knot at visits == 3 is what is intended.
spline_model <- lm(
  sales ~ bs(visits, df = 3, knots = 3) + discount - 1,
  data = data_clean
)

# Build a regular 100 x 100 grid spanning the observed range of both
# predictors.
visits_range <- range(data_clean$visits)
discount_range <- range(data_clean$discount)
visits_seq <- seq(
  from = visits_range[1], to = visits_range[2], length.out = 100
)
discount_seq <- seq(
  from = discount_range[1], to = discount_range[2], length.out = 100
)
visits_discount_grid <- expand.grid(
  visits = visits_seq,
  discount = discount_seq
)

# Predict on the grid. The argument is spelled out as `se.fit` rather than
# relying on partial matching of `se`.
spline_model_pred <- predict(
  spline_model,
  newdata = visits_discount_grid,
  se.fit = TRUE
)

# 3D scatter plot of the raw data
plot3d(
  data_clean$visits, data_clean$discount, data_clean$sales,
  xlab = "Visits", ylab = "Discount", zlab = "Sales",
  main = "3D-Modell der Spline-Funktion und Rohdaten",
  type = "s", col = "red", size = 2
)

# Add the fitted spline as a surface. expand.grid() varies its first factor
# (visits) fastest, so filling the matrix column by column (the default)
# yields rows indexed by visits and columns indexed by discount.
spline_surface <- matrix(
  spline_model_pred$fit,
  nrow = length(visits_seq),
  ncol = length(discount_seq)
)
surface3d(visits_seq, discount_seq, spline_surface, col = "blue")

# Add a legend
legend3d(
  "topright",
  legend = c("Spline-Funktion", "Rohdaten"),
  col = c("blue", "red"),
  pch = c(NA, 16), lty = c(1, NA), lwd = c(4, NA), cex = 2
)

This is my current result

7
Dave2e On

Since you are passing the "data" argument to the lm() function, you should refer to the columns by their bare names in the formula:

# Spline model: columns are referenced by bare name and resolved via `data`
spline_model <- lm(
  sales ~ bs(visits, df = 3, knots = 3) + discount - 1,
  data = data_clean
)

As rawr points out in the comments, the error in the predict() function occurred because it was looking for the variable "data_clean$visits", which lives in the global environment, and not for the "visits" column in the "newdata" data frame.

For the prediction: in the raw data, discount is either 0 or 1, so you need to perform the prediction for both values.

# Predict sales along visits_seq once for each discount value (0 and 1).
# `se.fit = TRUE` is written out in full: `T` can be reassigned, and `se`
# only works through partial argument matching.
spline_model_pred_0 <- predict(
  spline_model,
  newdata = data.frame(visits = visits_seq, discount = 0),
  se.fit = TRUE
)
spline_model_pred_1 <- predict(
  spline_model,
  newdata = data.frame(visits = visits_seq, discount = 1),
  se.fit = TRUE
)

# Overlay both fitted curves on the existing plot:
# blue for discount = 0, red for discount = 1.
lines(x = visits_seq, y = spline_model_pred_0$fit, col = "blue")
lines(x = visits_seq, y = spline_model_pred_1$fit, col = "red")

enter image description here

0
Daniel On

For the predicted values, you can simply use predict_response() from the ggeffects package:

library(splines)
library(ggeffects)

# Read the sales data and keep only the three columns used by the model.
data_path <- "https://raw.githubusercontent.com/juanitorduz/website_projects/master/data/sales_dag.csv"
data <- datawizard::data_read(data_path)
data_clean <- datawizard::data_select(data, c("visits", "discount", "sales"))

# B-spline basis for visits plus discount, no intercept.
spline_model <- lm(
  sales ~ bs(visits, df = 3, knots = 3) + discount - 1,
  data = data_clean
)

# Compute predictions for visits and discount and plot them together with
# the raw data points.
spline_model |>
  predict_response(c("visits", "discount")) |>
  plot(show_data = TRUE)
#> Data points may overlap. Use the `jitter` argument to add some amount of
#>   random variation to the location of data points and avoid overplotting.

Created on 2024-03-28 with reprex v2.1.0