R and SQL - misuse of aggregate function AVG() but SQL inconsistent backwards and forwards

97 Views Asked by At

I am using RSQLite, DBI, dbplyr, and sqldf packages.

Here are the packages:

library(dbplyr)
library(RSQLite)
library(DBI)
library(sqldf)
library(tidyverse)

First I'm using mtcars dataset with the rownames included.

mtcars <- tibble::as_tibble(mtcars, rownames = 'car')  

I also created my own table which I combined with a left_join.

mtcars %>% dplyr::left_join(cars_origin_tbl, by = 'car') -> mtcars

car <- c("Mazda RX4", "Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C", "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", "Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic", "Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin", "Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2", "Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora", "Volvo 142E")

origin <- c("Japan", "Japan", "Japan", "United States", "United States", "United States", "United States", "Germany", "Germany", "Germany", "Germany", "Germany", "Germany", "Germany", "United States", "United States", "United States", "Italy", "Japan", "Japan", "Japan", "United States", "United States", "United States", "United States", "Italy", "Germany", "British", "United States", "Italy", "Italy", "Sweden")

cars_origin_tbl <- tibble(car, origin)

Now I establish the connection to SQLite. I create a table using dplyr methods and I see what that looks like as SQL.

sql_mtcars <- mtcars

con <- RSQLite::dbConnect(SQLite(), ":memory:")
dplyr::copy_to(con, sql_mtcars)

tbl(con, "sql_mtcars") %>%
  dplyr::select(car, origin, mpg, disp, hp, drat, wt, qsec) %>%
  pivot_longer(names_to = 'names', values_to = 'values', 4:8) %>%
  group_by(names) %>%
  summarize(sd = sd(values)) %>%
  dplyr::show_query()

Here is what that table would look like if I used a dataframe and did not use show_query:

enter image description here

And here is what the SQL looks like from the show_query function:

enter image description here

I can copy and paste that query into either dbGetQuery or sqldf and in this case they both work the same.

DBI::dbGetQuery(con, "
SELECT `names`, STDEV(`values`) AS `sd`
FROM (SELECT `car`, `origin`, `mpg`, 'disp' AS `names`, `disp` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'hp' AS `names`, `hp` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'drat' AS `names`, `drat` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'wt' AS `names`, `wt` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'qsec' AS `names`, `qsec` AS `values`
FROM `sql_mtcars`)
GROUP BY `names`
  ")

sqldf("
SELECT `names`, STDEV(`values`) AS `sd`
FROM (SELECT `car`, `origin`, `mpg`, 'disp' AS `names`, `disp` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'hp' AS `names`, `hp` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'drat' AS `names`, `drat` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'wt' AS `names`, `wt` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'qsec' AS `names`, `qsec` AS `values`
FROM `sql_mtcars`)
GROUP BY `names`
      ")

Using dbGetQuery or sqldf gives me back the same table that I included a picture of, above.

Ok, but now I go on to a totally query with the same workflow.

con <- RSQLite::dbConnect(SQLite(), ":memory:")
dplyr::copy_to(con, sql_mtcars)

dbListTables(con)

tbl(con, "sql_mtcars") %>%
  group_by(origin) %>%
  summarize(mean = round(mean(mpg), 1), 
            sd = round(sd(mpg),1),
            skew = round(3 * ( ( mean(mpg) - median(mpg) )  / sd(mpg) ),1),
            kurtosis = round(sum((mpg - mean(mpg)) ^ 4) / 32 / (var(mpg) ^ (2)),1)) %>%
  arrange(desc(mean)) %>%
  dplyr::show_query()

Here is what that table would look like if I used a dataframe and did not use show_query:

enter image description here

In this case I can see the SQL from show_query()

enter image description here

But I can't make it compile with either dbGetQuery or sqldf.

DBI::dbGetQuery(con, "
SELECT `origin`, ROUND(AVG(`mpg`), 1) AS `mean`, ROUND(STDEV(`mpg`), 1) AS `sd`, ROUND(3.0 * ((AVG(`mpg`) - MEDIAN(`mpg`)) / STDEV(`mpg`)), 1) AS `skew`, ROUND(SUM(POWER((`mpg` - AVG(`mpg`)), 4.0)) / 32.0 / (POWER(VARIANCE(`mpg`), (2.0))), 1) AS `kurtosis`
FROM `sql_mtcars`
GROUP BY `origin`
ORDER BY `mean` DESC
  ")

sqldf("
SELECT `origin`, ROUND(AVG(`mpg`), 1) AS `mean`, ROUND(STDEV(`mpg`), 1) AS `sd`, ROUND(3.0 * ((AVG(`mpg`) - MEDIAN(`mpg`)) / STDEV(`mpg`)), 1) AS `skew`, ROUND(SUM(POWER((`mpg` - AVG(`mpg`)), 4.0)) / 32.0 / (POWER(VARIANCE(`mpg`), (2.0))), 1) AS `kurtosis`
FROM `sql_mtcars`
GROUP BY `origin`
ORDER BY `mean` DESC
      ")

In both cases, I get this error message:

Error: misuse of aggregate function AVG()

I'm not sure what the issue is. I think it might be one of three general problems:

  • Maybe the SQL query form show_query() incorrect in some way? I just copied and pasted what arrived in my console back into the function.
  • Maybe dbGetQuery and sqldf have bugs in them, and aren't perfect?
  • Maybe it has something to do with NA values in this second dataframe?
0

There are 0 best solutions below