I am using RSQLite
, DBI
, dbplyr
, and sqldf
packages.
Here are the packages:
library(dbplyr)
library(RSQLite)
library(DBI)
library(sqldf)
library(tidyverse)
First I'm using mtcars
dataset with the rownames included.
mtcars <- tibble::as_tibble(mtcars, rownames = 'car')
I also created my own table which I combined with a left_join.
mtcars %>% dplyr::left_join(cars_origin_tbl, by = 'car') -> mtcars
car <- c("Mazda RX4", "Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C", "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", "Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic", "Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin", "Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2", "Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora", "Volvo 142E")
origin <- c("Japan", "Japan", "Japan", "United States", "United States", "United States", "United States", "Germany", "Germany", "Germany", "Germany", "Germany", "Germany", "Germany", "United States", "United States", "United States", "Italy", "Japan", "Japan", "Japan", "United States", "United States", "United States", "United States", "Italy", "Germany", "British", "United States", "Italy", "Italy", "Sweden")
cars_origin_tbl <- tibble(car, origin)
Now I establish the connection to SQLite. I create a table using dplyr
methods and I see what that looks like as SQL.
sql_mtcars <- mtcars
con <- RSQLite::dbConnect(SQLite(), ":memory:")
dplyr::copy_to(con, sql_mtcars)
tbl(con, "sql_mtcars") %>%
dplyr::select(car, origin, mpg, disp, hp, drat, wt, qsec) %>%
pivot_longer(names_to = 'names', values_to = 'values', 4:8) %>%
group_by(names) %>%
summarize(sd = sd(values)) %>%
dplyr::show_query()
Here is what that table would look like if I used a dataframe and did not use show_query:
And here is what the SQL looks like from the show_query
function:
I can copy and paste that query into either dbGetQuery or sqldf and in this case they both work the same.
DBI::dbGetQuery(con, "
SELECT `names`, STDEV(`values`) AS `sd`
FROM (SELECT `car`, `origin`, `mpg`, 'disp' AS `names`, `disp` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'hp' AS `names`, `hp` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'drat' AS `names`, `drat` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'wt' AS `names`, `wt` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'qsec' AS `names`, `qsec` AS `values`
FROM `sql_mtcars`)
GROUP BY `names`
")
sqldf("
SELECT `names`, STDEV(`values`) AS `sd`
FROM (SELECT `car`, `origin`, `mpg`, 'disp' AS `names`, `disp` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'hp' AS `names`, `hp` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'drat' AS `names`, `drat` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'wt' AS `names`, `wt` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'qsec' AS `names`, `qsec` AS `values`
FROM `sql_mtcars`)
GROUP BY `names`
")
Using dbGetQuery
or sqldf
gives me back the same table that I included a picture of, above.
Ok, but now I go on to a totally query with the same workflow.
con <- RSQLite::dbConnect(SQLite(), ":memory:")
dplyr::copy_to(con, sql_mtcars)
dbListTables(con)
tbl(con, "sql_mtcars") %>%
group_by(origin) %>%
summarize(mean = round(mean(mpg), 1),
sd = round(sd(mpg),1),
skew = round(3 * ( ( mean(mpg) - median(mpg) ) / sd(mpg) ),1),
kurtosis = round(sum((mpg - mean(mpg)) ^ 4) / 32 / (var(mpg) ^ (2)),1)) %>%
arrange(desc(mean)) %>%
dplyr::show_query()
Here is what that table would look like if I used a dataframe and did not use show_query
:
In this case I can see the SQL from show_query()
But I can't make it compile with either dbGetQuery
or sqldf
.
DBI::dbGetQuery(con, "
SELECT `origin`, ROUND(AVG(`mpg`), 1) AS `mean`, ROUND(STDEV(`mpg`), 1) AS `sd`, ROUND(3.0 * ((AVG(`mpg`) - MEDIAN(`mpg`)) / STDEV(`mpg`)), 1) AS `skew`, ROUND(SUM(POWER((`mpg` - AVG(`mpg`)), 4.0)) / 32.0 / (POWER(VARIANCE(`mpg`), (2.0))), 1) AS `kurtosis`
FROM `sql_mtcars`
GROUP BY `origin`
ORDER BY `mean` DESC
")
sqldf("
SELECT `origin`, ROUND(AVG(`mpg`), 1) AS `mean`, ROUND(STDEV(`mpg`), 1) AS `sd`, ROUND(3.0 * ((AVG(`mpg`) - MEDIAN(`mpg`)) / STDEV(`mpg`)), 1) AS `skew`, ROUND(SUM(POWER((`mpg` - AVG(`mpg`)), 4.0)) / 32.0 / (POWER(VARIANCE(`mpg`), (2.0))), 1) AS `kurtosis`
FROM `sql_mtcars`
GROUP BY `origin`
ORDER BY `mean` DESC
")
In both cases, I get this error message:
Error: misuse of aggregate function AVG()
I'm not sure what the issue is. I think it might be one of three general problems:
- Maybe the SQL query form
show_query()
incorrect in some way? I just copied and pasted what arrived in my console back into the function. - Maybe
dbGetQuery
andsqldf
have bugs in them, and aren't perfect? - Maybe it has something to do with
NA
values in this second dataframe?