无法使用 MonetDB 和 MonetDBLite 在 R 中组合大型调查数据帧
Unable to combine large survey dataframes in R using MonetDB and MonetDBLite
我正在尝试连接或绑定 12 年的调查数据,每个调查数据都有几百万个观察值和一百个左右的变量,以便在考虑调查结构的同时分析年度趋势。最近发布的适用于 R 的 MonetDBLite 似乎是为这个大型数据集实施基于列的 DBMS 的绝佳机会。不过,我在合并数据集时遇到了困难。
简而言之,先用 plyr::rbind.fill 合并、再用 DBI::dbWriteTable 写入,得到的是被截断的表。rbind.fill 很理想,因为各年份数据集的变量并不相同。使用 dplyr::rbind_all 似乎根本不起作用。另一种不太理想(但仍可接受)的方法是删除部分列使各表结构一致,然后直接用 SQL 的 UNION ALL 操作合并,但这会返回语法错误。
我对 dplyr 和 MonetDB 很陌生,所以这肯定是由于我自己的无知,但我已经花了几天时间在网上搜索但没有运气,所以任何帮助将不胜感激。
以下是一个可重现的示例。
干杯
查尔斯
# install.packages("MonetDB.R", repos="http://dev.monetdb.org/Assets/R/")
# install.packages("MonetDBLite", repos="http://dev.monetdb.org/Assets/R/")
library(MonetDBLite)
library(MonetDB.R)
library(dplyr)
library(plyr)
library(zoo)
# Two variants of mtcars whose column sets differ by one variable:
# mtcars1 drops column 11, mtcars2 drops column 10.
mtcars1 <- mtcars[, -11]
mtcars2 <- mtcars[, -10]

# Repeat the rows of each variant 200000 times to get large data frames.
mtcars1 <- coredata(mtcars1)[rep(seq_len(nrow(mtcars1)), times = 200000), ]
mtcars2 <- coredata(mtcars2)[rep(seq_len(nrow(mtcars2)), times = 200000), ]

# Connect through DBI to an embedded MonetDB database located in a
# temporary directory.
dbdir <- tempdir()
con <- dbConnect(MonetDB.R(), embedded = dbdir)

# Store both data frames as MonetDB tables, then list the tables.
dbWriteTable(con, name = "mtcars1", value = mtcars1)
dbWriteTable(con, name = "mtcars2", value = mtcars2)
dbListTables(con)

# dplyr source on the same database directory, plus one tbl per table.
ms <- src_monetdb(embedded = dbdir)
mt1 <- tbl(ms, "mtcars1")
mt2 <- tbl(ms, "mtcars2")
# try plyr::rbind.fill to concatenate tables
# NOTE(review): mt2 is passed as the second argument of as.data.frame(), not
# as a second table to rbind.fill(), so rbind.fill() receives a single data
# frame here and mt2 is never combined with mt1.
dbWriteTable(con, "mt_1_2", rbind.fill(as.data.frame(mt1, mt2)))
# Warning message:
# Only first 100,000 results retrieved. Use n = -1 to retrieve all.
# NOTE(review): figure in the warning aligned with the 1e+05 row count
# recorded below - confirm against the original console output.
# Count the rows of the table that was just written.
dbGetQuery(con, "SELECT COUNT(*) FROM mt_1_2 " )
# L1
# 1 1e+05
dbRemoveTable(con, "mt_1_2") # remove table to re-try
# try dbFetch(res, n=-1) to retrieve all results
# The recorded error below shows dbFetch() was handed a logical (the value of
# dbWriteTable()) rather than a query result, so this call cannot lift the
# row cap either.
dbFetch(dbWriteTable(con, "mt_1_2", rbind.fill(as.data.frame(mt1, mt2))), n=-1)
# Error in (function (classes, fdef, mtable) :
# unable to find an inherited method for function ‘dbFetch’ for signature ‘"logical", "numeric"’
# In addition: Warning message:
# Only first 100,000 results retrieved. Use n = -1 to retrieve all.
dbRemoveTable(con, "mt_1_2") # remove table to re-try
# Make the two tables' columns identical so they can be combined with
# UNION ALL: list the fields of each table, then drop the one column each
# table has that the other lacks.
dbListFields(con, "mtcars1") # remove fields to make table columns identical
dbListFields(con, "mtcars2")
dbGetQuery(con, "
ALTER TABLE mtcars1
DROP COLUMN gear
")
dbGetQuery(con, "
ALTER TABLE mtcars2
DROP COLUMN carb
")
# Attempt to create the combined table from a UNION ALL query. The statement
# uses CREATE TABLE ... AS, exactly as echoed in the recorded error below.
# It fails because the server expects a trailing WITH clause; the form
# MonetDB accepts is: CREATE TABLE <name> AS <query> WITH DATA
dbGetQuery(con,
"CREATE TABLE mt_1_2 AS
Select * FROM mtcars1
UNION ALL
Select * FROM mtcars2")
# Error in .local(conn, statement, ...) :
# Unable to execute statement 'CREATE TABLE mt_1_2 AS
# Select * FROM mtcars1
# UNION ALL
# Select * FROM mtcars2'.
# Server says 'syntax error, unexpected SCOLON, expecting WITH in: "create table mt_1_2 as
# select * from mtcars1
# union all
# select * from mtcars2"
# ' [#42000].
您可以继续使用 dplyr,
并使用 rbind_list(在较新版本的 dplyr 中已由 bind_rows 取代):
library(MonetDB.R)
library(MonetDBLite)
library(dplyr)
# Build two mtcars variants with different column sets (one drops column 11,
# the other column 10) and repeat the rows of each 10000 times - a smaller
# size than in the question.
mtcars1 <- mtcars[, -11]
mtcars1 <- mtcars1[rep(seq_len(nrow(mtcars1)), times = 10000), ]
mtcars2 <- mtcars[, -10]
mtcars2 <- mtcars2[rep(seq_len(nrow(mtcars2)), times = 10000), ]

# Row counts of the enlarged data frames.
nrow(mtcars1)
## [1] 320000
nrow(mtcars2)
## [1] 320000

# Connect to an embedded MonetDB database located in a temporary directory.
dbdir <- tempdir()
con <- dbConnect(MonetDB.R(), embedded = dbdir)

# Write both data frames as MonetDB tables and list the tables.
dbWriteTable(con, "mtcars1", mtcars1)
dbWriteTable(con, "mtcars2", mtcars2)
dbListTables(con)

# dplyr source on the same database directory, plus one tbl per table.
ms <- src_monetdb(embedded = dbdir)
mt1 <- tbl(ms, "mtcars1")
mt2 <- tbl(ms, "mtcars2")
# Combine both tables in R and write the result back as table "mt_1_2".
# `n = -1` must be passed to as.data.frame() to retrieve all rows of each tbl
# instead of only the first part of the result.
# bind_rows() is used in place of the deprecated rbind_list(); it also matches
# columns by name and fills columns missing from one input with NA.
dbWriteTable(con, "mt_1_2", bind_rows(as.data.frame(mt1, n = -1),
                                      as.data.frame(mt2, n = -1)))
# Row count of the combined table: both inputs have 320000 rows.
dbGetQuery(con, "SELECT COUNT(*) FROM mt_1_2")
## L1
## 1 640000
我正在尝试连接或绑定 12 年的调查数据,每个调查数据都有几百万个观察值和一百个左右的变量,以便在考虑调查结构的同时分析年度趋势。最近发布的适用于 R 的 MonetDBLite 似乎是为这个大型数据集实施基于列的 DBMS 的绝佳机会。不过,我在合并数据集时遇到了困难。
简而言之,先用 plyr::rbind.fill 合并、再用 DBI::dbWriteTable 写入,得到的是被截断的表。rbind.fill 很理想,因为各年份数据集的变量并不相同。使用 dplyr::rbind_all 似乎根本不起作用。另一种不太理想(但仍可接受)的方法是删除部分列使各表结构一致,然后直接用 SQL 的 UNION ALL 操作合并,但这会返回语法错误。
我对 dplyr 和 MonetDB 很陌生,所以这肯定是由于我自己的无知,但我已经花了几天时间在网上搜索但没有运气,所以任何帮助将不胜感激。
以下是一个可重现的示例。
干杯
查尔斯
# install.packages("MonetDB.R", repos="http://dev.monetdb.org/Assets/R/")
# install.packages("MonetDBLite", repos="http://dev.monetdb.org/Assets/R/")
library(MonetDBLite)
library(MonetDB.R)
library(dplyr)
library(plyr)
library(zoo)
# Two variants of mtcars whose column sets differ by one variable:
# mtcars1 drops column 11, mtcars2 drops column 10.
mtcars1 <- mtcars[, -11]
mtcars2 <- mtcars[, -10]

# Repeat the rows of each variant 200000 times to get large data frames.
mtcars1 <- coredata(mtcars1)[rep(seq_len(nrow(mtcars1)), times = 200000), ]
mtcars2 <- coredata(mtcars2)[rep(seq_len(nrow(mtcars2)), times = 200000), ]

# Connect through DBI to an embedded MonetDB database located in a
# temporary directory.
dbdir <- tempdir()
con <- dbConnect(MonetDB.R(), embedded = dbdir)

# Store both data frames as MonetDB tables, then list the tables.
dbWriteTable(con, name = "mtcars1", value = mtcars1)
dbWriteTable(con, name = "mtcars2", value = mtcars2)
dbListTables(con)

# dplyr source on the same database directory, plus one tbl per table.
ms <- src_monetdb(embedded = dbdir)
mt1 <- tbl(ms, "mtcars1")
mt2 <- tbl(ms, "mtcars2")
# try plyr::rbind.fill to concatenate tables
# NOTE(review): mt2 is passed as the second argument of as.data.frame(), not
# as a second table to rbind.fill(), so rbind.fill() receives a single data
# frame here and mt2 is never combined with mt1.
dbWriteTable(con, "mt_1_2", rbind.fill(as.data.frame(mt1, mt2)))
# Warning message:
# Only first 100,000 results retrieved. Use n = -1 to retrieve all.
# NOTE(review): figure in the warning aligned with the 1e+05 row count
# recorded below - confirm against the original console output.
# Count the rows of the table that was just written.
dbGetQuery(con, "SELECT COUNT(*) FROM mt_1_2 " )
# L1
# 1 1e+05
dbRemoveTable(con, "mt_1_2") # remove table to re-try
# try dbFetch(res, n=-1) to retrieve all results
# The recorded error below shows dbFetch() was handed a logical (the value of
# dbWriteTable()) rather than a query result, so this call cannot lift the
# row cap either.
dbFetch(dbWriteTable(con, "mt_1_2", rbind.fill(as.data.frame(mt1, mt2))), n=-1)
# Error in (function (classes, fdef, mtable) :
# unable to find an inherited method for function ‘dbFetch’ for signature ‘"logical", "numeric"’
# In addition: Warning message:
# Only first 100,000 results retrieved. Use n = -1 to retrieve all.
dbRemoveTable(con, "mt_1_2") # remove table to re-try
# Make the two tables' columns identical so they can be combined with
# UNION ALL: list the fields of each table, then drop the one column each
# table has that the other lacks.
dbListFields(con, "mtcars1") # remove fields to make table columns identical
dbListFields(con, "mtcars2")
dbGetQuery(con, "
ALTER TABLE mtcars1
DROP COLUMN gear
")
dbGetQuery(con, "
ALTER TABLE mtcars2
DROP COLUMN carb
")
# Attempt to create the combined table from a UNION ALL query. The statement
# uses CREATE TABLE ... AS, exactly as echoed in the recorded error below.
# It fails because the server expects a trailing WITH clause; the form
# MonetDB accepts is: CREATE TABLE <name> AS <query> WITH DATA
dbGetQuery(con,
"CREATE TABLE mt_1_2 AS
Select * FROM mtcars1
UNION ALL
Select * FROM mtcars2")
# Error in .local(conn, statement, ...) :
# Unable to execute statement 'CREATE TABLE mt_1_2 AS
# Select * FROM mtcars1
# UNION ALL
# Select * FROM mtcars2'.
# Server says 'syntax error, unexpected SCOLON, expecting WITH in: "create table mt_1_2 as
# select * from mtcars1
# union all
# select * from mtcars2"
# ' [#42000].
您可以继续使用 dplyr,
并使用 rbind_list(在较新版本的 dplyr 中已由 bind_rows 取代):
library(MonetDB.R)
library(MonetDBLite)
library(dplyr)
# Build two mtcars variants with different column sets (one drops column 11,
# the other column 10) and repeat the rows of each 10000 times - a smaller
# size than in the question.
mtcars1 <- mtcars[, -11]
mtcars1 <- mtcars1[rep(seq_len(nrow(mtcars1)), times = 10000), ]
mtcars2 <- mtcars[, -10]
mtcars2 <- mtcars2[rep(seq_len(nrow(mtcars2)), times = 10000), ]

# Row counts of the enlarged data frames.
nrow(mtcars1)
## [1] 320000
nrow(mtcars2)
## [1] 320000

# Connect to an embedded MonetDB database located in a temporary directory.
dbdir <- tempdir()
con <- dbConnect(MonetDB.R(), embedded = dbdir)

# Write both data frames as MonetDB tables and list the tables.
dbWriteTable(con, "mtcars1", mtcars1)
dbWriteTable(con, "mtcars2", mtcars2)
dbListTables(con)

# dplyr source on the same database directory, plus one tbl per table.
ms <- src_monetdb(embedded = dbdir)
mt1 <- tbl(ms, "mtcars1")
mt2 <- tbl(ms, "mtcars2")
# Combine both tables in R and write the result back as table "mt_1_2".
# `n = -1` must be passed to as.data.frame() to retrieve all rows of each tbl
# instead of only the first part of the result.
# bind_rows() is used in place of the deprecated rbind_list(); it also matches
# columns by name and fills columns missing from one input with NA.
dbWriteTable(con, "mt_1_2", bind_rows(as.data.frame(mt1, n = -1),
                                      as.data.frame(mt2, n = -1)))
# Row count of the combined table: both inputs have 320000 rows.
dbGetQuery(con, "SELECT COUNT(*) FROM mt_1_2")
## L1
## 1 640000