在 R 中高效地对超大数据框取子集
Subsetting very large data frames in R efficiently
所以我有一个 16 列和约 1700 万行的数据框。
我想先对数据框做一些 ddply 汇总,然后查看不同列之间的相关性。实现这一目标的最佳、最高效的方法是什么?我目前的方法耗时太长:
数据框是 all_df,列名是 A、B、C、...、N、O、P。
# Per-group mean and standard deviation of columns B..G, grouped by A.
# Requires plyr (ddply, summarise) to be attached and `all_df` to exist.
# The grouping column is named with plain ASCII quotes: typographic quotes
# are not valid R string delimiters and make the script fail to parse.
avB <- ddply(all_df, c("A"), summarise, NB_av = mean(B), NB_sd = sd(B))
avC <- ddply(all_df, c("A"), summarise, NC_av = mean(C), NC_sd = sd(C))
avD <- ddply(all_df, c("A"), summarise, ND_av = mean(D), ND_sd = sd(D))
avE <- ddply(all_df, c("A"), summarise, NE_av = mean(E), NE_sd = sd(E))
avF <- ddply(all_df, c("A"), summarise, NF_av = mean(F), NF_sd = sd(F))
avG <- ddply(all_df, c("A"), summarise, NG_av = mean(G), NG_sd = sd(G))
# Join the per-column summaries on A, one after another.
summary_df <- avB
summary_df <- merge(summary_df, avC, by = c("A"))
summary_df <- merge(summary_df, avD, by = c("A"))
summary_df <- merge(summary_df, avE, by = c("A"))
summary_df <- merge(summary_df, avF, by = c("A"))
summary_df <- merge(summary_df, avG, by = c("A"))
# quick look at the correlation: pairwise scatter plots of the mean columns
# (column 1 is A; even positions hold the *_av columns, odd ones the *_sd)
plot(summary_df[, c(2, 4, 6, 8, 10, 12)], gap = 0)
所以,事实上,我决定把其中很多计算(平均值、标准差等)放在 MySQL 中完成,然后在 R 中做最终的相关性分析。但是,我觉得这样不够优雅。
为什么我使用数据框而不是 data.table?因为我是把 MySQL 表读入 R 的,而 dbGetQuery(con, "select * from mysql_table") 返回的是一个数据框。
你可以试试
library(dplyr)
# Mean and sd of columns B..G for each level of A, in one grouped pass.
# across() replaces the deprecated summarise_each()/funs() pair. The output
# columns are named <col>_mean and <col>_sd; note that they come out grouped
# by column (B_mean, B_sd, C_mean, ...) rather than by statistic.
all_df %>%
  group_by(A) %>%
  summarise(across(B:G, list(mean = mean, sd = sd)))
或者另一种选择是data.table
library(data.table)
# Convert all_df to a data.table by reference, then compute mean and sd of
# columns B..G (LETTERS[2:7], the same range as the dplyr B:G selection) for
# each level of A. Every group yields two rows: the means, then the sds.
# `var` labels those rows. The label vector is expanded with rep() because
# current data.table versions refuse to recycle a length-2 RHS in `:=`.
setDT(all_df)[, lapply(.SD, function(x) c(mean(x), sd(x))), by = A,
              .SDcols = LETTERS[2:7]][
                , var := rep(c("mean", "sd"), length.out = .N)][]
注意:第一种方式的结果是宽格式,而第二种方式的结果中 'mean' 和 'sd' 以交替的行出现。
基准
# Benchmark input: repeat the full row-index sequence of all_df 1e5 times.
# seq_len() is used instead of 1:nrow() so an empty all_df cannot produce
# the index vector c(1, 0).
all_df1 <- all_df[rep(seq_len(nrow(all_df)), 1e5), ]

# dplyr: mean and sd of B..G per level of A.
system.time(
  all_df1 %>%
    group_by(A) %>%
    summarise(across(B:G, list(mean = mean, sd = sd)))
)
# Timing as originally recorded, before this call was updated;
# re-measure locally for current numbers.
#    user  system elapsed
#   0.189   0.000   0.189

# data.table: the same statistics over the same columns B..G (LETTERS[2:7]),
# so that both timings measure equivalent work. The row labels are expanded
# with rep() because current data.table versions do not recycle a length-2
# RHS in `:=`.
DT1 <- as.data.table(all_df1)
system.time(
  DT1[, lapply(.SD, function(x) c(mean(x), sd(x))), by = A,
      .SDcols = LETTERS[2:7]][
        , var := rep(c("mean", "sd"), length.out = .N)][]
)
# Timing as originally recorded, before this call was updated;
# re-measure locally for current numbers.
#    user  system elapsed
#   0.232   0.002   0.235
数据
# Reproducible 20-row sample data: one grouping column drawn from "a".."c"
# followed by a 20 x 15 matrix of integers drawn from 1..20.
set.seed(25)
m1 <- matrix(sample(1:20, 15 * 20, replace = TRUE), ncol = 15)
set.seed(353)
all_df <- data.frame(sample(letters[1:3], 20, replace = TRUE), m1)
# Name the 16 columns A..P. The names must be assigned to all_df itself;
# the earlier version assigned them to an undefined object `d1`.
colnames(all_df) <- LETTERS[seq_len(ncol(all_df))]
非常感谢 akrun!
我根据您的回答编写了一个完整的示例,其中也借鉴了 http://www.carlboettiger.info/2012/02/12/elegant-fast-data-manipulation-with-data-table.html 。该示例还展示了如何访问由 lapply 生成的对象中的特定元素。
# Worked example: grouped summaries on a roughly 1e7-row table, first with
# plyr::ddply and then with data.table. Assumes plyr and data.table are
# already attached.

# create a super large data frame: 26 x 26 (x, y) groups of grpsize rows each
grpsize <- ceiling(1e7 / 26^2)
all_df1 <- data.frame(
  x = rep(LETTERS, each = 26 * grpsize),
  # y is shorter than the other columns (26 * grpsize values); data.frame()
  # recycles it to the full length
  y = rep(letters, each = grpsize),
  v = runif(grpsize * 26^2),
  v2 = runif(grpsize * 26^2),
  stringsAsFactors = FALSE
)

# group by x and y and get the row count and the mean of v from the data frame
sumalldf <- ddply(all_df1, c("x", "y"), summarise,
                  ntotalldf = length(x), nmeanalldf = mean(v))

# convert to data.table
# NOTE(review): the original comments call data.table() the "more efficient"
# and as.data.table() the "less efficient" conversion; nothing here
# demonstrates that ranking - confirm against the data.table documentation.
DT1 <- data.table(all_df1)
DT2 <- as.data.table(all_df1)

# set keys on x,y columns (for x only, use: setkey(DT1, x))
setkey(DT1, x, y)
# setting the key as above allows calling by column value: DT1["A"]
# per the original author, attempting that without a key raises an error
# asking you to set one (may depend on the data.table version)

# take a look at DT1
print(head(DT1))
print(tail(DT1))

# now group the data table by x,y and get mean and standard deviation for all
# other columns. .SD is the per-group subset of the data without the by
# columns, so the function is applied to v and v2. Each group yields two
# rows (mean, then sd); `var` labels them. The labels are expanded with rep()
# because current data.table versions do not recycle a length-2 RHS in `:=`.
sumalldt <- DT1[, lapply(.SD, function(col) c(mean(col), sd(col))),
                by = list(x, y)][
                  , var := rep(c("mean", "sd"), length.out = .N)][]

# take a look at this new object that holds the mean and standard deviation
# of all other columns after grouping by x,y
print(head(sumalldt))
print(tail(sumalldt))

# the key of sumalldt is expected to carry over from the 'by' columns, which
# match the key of DT1; the two prints below let you verify that
print("some key, attributes etc for sumalldt")
print(key(sumalldt))
print(haskey(sumalldt))

# to get all values for x=B
# sumalldt["B"]
# to get all values for y=r
# sumalldt[list(unique(x),'r')]
# or
# sumalldt[y=="r"] # the former is more efficient (original author's claim)

# say then you want to get the values only of x=B, y=r
print("values for x=B, y=r")
print(sumalldt[list("B", "r")])
print("only the mean")
# keyed lookup for (B, r), then keep the row labelled "mean"
print(sumalldt[list("B", "r")][var == "mean", v])
print(sumalldt[list("B", "r")][var == "mean", v2])
所以我有一个 16 列和约 1700 万行的数据框。
我想先对数据框做一些 ddply 汇总,然后查看不同列之间的相关性。实现这一目标的最佳、最高效的方法是什么?我目前的方法耗时太长:
数据框是 all_df,列名是 A、B、C、...、N、O、P。
# Per-group mean and standard deviation of columns B..G, grouped by A.
# Requires plyr (ddply, summarise) to be attached and `all_df` to exist.
# The grouping column is named with plain ASCII quotes: typographic quotes
# are not valid R string delimiters and make the script fail to parse.
avB <- ddply(all_df, c("A"), summarise, NB_av = mean(B), NB_sd = sd(B))
avC <- ddply(all_df, c("A"), summarise, NC_av = mean(C), NC_sd = sd(C))
avD <- ddply(all_df, c("A"), summarise, ND_av = mean(D), ND_sd = sd(D))
avE <- ddply(all_df, c("A"), summarise, NE_av = mean(E), NE_sd = sd(E))
avF <- ddply(all_df, c("A"), summarise, NF_av = mean(F), NF_sd = sd(F))
avG <- ddply(all_df, c("A"), summarise, NG_av = mean(G), NG_sd = sd(G))
# Join the per-column summaries on A, one after another.
summary_df <- avB
summary_df <- merge(summary_df, avC, by = c("A"))
summary_df <- merge(summary_df, avD, by = c("A"))
summary_df <- merge(summary_df, avE, by = c("A"))
summary_df <- merge(summary_df, avF, by = c("A"))
summary_df <- merge(summary_df, avG, by = c("A"))
# quick look at the correlation: pairwise scatter plots of the mean columns
# (column 1 is A; even positions hold the *_av columns, odd ones the *_sd)
plot(summary_df[, c(2, 4, 6, 8, 10, 12)], gap = 0)
所以,事实上,我决定把其中很多计算(平均值、标准差等)放在 MySQL 中完成,然后在 R 中做最终的相关性分析。但是,我觉得这样不够优雅。
为什么我使用数据框而不是 data.table?因为我是把 MySQL 表读入 R 的,而 dbGetQuery(con, "select * from mysql_table") 返回的是一个数据框。
你可以试试
library(dplyr)
# Mean and sd of columns B..G for each level of A, in one grouped pass.
# across() replaces the deprecated summarise_each()/funs() pair. The output
# columns are named <col>_mean and <col>_sd; note that they come out grouped
# by column (B_mean, B_sd, C_mean, ...) rather than by statistic.
all_df %>%
  group_by(A) %>%
  summarise(across(B:G, list(mean = mean, sd = sd)))
或者另一种选择是data.table
library(data.table)
# Convert all_df to a data.table by reference, then compute mean and sd of
# columns B..G (LETTERS[2:7], the same range as the dplyr B:G selection) for
# each level of A. Every group yields two rows: the means, then the sds.
# `var` labels those rows. The label vector is expanded with rep() because
# current data.table versions refuse to recycle a length-2 RHS in `:=`.
setDT(all_df)[, lapply(.SD, function(x) c(mean(x), sd(x))), by = A,
              .SDcols = LETTERS[2:7]][
                , var := rep(c("mean", "sd"), length.out = .N)][]
注意:第一种方式的结果是宽格式,而第二种方式的结果中 'mean' 和 'sd' 以交替的行出现。
基准
# Benchmark input: repeat the full row-index sequence of all_df 1e5 times.
# seq_len() is used instead of 1:nrow() so an empty all_df cannot produce
# the index vector c(1, 0).
all_df1 <- all_df[rep(seq_len(nrow(all_df)), 1e5), ]

# dplyr: mean and sd of B..G per level of A.
system.time(
  all_df1 %>%
    group_by(A) %>%
    summarise(across(B:G, list(mean = mean, sd = sd)))
)
# Timing as originally recorded, before this call was updated;
# re-measure locally for current numbers.
#    user  system elapsed
#   0.189   0.000   0.189

# data.table: the same statistics over the same columns B..G (LETTERS[2:7]),
# so that both timings measure equivalent work. The row labels are expanded
# with rep() because current data.table versions do not recycle a length-2
# RHS in `:=`.
DT1 <- as.data.table(all_df1)
system.time(
  DT1[, lapply(.SD, function(x) c(mean(x), sd(x))), by = A,
      .SDcols = LETTERS[2:7]][
        , var := rep(c("mean", "sd"), length.out = .N)][]
)
# Timing as originally recorded, before this call was updated;
# re-measure locally for current numbers.
#    user  system elapsed
#   0.232   0.002   0.235
数据
# Reproducible 20-row sample data: one grouping column drawn from "a".."c"
# followed by a 20 x 15 matrix of integers drawn from 1..20.
set.seed(25)
m1 <- matrix(sample(1:20, 15 * 20, replace = TRUE), ncol = 15)
set.seed(353)
all_df <- data.frame(sample(letters[1:3], 20, replace = TRUE), m1)
# Name the 16 columns A..P. The names must be assigned to all_df itself;
# the earlier version assigned them to an undefined object `d1`.
colnames(all_df) <- LETTERS[seq_len(ncol(all_df))]
非常感谢 akrun!
我根据您的回答编写了一个完整的示例,其中也借鉴了 http://www.carlboettiger.info/2012/02/12/elegant-fast-data-manipulation-with-data-table.html 。该示例还展示了如何访问由 lapply 生成的对象中的特定元素。
# Worked example: grouped summaries on a roughly 1e7-row table, first with
# plyr::ddply and then with data.table. Assumes plyr and data.table are
# already attached.

# create a super large data frame: 26 x 26 (x, y) groups of grpsize rows each
grpsize <- ceiling(1e7 / 26^2)
all_df1 <- data.frame(
  x = rep(LETTERS, each = 26 * grpsize),
  # y is shorter than the other columns (26 * grpsize values); data.frame()
  # recycles it to the full length
  y = rep(letters, each = grpsize),
  v = runif(grpsize * 26^2),
  v2 = runif(grpsize * 26^2),
  stringsAsFactors = FALSE
)

# group by x and y and get the row count and the mean of v from the data frame
sumalldf <- ddply(all_df1, c("x", "y"), summarise,
                  ntotalldf = length(x), nmeanalldf = mean(v))

# convert to data.table
# NOTE(review): the original comments call data.table() the "more efficient"
# and as.data.table() the "less efficient" conversion; nothing here
# demonstrates that ranking - confirm against the data.table documentation.
DT1 <- data.table(all_df1)
DT2 <- as.data.table(all_df1)

# set keys on x,y columns (for x only, use: setkey(DT1, x))
setkey(DT1, x, y)
# setting the key as above allows calling by column value: DT1["A"]
# per the original author, attempting that without a key raises an error
# asking you to set one (may depend on the data.table version)

# take a look at DT1
print(head(DT1))
print(tail(DT1))

# now group the data table by x,y and get mean and standard deviation for all
# other columns. .SD is the per-group subset of the data without the by
# columns, so the function is applied to v and v2. Each group yields two
# rows (mean, then sd); `var` labels them. The labels are expanded with rep()
# because current data.table versions do not recycle a length-2 RHS in `:=`.
sumalldt <- DT1[, lapply(.SD, function(col) c(mean(col), sd(col))),
                by = list(x, y)][
                  , var := rep(c("mean", "sd"), length.out = .N)][]

# take a look at this new object that holds the mean and standard deviation
# of all other columns after grouping by x,y
print(head(sumalldt))
print(tail(sumalldt))

# the key of sumalldt is expected to carry over from the 'by' columns, which
# match the key of DT1; the two prints below let you verify that
print("some key, attributes etc for sumalldt")
print(key(sumalldt))
print(haskey(sumalldt))

# to get all values for x=B
# sumalldt["B"]
# to get all values for y=r
# sumalldt[list(unique(x),'r')]
# or
# sumalldt[y=="r"] # the former is more efficient (original author's claim)

# say then you want to get the values only of x=B, y=r
print("values for x=B, y=r")
print(sumalldt[list("B", "r")])
print("only the mean")
# keyed lookup for (B, r), then keep the row labelled "mean"
print(sumalldt[list("B", "r")][var == "mean", v])
print(sumalldt[list("B", "r")][var == "mean", v2])