R如何在将csv文件写入磁盘之前估计它的大小

R How to estimate csv file size prior to writing it to disk

在通过 write.csvreadr::write_csv 将 csv 文件实际写入磁盘之前,R 中是否有任何方法可以估算 csv 文件的文件大小?如果用户不小心尝试在函数中将大文件写入磁盘,我想实现一个警告。

数据帧 (object.size) 的内存占用量与磁盘大小之间似乎存在某种关系,后者要大得多。但是,内存中的对象越大,差异越小。此外,可能存在与数据帧结构相关的差异。

我不想强迫人们下载大量数据,所以请原谅缺少可重现的例子。

尝试使用 object.size 方法:

 object.size(m) # where m is your data object

这是一个想法

to <- paste(capture.output(write.csv(USArrests)), collapse="\n")
write.csv(USArrests, tf <- tempfile(fileext = ".csv"))
file.info(tf)$size
# [1] 1438
print(object.size(to), units="b")
# 1480 bytes

您可以使用以下三个函数分别计算write.table()write.csv()write.csv2()将写入的文件的确切大小,而无需生成一次整个字节流,无论是在内存中还是在磁盘上:

size.write.table <- function(x,...) {
    x <- as.data.frame(x); ## write.table() coerces to data.frame
    args <- list(...);
    defaults <- formals(write.table);
    ## get write specs as locals
    for (name in names(defaults)[-1])
        assign(name,if (is.null(args[[name]])) defaults[[name]] else args[[name]]);
    ## normalize quote arg to logical, quoteIndexes as columns to quote
    ## note: regardless of qmethod, does not touch characters other than double-quote, and only adds one byte per embedded double-quote for either qmethod
    quoteIndexesGiven <- F; ## assumption
    if (is.logical(quote) && quote) {
        quoteIndexes <- seq_along(x);
    } else if (is.numeric(quote)) {
        quoteIndexes <- quote;
        quote <- T;
        quoteIndexesGiven <- T;
    }; ## end if
    ## normalize col.names arg to logical T/F, colNames as actual column names
    emptyColNameForRowNames <- F; ## assumption
    if (is.logical(col.names)) {
        if (is.na(col.names)) {
            emptyColNameForRowNames <- T;
            col.names <- T;
        }; ## end if
        if (col.names) colNames <- names(x);
    } else {
        colNames <- as.character(col.names);
        col.names <- T;
    }; ## end if
    ## normalize row.names arg to logical, rowNames as actual row names
    if (is.logical(row.names)) {
        if (row.names) rowNames <- rownames(x);
    } else {
        rowNames <- as.character(row.names);
        row.names <- T;
    }; ## end if (else must be F)
    ## start building up file size
    size <- 0L;
    ## 1: column header
    if (col.names) {
        ## special case for zero columns: write.table() behaves as if there's one empty-string column name, weirdly
        if (ncol(x)==0L) {
            if (quote) size <- size + 2L;
        } else {
            if (emptyColNameForRowNames) {
                if (quote) size <- size + 2L; ## two double-quotes
                size <- size + nchar(sep,'bytes'); ## separator
            }; ## end if
            size <- size + sum(nchar(colNames,'bytes')); ## names (note: NA works with this; nchar() returns 2)
            if (quote) size <- size + ncol(x)*2L + sum(do.call(c,gregexpr(perl=T,'"',colNames[quoteIndexes]))>0L); ## quotes and escapes
            size <- size + nchar(sep,'bytes')*(ncol(x)-1L); ## separators
        }; ## end if
        size <- size + nchar(eol,'bytes'); ## eol; applies to both zero-columns special case and otherwise
    }; ## end if
    ## 2: row names
    if (row.names) {
        ## note: missing values are not allowed in row names
        size <- size + sum(nchar(rowNames,'bytes')); ## names
        size <- size + nchar(sep,'bytes')*nrow(x); ## separator (note: always present after row names, even for zero-column data.frame)
        if (quote) size <- size + nrow(x)*2L + sum(do.call(c,gregexpr(perl=T,'"',rowNames))>0L); ## quotes and escapes (can ignore quoteIndexes, since row names are always quoted if any column is quoted)
    }; ## end if
    ## 3: column content
    for (ci in seq_along(x)) {
        ## calc depends on class
        cl <- class(x[[ci]]);
        ## normalize date/time classes
        if (identical(cl,c('POSIXct','POSIXt')) || identical(cl,c('POSIXlt','POSIXt')))
            cl <- 'POSIXt';
        ## branch on normalized class
        ## note: can't write list type to file, so don't bother supporting list columns
        if (length(cl)==1L && cl=='raw') {
            size <- size + nrow(x)*2L;
            ## note: cannot have raw NAs
        } else { ## remaining types can have NAs
            size <- size + sum(is.na(x[[ci]]))*nchar(na,'bytes'); ## NAs
            if (length(cl)==1L && cl=='logical') {
                size <- size + sum((5:4)[na.omit(x[[ci]])+1L]); ## non-NAs
            } else if (length(cl)==1L && cl%in%c('integer','numeric','complex','ts')) {
                size <- size + sum(nchar(as.character(na.omit(x[[ci]])),'bytes')); ## non-NAs
            } else if (length(cl)==1L && cl%in%c('character','factor')) {
                size <- size + sum(nchar(as.character(na.omit(x[[ci]])),'bytes')); ## non-NAs, values -- as.character() required for factors to work
                if (quote && ci%in%quoteIndexes) size <- size + sum(!is.na(x[[ci]]))*2L + sum(do.call(c,gregexpr(perl=T,'"',na.omit(x[[ci]])))>0L); ## quotes and escapes
            } else if (length(cl)==1L && cl=='POSIXt') {
                size <- size + sum(nchar(as.character(na.omit(x[[ci]])),'bytes')); ## non-NAs
                ## special case for POSIXt: only quoted if explicitly specified by index in quote arg
                if (quoteIndexesGiven && ci%in%quoteIndexes) size <- size + sum(!is.na(x[[ci]]))*2L; ## quotes (can't be any escapes)
            } else {
                stop(sprintf('unsupported class(es) %s.',paste(collapse=',',cl)));
            }; ## end if
        }; ## end if
    }; ## end for
    ## 4: separators between columns
    size <- size + nchar(sep,'bytes')*(ncol(x)-1L)*nrow(x);
    ## 5: eols
    size <- size + nchar(eol,'bytes')*nrow(x);
    size;
}; ## end size.write.table()
## note: documentation should say "col.names to NA if row.names = TRUE (the default) or given as a character vector" for csv functions
size.write.csv <- function(x,...) do.call(size.write.table,c(list(x),sep=',',dec='.',qmethod='double',col.names={ row.names <- list(...)$row.names; if (!identical(F,row.names)) NA else T; },list(...)));
size.write.csv2 <- function(x,...) do.call(size.write.table,c(list(x),sep=';',dec=',',qmethod='double',col.names={ row.names <- list(...)$row.names; if (!identical(F,row.names)) NA else T; },list(...)));

以下是证明正确性的一些测试:

size.write.func.test.impl <- function(funcName,x,...,tf='/tmp/size.write.func.test.impl.txt') {
    writeFunc <- match.fun(funcName);
    sizeFunc <- match.fun(paste0('size.',funcName));
    writeFunc(x,tf,...);
    expected <- file.info(tf)$size;
    actual <- sizeFunc(x,tf,...);
    cat(sprintf('%s: %d %s %d\n',if (expected==actual) 'SUCCESS' else 'FAILURE',actual,if (expected==actual) '==' else '!=',expected));
}; ## end size.write.func.test.impl()
size.write.table.test <- function(...) size.write.func.test.impl('write.table',...);
size.write.csv.test <- function(...) size.write.func.test.impl('write.csv',...);
size.write.csv2.test <- function(...) size.write.func.test.impl('write.csv2',...);
size.all.test <- function(...) {
    size.write.table.test(...);
    size.write.csv.test(...);
    size.write.csv2.test(...);
}; ## end size.all.test()

size.all.test(data.frame(),quote=F);
## SUCCESS: 1 == 1
## SUCCESS: 1 == 1
## SUCCESS: 1 == 1
size.all.test(data.frame());
## SUCCESS: 3 == 3
## SUCCESS: 3 == 3
## SUCCESS: 3 == 3
size.all.test(data.frame(a=1:3));
## SUCCESS: 22 == 22
## SUCCESS: 25 == 25
## SUCCESS: 25 == 25
set.seed(1L);
df <- data.frame(raw=as.raw(0:255),logical=rep(c(F,T),len=256L),integer=0:255,double1=runif(256L),double2=runif(256L,-.Machine$double.xmax*0.5,.Machine$double.xmax*0.5),character=paste(sapply(0:255,intToUtf8),sample(c('','x','x"x','"x""x"'),256L,replace=T)),factor=factor(rep(letters,len=256L)),dtΩ=as.POSIXct('1970-01-01 00:00:00',tz='England/London'),stringsAsFactors=F);
for (ci in seq(2,ncol(df))) df[[ci]][sample(256L,10L)] <- NA;
head(df);
##   raw logical integer   double1        double2 character factor        dtΩ
## 1  00   FALSE       0 0.2655087 -4.535097e+307                a 1970-01-01
## 2  01    TRUE       1 0.3721239 -2.670418e+305  [=11=]1 x"x      b 1970-01-01
## 3  02   FALSE       2 0.5728534 -2.285466e+307  [=11=]2 x"x      c 1970-01-01
## 4  03    TRUE       3 0.9082078  7.814417e+307     [=11=]3       d 1970-01-01
## 5  04      NA       4 0.2016819  4.311961e+306  [=11=]4 x"x      e 1970-01-01
## 6  05    TRUE       5 0.8983897 -3.287178e+307  [=11=]5 x"x      f 1970-01-01
size.all.test(df);
## SUCCESS: 20634 == 20634
## SUCCESS: 20637 == 20637
## SUCCESS: 20637 == 20637
size.all.test(df,eol='zzz');
## SUCCESS: 21148 == 21148
## SUCCESS: 21151 == 21151
## SUCCESS: 21151 == 21151
size.all.test(df,sep='///'); ## csv incarnations take ownership of their overridden arguments
## SUCCESS: 24744 == 24744
## SUCCESS: 20637 == 20637
## SUCCESS: 20637 == 20637
## Warning messages:
## 1: In writeFunc(x, tf, ...) : attempt to set 'sep' ignored
## 2: In writeFunc(x, tf, ...) : attempt to set 'sep' ignored
size.all.test(df,quote=F);
## SUCCESS: 18807 == 18807
## SUCCESS: 18808 == 18808
## SUCCESS: 18808 == 18808
size.all.test(df,quote=seq(2L,ncol(df),by=2L));
## SUCCESS: 20634 == 20634
## SUCCESS: 20637 == 20637
## SUCCESS: 20637 == 20637
size.all.test(df,row.names=F);
## SUCCESS: 19206 == 19206
## SUCCESS: 19206 == 19206
## SUCCESS: 19206 == 19206
size.all.test(df,row.names=seq(1234,len=nrow(df)));
## SUCCESS: 20998 == 20998
## SUCCESS: 21001 == 21001
## SUCCESS: 21001 == 21001
size.all.test(df,na='blah');
## SUCCESS: 20774 == 20774
## SUCCESS: 20777 == 20777
## SUCCESS: 20777 == 20777
size.all.test(iris);
## SUCCESS: 4818 == 4818
## SUCCESS: 4821 == 4821
## SUCCESS: 4821 == 4821
size.all.test(USAccDeaths);
## SUCCESS: 724 == 724
## SUCCESS: 727 == 727
## SUCCESS: 727 == 727
size.all.test(USArrests);
## SUCCESS: 1384 == 1384
## SUCCESS: 1387 == 1387
## SUCCESS: 1387 == 1387
size.all.test(USArrests,eol='\r\n'); ## you're probably on Windows
## SUCCESS: 1435 == 1435
## SUCCESS: 1438 == 1438
## SUCCESS: 1438 == 1438

受@lukeA 回答的启发,我得出了以下估计文件大小的良好结果:

csv_write <- function(df = idata, filename){
  if(interactive){
    if(dim(df)[1] > 10^4){
      divisor <- 1000
      if(dim(df)[1] > 10^5){
        divisor <- 10000
      }

      df_sample <- sample_n(df, size = dim(df)[1] %/% divisor)

      test <- paste(capture.output(write.csv(df_sample)), collapse="\n")

      cat("Writing", capture.output(print(object.size(test)*divisor,
                                          units="auto")), "to disk. \n")
    }
  }

  write_csv(df, path = filename)
}

剩下的一个问题是我需要使用 write.csv 而不是 readr::write_csv 来计算近似输出大小,因为 readr::write_csv 需要一个 path 参数。不需要路径的替代方案 readr::format_csv 大大低估了文件大小(例如,估计为 2.7 Gb 与磁盘上的 3.3 Gb)。