data.table fread() - 跳过文档的第一部分

Question

您好，我有几个布局相同的文本文件，想用 fread() 函数读入。（两个示例文件见：https://www.dropbox.com/sh/grpai6ppc6oq3ka/AADyECZHz5KW7wtv5xjF5-ena?dl=0）每个文件分为两部分：第一部分有 16 列，第二部分有 7 列。我只需要第一部分的数据，并且只需要其中的第 1 列和第 2 列。

 # Read only the first two columns of CalcV10.txt and name them Net and Nrp.
 dat10 <- fread(
   "CalcV10.txt",
   select = c(1, 2),
   col.names = c("Net", "Nrp"),
   verbose = TRUE
 )

> head(dat10)
Net Nrp
1: 225   1
2: 247   1
3: 268   1
4: 287   1
5: 301  12
6: 302   4

这对我的一部分文件（例如 CalcV10）效果很好，这些文件的第一部分包含不止一行数据。

但是，对于第一部分仅包含一行数据的另一个文件，fread() 跳过了第一部分，转而读取了文档的第二部分：

> head(dat3)
Net  Nrp
1: 1000      9.9   
2: 1000     14.8   
3: 1000     12.7    
4: 1000     14.8    
5: 1000     11.7    
6: 1000     14.8

我尝试更改行数 (colClasses=list(character=1:16))，但这没有帮助。非常感谢任何提示！

祝好，亚奎林

我使用的是 data.table 1.10.4 版、R 3.3.2 版和 RStudio 1.0.136 版（均在两周前更新）

编辑

我有 40 个同名同布局的文件(Calc.txt)。它们在 20 个文件夹中，名为 V1 - V20，并且每个文件夹都有两个以两个 sim_types 命名的子文件夹。为了读取这些文本文件，我创建了以下函数：

   # Read the first two columns (named Net and Nrp) of every
   # <dataDir>/V<i>/<sim_type>/Calc.txt for i = 1..NrV and stack the rows.
   #
   # Args:
   #   NrV:      number of V folders to read (V1 ... V<NrV>).
   #   sim_type: name of the sub-folder inside each V folder.
   #   FT:       label written to the `type` column of every row.
   #
   # Returns a data.frame with the columns Net, Nrp and type.
   #
   # NOTE: `dataDir` is not a parameter; it is a free variable that must be
   # defined (e.g. globally) before this function is called.
   read.res <- function(NrV, sim_type, FT) {
     # seq_len() yields an empty sequence for NrV = 0, whereas 1:NrV would
     # count 1, 0 and try to read folders that were never requested.
     V <- paste0("V", seq_len(NrV))

     # Build every path explicitly instead of calling setwd(), so the working
     # directory of the session is left untouched.
     parts <- lapply(V, function(v) {
       dat0 <- fread(file.path(dataDir, v, sim_type, "Calc.txt"),
                     select = c(1, 2), col.names = c("Net", "Nrp"))
       dat0$type <- FT
       dat0
     })

     # Bind once at the end rather than growing the result inside a loop.
     as.data.frame(rbindlist(parts))
   }

  # Read both simulation types from V1..V20 and stack them into one table;
  # the FT label ends up in the `type` column.
  Forest <- read.res(NrV = 20, sim_type = sim_F, FT = "F")
  nonForest <- read.res(NrV = 20, sim_type = sim_nF, FT = "nF")
  data <- rbind(Forest, nonForest)

@Sathish 一次读取一个文件效果很好，但如果能对所有文件自动执行此步骤就更好了。我很难把 Sathish 的建议整合进我的函数。有什么想法吗？

Answer 1

library('data.table')
fn1 <- "CalcV3.txt"
fn2 <- "CalcV10.txt"

# Line number of the 'Sim_data' marker minus 5; used below as the number of
# rows to read from the first part of the file.
n1 <- grep('Sim_data', readLines(fn1)) - 5
# Skip the first line, then read n1 rows. With sep = '\t' and
# strip.white = FALSE each line is kept as a single string in column V1
# (presumably the files contain no tab characters -- verify against the data).
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
x1 <- fread(fn1, nrows = n1, header = FALSE, skip = 1, sep = '\t',
            strip.white = FALSE, stringsAsFactors = FALSE)

# Same procedure for the second example file.
n2 <- grep('Sim_data', readLines(fn2)) - 5
x2 <- fread(fn2, nrows = n2, header = FALSE, skip = 1, sep = '\t',
            strip.white = FALSE, stringsAsFactors = FALSE)

# Split each string of `x` at spaces, convert the non-empty pieces to numbers
# and keep fields `from`..`to`. Returns a matrix with one row per string.
my_func <- function(x, from, to)
{
  tokens <- strsplit(x, '\ ')   # break every string at single spaces
  fields <- lapply(tokens, function(tok) {
    values <- as.numeric(tok[ tok != '' ])   # runs of spaces leave '' pieces; drop them
    values[from:to]
  })
  # rbindlist() takes each parsed string as one column, so the result is
  # transposed to get one row per string
  as_columns <- rbindlist(l = list( fields ))
  t(as_columns)
}

my_func(x1$V1, from = 1, to = 16)   # fields 1 to 16, i.e. all columns
#    [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
# V1 1000 2100    7   10   11   12  0.9  1.9    2   2.2  12.3  14.8  17.1  42.1 -52.1 -40.1

my_func(x1$V1, from = 2, to = 4)  # only fields 2 through 4
#    [,1] [,2] [,3]
# V1 2100    7   10

my_func(x2$V1, from = 1, to = 16)  # fields 1 to 16, i.e. all columns
#     [,1] [,2]  [,3]  [,4]  [,5]  [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
# V1   225    1 773.1 773.1 773.1 773.1  0.5  0.5  0.5   0.5  21.5  21.5  34.7  34.7 -42.5 -42.5
# V2   247    1 833.5 833.5 833.5 833.5  0.6  0.6  0.6   0.6  20.2  20.2  40.9  40.9 -15.4 -15.4
# V3   268    1 704.4 704.4 704.4 704.4  1.8  1.8  1.8   1.8  20.6  20.6  32.8  32.8 -42.9 -42.9
# V4   287    1 325.1 325.1 325.1 325.1  0.9  0.9  0.9   0.9  14.0  14.0  25.0  25.0 -42.1 -42.1
# V5   301   12 170.8 325.8 437.8 437.8  0.5  0.8  5.9   5.9   9.8  16.3  17.2  27.2 -32.2 -20.2
# V6   302    4  85.0 218.0 218.0 218.0  0.5  0.5  0.5   0.5   6.8  14.9   8.1  15.1 -38.4 -34.4
# V7   303    3  70.5  85.5  85.5  85.5  0.5  0.5  0.5   0.5   6.2   6.4  11.4  12.4 -26.9 -17.9
# V8   316   56 499.1 689.1 728.1 772.1  0.6  1.3  1.8   1.9  15.9  20.9  28.9  36.9 -38.6 -31.6
# V9   317  772 367.5 569.5 618.5 705.5  0.5  0.7  0.9   1.0  13.7  17.9  27.3  35.3 -26.6 -14.6
# V10  318   52 304.2 445.2 511.2 615.2  0.6  1.3  1.8   2.0  12.5  17.8  23.5  34.5 -21.6   0.4
# V11  319    4 412.3 527.3 527.3 527.3  0.6  0.7  0.7   0.7  15.1  20.9  21.9  33.9 -25.8  -4.8
# V12  330   14 107.7 264.7 421.7 421.7  0.5  0.8  1.3   1.3   8.2  14.4  14.7  27.7 -45.7 -27.7
# V13  331  872 229.3 406.3 468.3 531.3  0.5  1.0  1.5   2.3  11.7  17.1  19.2  28.2 -47.5 -37.5
# V14  332   35 428.1 690.1 728.1 774.1  1.1  3.2  4.1   4.8  17.0  22.6  22.6  35.6 -51.3 -35.3
# V15  333    4 452.0 523.0 523.0 523.0  0.7  1.0  1.0   1.0  15.8  17.1  28.5  29.5 -45.9 -38.9
# V16 1000 2100 143.6 200.6 215.6 232.6  1.2  2.1  2.3   2.4  12.4  14.8   8.1  17.1 -52.1 -41.1

编辑：

# Split each string of `x` at spaces, convert the non-empty pieces to numbers
# and keep fields `from`..`to`. Returns a matrix with one row per string.
my_func <- function(x, from, to)
{
  tokens <- strsplit(x, '\ ')   # break every string at single spaces
  fields <- lapply(tokens, function(tok) {
    values <- as.numeric(tok[ tok != '' ])   # runs of spaces leave '' pieces; drop them
    values[from:to]
  })
  # rbindlist() takes each parsed string as one column, so the result is
  # transposed to get one row per string
  as_columns <- rbindlist(l = list( fields ))
  t(as_columns)
}

# Load the package once, up front. library() stops with an error when the
# package is missing, whereas require() only returns FALSE.
library('data.table')

root_path <- "temp"   # Set `root_path` variable to a desired location
# Every <root_path>/V<i>/sim_types<j> directory, i = 1..20, j = 1..2
fdirs <- unlist(lapply(file.path(root_path, paste0('V', 1:20)),
                       function(x) file.path(x, paste0('sim_types', 1:2))))

all_dfs <- list()  # parsed Net/Nrp matrix of every file, keyed by file path
for (i in fdirs)
{
  fn <- file.path(i, 'Calc.txt')

  if (!file.exists(fn)) {
    warning('The file ', fn, ' does not exist!')
    next
  }

  marker <- grep('Sim_data', readLines(fn))
  if (length(marker) == 0L) {
    # Without a Sim_data line (e.g. an empty file) the row count below would
    # be zero-length; report the file and carry on with the others.
    warning('The file ', fn, ' contains no Sim_data line!')
    next
  }

  # Line number of the first Sim_data marker minus 5 = number of rows to read
  n1 <- marker[1L] - 5
  # Skip the first line and read n1 rows, each kept as one string in V1
  x1 <- fread(fn, nrows = n1, header = FALSE, skip = 1, sep = '\t',
              strip.white = FALSE, stringsAsFactors = FALSE)
  df <- my_func(x1$V1, 1, 2)   # first two fields of every row
  colnames(df) <- c('Net', 'Nrp')
  all_dfs[[fn]] <- df
}

warnings()
# 38: The file temp/V20/sim_types2/Calc.txt does not exist!

all_dfs
# $`temp/V1/sim_types1/Calc.txt`
# Net  Nrp
# V1 1000 2100
# 
# $`temp/V1/sim_types2/Calc.txt`
# Net  Nrp
# V1 1000 2100
# 
# $`temp/V2/sim_types1/Calc.txt`
# Net  Nrp
# V1 1000 2100
# 
# $`temp/V2/sim_types2/Calc.txt`
# Net  Nrp
# V1 1000 2100

如果您想尝试文件和目录，请尝试这个可重现的示例，它将创建目录和文件。将 root_path 变量设置到所需位置。

# reproducible example: creates <root_path>/V1 .. V20, each containing the
# sub-folders sim_types1 and sim_types2 with a Calc.txt file inside
root_path <- "temp"

dirs <- file.path(root_path, paste0('V', 1:20))

for (fpath in dirs) {
  dir.create(path = fpath, recursive = TRUE)

  sub_dirs <- file.path(fpath, paste0('sim_types', 1:2))
  for (sfpath in sub_dirs) {
    dir.create(path = sfpath, recursive = TRUE)
    file.create(file.path(sfpath, 'Calc.txt'))
  }
}

data.table fread() - 跳过文档的第一部分

data.table fread() - first part of document skipped

r

fread

data.table