data.table fread() - 跳过文档的第一部分
data.table fread() - first part of document skipped
你好,
我有几个布局相同的文本文件,我想用 fread() 函数读入这些文件。 (请在此处查看两个示例文件:https://www.dropbox.com/sh/grpai6ppc6oq3ka/AADyECZHz5KW7wtv5xjF5-ena?dl=0)
文档分为两部分,第一部分包含 16 列,第二部分包含 7 列。我只想要第一部分的数据,而且只有第1列和第2列。
dat10 <- fread("CalcV10.txt", select = c(1,2), verbose=TRUE, col.names = c("Net", "Nrp"))
> head(dat10)
Net Nrp
1: 225 1
2: 247 1
3: 268 1
4: 287 1
5: 301 12
6: 302 4
这非常适用于我的一部分数据(例如 CalcV10),其中不止一行数据。
但对于另一个第一部分仅包含一行数据的文件,第一部分被跳过了,读入的反而是文档的第二部分:
> head(dat3)
Net Nrp
1: 1000 9.9
2: 1000 14.8
3: 1000 12.7
4: 1000 14.8
5: 1000 11.7
6: 1000 14.8
我尝试通过 colClasses 指定列 (colClasses=list(character=1:16)),但这没有帮助。如有任何提示,不胜感激!
祝好,亚奎林
我用的是data.table1.10.4版,R 3.3.2版,R Studio 1.0.136版(都是两周前更新的)
编辑
我有 40 个同名同布局的文件(Calc.txt)。它们在 20 个文件夹中,名为 V1 - V20,并且每个文件夹都有两个以两个 sim_types 命名的子文件夹。为了读取这些文本文件,我创建了以下函数:
#' Read columns 1 and 2 of every `Calc.txt` below `dataDir/V<i>/<sim_type>`.
#'
#' @param NrV Number of `V` folders to visit (`V1` ... `V<NrV>`).
#' @param sim_type Name of the sub-folder inside each `V` folder.
#' @param FT Label written to the `type` column of every row that is read.
#' @return A data.frame with the columns `Net`, `Nrp` and `type`; an empty
#'   data.frame when `NrV` is 0.
read.res <- function(NrV, sim_type, FT) {
  # `dataDir` is not a parameter: it is looked up in the enclosing environment.
  # Full paths are built instead of calling setwd(), so the caller's working
  # directory is left alone and a relative `dataDir` keeps resolving correctly
  # on every iteration.
  files <- file.path(dataDir, paste0("V", seq_len(NrV)), sim_type, "Calc.txt")

  parts <- lapply(files, function(path) {
    dat0 <- fread(path, select = c(1, 2), col.names = c("Net", "Nrp"))
    dat0$type <- FT
    dat0
  })

  if (length(parts) == 0L) {
    return(data.frame())
  }

  # Bind once at the end rather than growing the result inside the loop.
  as.data.frame(do.call(rbind, parts))
}
# Read all 20 V-folders once per simulation type; FT becomes the `type` label.
# NOTE(review): sim_F and sim_nF are not defined in this snippet - presumably
# the names of the two sim_type sub-folders; confirm before running.
Forest <- read.res(NrV=20, sim_type=sim_F, FT="F")
nonForest <- read.res(NrV=20, sim_type=sim_nF, FT="nF")
# Stack both result sets into one table.
data <- rbind(Forest, nonForest)
@Sathish 一次读取一个文件效果很好,但如果能对所有文件自动执行此步骤就更好了。我一直没能把 Sathish 的建议整合进我的函数。有什么想法吗?
library('data.table')
fn1 <- "CalcV3.txt"
fn2 <- "CalcV10.txt"

# Number of rows to read from the first section: line number of the first
# line containing "Sim_data", minus 5.
# NOTE(review): the offset of 5 comes from the layout of the sample files -
# confirm it holds for every input file.
n1 <- grep("Sim_data", readLines(fn1), fixed = TRUE)[1L] - 5
stopifnot(!is.na(n1)) # no "Sim_data" line found in fn1

# Skip the first line and read n1 rows; sep = "\t" is used so that each line
# can be split on blanks afterwards (see the use of x1$V1).
x1 <- fread(fn1, nrows = n1, header = FALSE, skip = 1, sep = "\t",
            strip.white = FALSE, stringsAsFactors = FALSE)

n2 <- grep("Sim_data", readLines(fn2), fixed = TRUE)[1L] - 5
stopifnot(!is.na(n2)) # no "Sim_data" line found in fn2
x2 <- fread(fn2, nrows = n2, header = FALSE, skip = 1, sep = "\t",
            strip.white = FALSE, stringsAsFactors = FALSE)
#' Parse whitespace-separated numeric text lines into a numeric matrix.
#'
#' @param x Character vector; each element is one text line of numbers.
#' @param from,to First and last field (1-based) to keep from every line.
#' @return Numeric matrix with one row per element of `x` (row names "V1",
#'   "V2", ...) and one column per selected field. Fields that are missing or
#'   not numeric become `NA`. Empty `x` gives a matrix with zero rows.
my_func <- function(x, from, to) {
  keep <- from:to

  # Split on runs of any whitespace so repeated blanks and tabs are handled.
  fields <- strsplit(trimws(x), "[[:space:]]+")
  rows <- lapply(fields, function(z) as.numeric(z[nzchar(z)])[keep])

  if (length(rows) == 0L) {
    return(matrix(numeric(0), nrow = 0L, ncol = length(keep)))
  }

  # One matrix row per input line.
  out <- do.call(rbind, rows)
  rownames(out) <- paste0("V", seq_along(rows))
  out
}
my_func(x1$V1, 1, 16) # all columns
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
# V1 1000 2100 7 10 11 12 0.9 1.9 2 2.2 12.3 14.8 17.1 42.1 -52.1 -40.1
my_func(x1$V1, 2, 4) # columns from 2 to 4
# [,1] [,2] [,3]
# V1 2100 7 10
my_func(x2$V1, 1, 16) # all columns
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
# V1 225 1 773.1 773.1 773.1 773.1 0.5 0.5 0.5 0.5 21.5 21.5 34.7 34.7 -42.5 -42.5
# V2 247 1 833.5 833.5 833.5 833.5 0.6 0.6 0.6 0.6 20.2 20.2 40.9 40.9 -15.4 -15.4
# V3 268 1 704.4 704.4 704.4 704.4 1.8 1.8 1.8 1.8 20.6 20.6 32.8 32.8 -42.9 -42.9
# V4 287 1 325.1 325.1 325.1 325.1 0.9 0.9 0.9 0.9 14.0 14.0 25.0 25.0 -42.1 -42.1
# V5 301 12 170.8 325.8 437.8 437.8 0.5 0.8 5.9 5.9 9.8 16.3 17.2 27.2 -32.2 -20.2
# V6 302 4 85.0 218.0 218.0 218.0 0.5 0.5 0.5 0.5 6.8 14.9 8.1 15.1 -38.4 -34.4
# V7 303 3 70.5 85.5 85.5 85.5 0.5 0.5 0.5 0.5 6.2 6.4 11.4 12.4 -26.9 -17.9
# V8 316 56 499.1 689.1 728.1 772.1 0.6 1.3 1.8 1.9 15.9 20.9 28.9 36.9 -38.6 -31.6
# V9 317 772 367.5 569.5 618.5 705.5 0.5 0.7 0.9 1.0 13.7 17.9 27.3 35.3 -26.6 -14.6
# V10 318 52 304.2 445.2 511.2 615.2 0.6 1.3 1.8 2.0 12.5 17.8 23.5 34.5 -21.6 0.4
# V11 319 4 412.3 527.3 527.3 527.3 0.6 0.7 0.7 0.7 15.1 20.9 21.9 33.9 -25.8 -4.8
# V12 330 14 107.7 264.7 421.7 421.7 0.5 0.8 1.3 1.3 8.2 14.4 14.7 27.7 -45.7 -27.7
# V13 331 872 229.3 406.3 468.3 531.3 0.5 1.0 1.5 2.3 11.7 17.1 19.2 28.2 -47.5 -37.5
# V14 332 35 428.1 690.1 728.1 774.1 1.1 3.2 4.1 4.8 17.0 22.6 22.6 35.6 -51.3 -35.3
# V15 333 4 452.0 523.0 523.0 523.0 0.7 1.0 1.0 1.0 15.8 17.1 28.5 29.5 -45.9 -38.9
# V16 1000 2100 143.6 200.6 215.6 232.6 1.2 2.1 2.3 2.4 12.4 14.8 8.1 17.1 -52.1 -41.1
编辑:
#' Parse whitespace-separated numeric text lines into a numeric matrix.
#'
#' @param x Character vector; each element is one text line of numbers.
#' @param from,to First and last field (1-based) to keep from every line.
#' @return Numeric matrix with one row per element of `x` (row names "V1",
#'   "V2", ...) and one column per selected field. Fields that are missing or
#'   not numeric become `NA`. Empty `x` gives a matrix with zero rows.
my_func <- function(x, from, to) {
  keep <- from:to

  # Split on runs of any whitespace so repeated blanks and tabs are handled.
  fields <- strsplit(trimws(x), "[[:space:]]+")
  rows <- lapply(fields, function(z) as.numeric(z[nzchar(z)])[keep])

  if (length(rows) == 0L) {
    return(matrix(numeric(0), nrow = 0L, ncol = length(keep)))
  }

  # One matrix row per input line.
  out <- do.call(rbind, rows)
  rownames(out) <- paste0("V", seq_along(rows))
  out
}
# Load data.table once and fail loudly if it is missing; require() inside the
# loop would only return FALSE and let fread() fail later.
library(data.table)

root_path <- "temp" # Set `root_path` variable to a desired location

# Candidate folders: <root_path>/V1..V20/sim_types1..2
fdirs <- unlist(lapply(
  file.path(root_path, paste0("V", 1:20)),
  function(x) file.path(x, paste0("sim_types", 1:2))
))

all_dfs <- list() # one entry per file read, named by the file path
for (i in fdirs) {
  fn <- file.path(i, "Calc.txt")

  if (!file.exists(fn)) {
    warning(paste0("The file ", fn, " does not exist!"))
    next
  }

  # Line number(s) of the "Sim_data" marker; an empty or unexpected file has
  # none and is skipped with a warning instead of aborting the whole loop.
  marker <- grep("Sim_data", readLines(fn), fixed = TRUE)
  if (length(marker) == 0L) {
    warning(paste0("The file ", fn, " contains no 'Sim_data' line; skipped."))
    next
  }

  # NOTE(review): the offset of 5 comes from the layout of the sample files -
  # confirm it holds for every input file.
  n1 <- marker[1L] - 5
  x1 <- fread(fn, nrows = n1, header = FALSE, skip = 1, sep = "\t",
              strip.white = FALSE, stringsAsFactors = FALSE)

  # Keep only the first two fields of every line.
  df <- my_func(x1$V1, 1, 2)
  colnames(df) <- c("Net", "Nrp")
  all_dfs[[fn]] <- df
}
warnings()
# 38: The file temp/V20/sim_types2/Calc.txt does not exist!
all_dfs
# $`temp/V1/sim_types1/Calc.txt`
# Net Nrp
# V1 1000 2100
#
# $`temp/V1/sim_types2/Calc.txt`
# Net Nrp
# V1 1000 2100
#
# $`temp/V2/sim_types1/Calc.txt`
# Net Nrp
# V1 1000 2100
#
# $`temp/V2/sim_types2/Calc.txt`
# Net Nrp
# V1 1000 2100
如果您想尝试文件和目录,请尝试这个可重现的示例,它将创建目录和文件。将 root_path
变量设置到所需位置。
# reproducible example: creates <root_path>/V1..V20/sim_types1..2, each
# holding an empty Calc.txt
root_path <- "temp"
dirs <- file.path(root_path, paste0("V", 1:20))
for (v_dir in dirs) {
  dir.create(v_dir, recursive = TRUE)
  for (k in 1:2) {
    sim_dir <- file.path(v_dir, paste0("sim_types", k))
    dir.create(sim_dir, recursive = TRUE)
    file.create(file.path(sim_dir, "Calc.txt"))
  }
}
你好, 我有几个布局相同的文本文件,我想用 fread() 函数读入这些文件。 (请在此处查看两个示例文件:https://www.dropbox.com/sh/grpai6ppc6oq3ka/AADyECZHz5KW7wtv5xjF5-ena?dl=0) 文档分为两部分,第一部分包含 16 列,第二部分包含 7 列。我只想要第一部分的数据,而且只有第1列和第2列。
dat10 <- fread("CalcV10.txt", select = c(1,2), verbose=TRUE, col.names = c("Net", "Nrp"))
> head(dat10)
Net Nrp
1: 225 1
2: 247 1
3: 268 1
4: 287 1
5: 301 12
6: 302 4
这非常适用于我的一部分数据(例如 CalcV10),其中不止一行数据。
但对于另一个第一部分仅包含一行数据的文件,第一部分被跳过了,读入的反而是文档的第二部分:
> head(dat3)
Net Nrp
1: 1000 9.9
2: 1000 14.8
3: 1000 12.7
4: 1000 14.8
5: 1000 11.7
6: 1000 14.8
我尝试通过 colClasses 指定列 (colClasses=list(character=1:16)),但这没有帮助。如有任何提示,不胜感激!
祝好,亚奎林
我用的是data.table1.10.4版,R 3.3.2版,R Studio 1.0.136版(都是两周前更新的)
编辑
我有 40 个同名同布局的文件(Calc.txt)。它们在 20 个文件夹中,名为 V1 - V20,并且每个文件夹都有两个以两个 sim_types 命名的子文件夹。为了读取这些文本文件,我创建了以下函数:
#' Read columns 1 and 2 of every `Calc.txt` below `dataDir/V<i>/<sim_type>`.
#'
#' @param NrV Number of `V` folders to visit (`V1` ... `V<NrV>`).
#' @param sim_type Name of the sub-folder inside each `V` folder.
#' @param FT Label written to the `type` column of every row that is read.
#' @return A data.frame with the columns `Net`, `Nrp` and `type`; an empty
#'   data.frame when `NrV` is 0.
read.res <- function(NrV, sim_type, FT) {
  # `dataDir` is not a parameter: it is looked up in the enclosing environment.
  # Full paths are built instead of calling setwd(), so the caller's working
  # directory is left alone and a relative `dataDir` keeps resolving correctly
  # on every iteration.
  files <- file.path(dataDir, paste0("V", seq_len(NrV)), sim_type, "Calc.txt")

  parts <- lapply(files, function(path) {
    dat0 <- fread(path, select = c(1, 2), col.names = c("Net", "Nrp"))
    dat0$type <- FT
    dat0
  })

  if (length(parts) == 0L) {
    return(data.frame())
  }

  # Bind once at the end rather than growing the result inside the loop.
  as.data.frame(do.call(rbind, parts))
}
# Read all 20 V-folders once per simulation type; FT becomes the `type` label.
# NOTE(review): sim_F and sim_nF are not defined in this snippet - presumably
# the names of the two sim_type sub-folders; confirm before running.
Forest <- read.res(NrV=20, sim_type=sim_F, FT="F")
nonForest <- read.res(NrV=20, sim_type=sim_nF, FT="nF")
# Stack both result sets into one table.
data <- rbind(Forest, nonForest)
@Sathish 一次读取一个文件效果很好,但如果能对所有文件自动执行此步骤就更好了。我一直没能把 Sathish 的建议整合进我的函数。有什么想法吗?
library('data.table')
fn1 <- "CalcV3.txt"
fn2 <- "CalcV10.txt"

# Number of rows to read from the first section: line number of the first
# line containing "Sim_data", minus 5.
# NOTE(review): the offset of 5 comes from the layout of the sample files -
# confirm it holds for every input file.
n1 <- grep("Sim_data", readLines(fn1), fixed = TRUE)[1L] - 5
stopifnot(!is.na(n1)) # no "Sim_data" line found in fn1

# Skip the first line and read n1 rows; sep = "\t" is used so that each line
# can be split on blanks afterwards (see the use of x1$V1).
x1 <- fread(fn1, nrows = n1, header = FALSE, skip = 1, sep = "\t",
            strip.white = FALSE, stringsAsFactors = FALSE)

n2 <- grep("Sim_data", readLines(fn2), fixed = TRUE)[1L] - 5
stopifnot(!is.na(n2)) # no "Sim_data" line found in fn2
x2 <- fread(fn2, nrows = n2, header = FALSE, skip = 1, sep = "\t",
            strip.white = FALSE, stringsAsFactors = FALSE)
#' Parse whitespace-separated numeric text lines into a numeric matrix.
#'
#' @param x Character vector; each element is one text line of numbers.
#' @param from,to First and last field (1-based) to keep from every line.
#' @return Numeric matrix with one row per element of `x` (row names "V1",
#'   "V2", ...) and one column per selected field. Fields that are missing or
#'   not numeric become `NA`. Empty `x` gives a matrix with zero rows.
my_func <- function(x, from, to) {
  keep <- from:to

  # Split on runs of any whitespace so repeated blanks and tabs are handled.
  fields <- strsplit(trimws(x), "[[:space:]]+")
  rows <- lapply(fields, function(z) as.numeric(z[nzchar(z)])[keep])

  if (length(rows) == 0L) {
    return(matrix(numeric(0), nrow = 0L, ncol = length(keep)))
  }

  # One matrix row per input line.
  out <- do.call(rbind, rows)
  rownames(out) <- paste0("V", seq_along(rows))
  out
}
my_func(x1$V1, 1, 16) # all columns
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
# V1 1000 2100 7 10 11 12 0.9 1.9 2 2.2 12.3 14.8 17.1 42.1 -52.1 -40.1
my_func(x1$V1, 2, 4) # columns from 2 to 4
# [,1] [,2] [,3]
# V1 2100 7 10
my_func(x2$V1, 1, 16) # all columns
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
# V1 225 1 773.1 773.1 773.1 773.1 0.5 0.5 0.5 0.5 21.5 21.5 34.7 34.7 -42.5 -42.5
# V2 247 1 833.5 833.5 833.5 833.5 0.6 0.6 0.6 0.6 20.2 20.2 40.9 40.9 -15.4 -15.4
# V3 268 1 704.4 704.4 704.4 704.4 1.8 1.8 1.8 1.8 20.6 20.6 32.8 32.8 -42.9 -42.9
# V4 287 1 325.1 325.1 325.1 325.1 0.9 0.9 0.9 0.9 14.0 14.0 25.0 25.0 -42.1 -42.1
# V5 301 12 170.8 325.8 437.8 437.8 0.5 0.8 5.9 5.9 9.8 16.3 17.2 27.2 -32.2 -20.2
# V6 302 4 85.0 218.0 218.0 218.0 0.5 0.5 0.5 0.5 6.8 14.9 8.1 15.1 -38.4 -34.4
# V7 303 3 70.5 85.5 85.5 85.5 0.5 0.5 0.5 0.5 6.2 6.4 11.4 12.4 -26.9 -17.9
# V8 316 56 499.1 689.1 728.1 772.1 0.6 1.3 1.8 1.9 15.9 20.9 28.9 36.9 -38.6 -31.6
# V9 317 772 367.5 569.5 618.5 705.5 0.5 0.7 0.9 1.0 13.7 17.9 27.3 35.3 -26.6 -14.6
# V10 318 52 304.2 445.2 511.2 615.2 0.6 1.3 1.8 2.0 12.5 17.8 23.5 34.5 -21.6 0.4
# V11 319 4 412.3 527.3 527.3 527.3 0.6 0.7 0.7 0.7 15.1 20.9 21.9 33.9 -25.8 -4.8
# V12 330 14 107.7 264.7 421.7 421.7 0.5 0.8 1.3 1.3 8.2 14.4 14.7 27.7 -45.7 -27.7
# V13 331 872 229.3 406.3 468.3 531.3 0.5 1.0 1.5 2.3 11.7 17.1 19.2 28.2 -47.5 -37.5
# V14 332 35 428.1 690.1 728.1 774.1 1.1 3.2 4.1 4.8 17.0 22.6 22.6 35.6 -51.3 -35.3
# V15 333 4 452.0 523.0 523.0 523.0 0.7 1.0 1.0 1.0 15.8 17.1 28.5 29.5 -45.9 -38.9
# V16 1000 2100 143.6 200.6 215.6 232.6 1.2 2.1 2.3 2.4 12.4 14.8 8.1 17.1 -52.1 -41.1
编辑:
#' Parse whitespace-separated numeric text lines into a numeric matrix.
#'
#' @param x Character vector; each element is one text line of numbers.
#' @param from,to First and last field (1-based) to keep from every line.
#' @return Numeric matrix with one row per element of `x` (row names "V1",
#'   "V2", ...) and one column per selected field. Fields that are missing or
#'   not numeric become `NA`. Empty `x` gives a matrix with zero rows.
my_func <- function(x, from, to) {
  keep <- from:to

  # Split on runs of any whitespace so repeated blanks and tabs are handled.
  fields <- strsplit(trimws(x), "[[:space:]]+")
  rows <- lapply(fields, function(z) as.numeric(z[nzchar(z)])[keep])

  if (length(rows) == 0L) {
    return(matrix(numeric(0), nrow = 0L, ncol = length(keep)))
  }

  # One matrix row per input line.
  out <- do.call(rbind, rows)
  rownames(out) <- paste0("V", seq_along(rows))
  out
}
# Load data.table once and fail loudly if it is missing; require() inside the
# loop would only return FALSE and let fread() fail later.
library(data.table)

root_path <- "temp" # Set `root_path` variable to a desired location

# Candidate folders: <root_path>/V1..V20/sim_types1..2
fdirs <- unlist(lapply(
  file.path(root_path, paste0("V", 1:20)),
  function(x) file.path(x, paste0("sim_types", 1:2))
))

all_dfs <- list() # one entry per file read, named by the file path
for (i in fdirs) {
  fn <- file.path(i, "Calc.txt")

  if (!file.exists(fn)) {
    warning(paste0("The file ", fn, " does not exist!"))
    next
  }

  # Line number(s) of the "Sim_data" marker; an empty or unexpected file has
  # none and is skipped with a warning instead of aborting the whole loop.
  marker <- grep("Sim_data", readLines(fn), fixed = TRUE)
  if (length(marker) == 0L) {
    warning(paste0("The file ", fn, " contains no 'Sim_data' line; skipped."))
    next
  }

  # NOTE(review): the offset of 5 comes from the layout of the sample files -
  # confirm it holds for every input file.
  n1 <- marker[1L] - 5
  x1 <- fread(fn, nrows = n1, header = FALSE, skip = 1, sep = "\t",
              strip.white = FALSE, stringsAsFactors = FALSE)

  # Keep only the first two fields of every line.
  df <- my_func(x1$V1, 1, 2)
  colnames(df) <- c("Net", "Nrp")
  all_dfs[[fn]] <- df
}
warnings()
# 38: The file temp/V20/sim_types2/Calc.txt does not exist!
all_dfs
# $`temp/V1/sim_types1/Calc.txt`
# Net Nrp
# V1 1000 2100
#
# $`temp/V1/sim_types2/Calc.txt`
# Net Nrp
# V1 1000 2100
#
# $`temp/V2/sim_types1/Calc.txt`
# Net Nrp
# V1 1000 2100
#
# $`temp/V2/sim_types2/Calc.txt`
# Net Nrp
# V1 1000 2100
如果您想尝试文件和目录,请尝试这个可重现的示例,它将创建目录和文件。将 root_path
变量设置到所需位置。
# reproducible example: creates <root_path>/V1..V20/sim_types1..2, each
# holding an empty Calc.txt
root_path <- "temp"
dirs <- file.path(root_path, paste0("V", 1:20))
for (v_dir in dirs) {
  dir.create(v_dir, recursive = TRUE)
  for (k in 1:2) {
    sim_dir <- file.path(v_dir, paste0("sim_types", k))
    dir.create(sim_dir, recursive = TRUE)
    file.create(file.path(sim_dir, "Calc.txt"))
  }
}