fread read error "Expected sep (' ') but.."
fread from {data.table} has trouble with a tab-delimited file that contains partially quoted rows. I can't find a way around it, because fread handles quoting automatically (and so has no quote argument the way read.csv does). To illustrate:
library(data.table)
str1 = 'L1\tsome\tunquoted\tstuff\nL2\tsome\t"half" quoted\tstuff\nL3\tthis\t"should work"\tok thought'
str2 = gsub('"', '', str1)
fread(str2, sep='\t', header=F, skip=0L)
# V1 V2 V3 V4
# 1: L1 some unquoted stuff
# 2: L2 some half quoted stuff
# 3: L3 this should work ok thought
fread(str1, sep='\t', header=F, skip=0L)
# Error in fread(str1, sep = "\t", header = F, skip = 0L) :
# Expected sep (' ') but '
# ' ends field 3 on line 1 when detecting types: L2 some "half" quoted stuff
Other than doing a find/replace on the raw file first, is there any way around this?
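For reference, the kind of find/replace workaround I'd like to avoid looks roughly like this (just a sketch, done in R rather than on the file itself; myfile.tsv is a placeholder name):

# Read the raw lines, strip every double quote, then hand the cleaned
# text back to fread as an in-memory string.
txt <- readLines("myfile.tsv")
txt <- gsub('"', '', txt, fixed = TRUE)
dt  <- fread(paste(txt, collapse = "\n"), sep = "\t", header = FALSE)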
How about stringi? It's easy to work with and very efficient. There's also a function, stri_read_lines(), for reading and splitting lines straight from a file.
library(stringi)
as.data.frame(stri_split_fixed(stri_split_lines1(str1), "\t", simplify = TRUE))
# V1 V2 V3 V4
# 1 L1 some unquoted stuff
# 2 L2 some "half" quoted stuff
# 3 L3 this "should work" ok thought
If you need evidence that this is more efficient than the read.*() approach, look at the timings below, where the same method is applied to a flattened string parsed into 300k lines. You can speed it up further by tweaking the arguments to as.data.frame() (see the sketch right after this paragraph). For this example, the stringi approach is roughly twice as fast as read.table().
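For example, a quick sketch of that as.data.frame() tweak (the only change is stringsAsFactors = FALSE, which skips the factor conversion):

mat <- stri_split_fixed(stri_split_lines1(str1), "\t", simplify = TRUE)
# Keep the columns as plain character vectors instead of factors
# (factor conversion was the as.data.frame() default before R 4.0).
out <- as.data.frame(mat, stringsAsFactors = FALSE)

The benchmark: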
str1 <- "L1\tsome\tunquoted\tstuff\nL2\tsome\t\"half\" quoted\tstuff\nL3\tthis\t\"should work\"\tok thought"
library(stringi)
library(microbenchmark)
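# write() defaults to file = "data", so the next line creates a file
# named "data" holding the 300k-line sample in the working directory: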
write(stri_flatten(rep(str1, 1e5), collapse = "\n"))
file.info("data")[1]
# size
# data 8400000
microbenchmark(
stringi = {
mat <- stri_split_fixed(stri_read_lines("data"), "\t", simplify = TRUE)
out <- as.data.frame(mat)
},
read.table = {
out2 <- read.table("data", sep = "\t", quote = "\n")
},
times = 3L,
unit = "relative"
)
# Unit: relative
# expr min lq mean median uq max neval cld
# stringi 1.000000 1.000000 1.000000 1.000000 1.00000 1.000000 3 a
# read.table 2.074071 2.111722 1.997857 2.148897 1.96356 1.808365 3 b
identical(out, out2)
# [1] TRUE
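One caveat with the stringi route: the split makes no attempt to guess column types, so everything comes back as character (or factor) columns. If your real file has numeric columns, a small follow-up sketch using type.convert() restores them:

# Convert each column to its natural type (integer, numeric, ...) where
# possible; as.character() guards against factor columns.
out[] <- lapply(out, function(col) type.convert(as.character(col), as.is = TRUE))
str(out)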