按空格拆分列
Split Column By Space
# Example input: a single character column where each row holds a label,
# a colon, and two numeric scores separated by spaces.
DATA <- data.frame(
  V1 = c(
    "SCORE : 9.931 5.092",
    "SCORE : 6.00007 15.1248",
    "SCORE : 1.0002 12.987532",
    "SCORE : 3.1 3.98532"
  )
)

# Desired result: the two scores from each row as separate numeric columns.
WANT <- data.frame(
  VAR1 = c(9.931, 6.00007, 1.0002, 3.1),
  VAR2 = c(5.092, 15.1248, 12.987532, 3.98532)
)
我有一份学生考试成绩数据,其录入格式如 'DATA' 所示。我希望将其拆分成 'WANT' 数据框所示的形式:第一个数字放入 'VAR1',第二个数字放入 'VAR2',并忽略空格。
我的尝试:
# Asker's original attempt, shown for reference. `:=` is data.table syntax, so
# this needs DATA to be a data.table; trimws() only strips the "SCORE : "
# prefix and does not separate the two numbers.
# The regex backslash must be doubled inside an R string ("\\s", not "\s").
DATA[, c("VAR1", "VAR2") := trimws(V1, whitespace = ".*:\\s+")]
得到的结果:
在 base R 中,我们可以先用 trimws 去掉前缀子串,再用 read.table 读取该列;
read.table 的默认分隔符 sep 为空白字符,因此会返回一个两列的 data.frame
# Drop everything up to and including the colon and the whitespace after it,
# then let read.table() split the remaining text into two columns.
# The regex backslash must be doubled inside an R string ("\\s", not "\s").
read.table(
  text = trimws(DATA$V1, whitespace = ".*:\\s+"),
  header = FALSE,
  col.names = c("VAR1", "VAR2")
)
VAR1 VAR2
1 9.93100 5.09200
2 6.00007 15.12480
3 1.00020 12.98753
4 3.10000 3.98532
或者可以使用 tidyr 中的 extract
library(tidyr)
# Capture the two numbers after the colon into VAR1 and VAR2; convert = TRUE
# asks extract() to convert the captured strings to a suitable type.
# The regex backslashes must be doubled inside an R string ("\\s", not "\s").
extract(
  DATA,
  V1,
  into = c("VAR1", "VAR2"),
  ".*:\\s+([0-9.]+)\\s+([0-9.]+)",
  convert = TRUE
)
VAR1 VAR2
1 9.93100 5.09200
2 6.00007 15.12480
3 1.00020 12.98753
4 3.10000 3.98532
如果想用 data.table,同样可以在去掉前缀子串后用 fread 读取
library(data.table)
# setDT() converts DATA to a data.table by reference; the prefix is stripped
# from V1 and the remaining text is parsed by fread() into two columns.
# The regex backslash must be doubled inside an R string ("\\s", not "\s").
fread(
  text = setDT(DATA)[, trimws(V1, whitespace = ".*:\\s+")],
  col.names = c("VAR1", "VAR2")
)
VAR1 VAR2
<num> <num>
1: 9.93100 5.09200
2: 6.00007 15.12480
3: 1.00020 12.98753
4: 3.10000 3.98532
或使用 fread
中的 select
选项
# Parse the raw strings directly and keep only fields 3 and 4 (the numbers
# that follow "SCORE" and ":"), renaming them on the way in.
fread(
  text = DATA[["V1"]],
  select = c(3, 4),
  col.names = c("VAR1", "VAR2")
)
VAR1 VAR2
<num> <num>
1: 9.93100 5.09200
2: 6.00007 15.12480
3: 1.00020 12.98753
4: 3.10000 3.98532
或者先读成四列,再取子集
# Read every field, then keep the third and fourth columns under new names.
fread(text = DATA[["V1"]])[, list(VAR1 = V3, VAR2 = V4)]
VAR1 VAR2
<num> <num>
1: 9.93100 5.09200
2: 6.00007 15.12480
3: 1.00020 12.98753
4: 3.10000 3.98532
或者可以使用tstrsplit
# Strip the "SCORE : " prefix, split the remainder on a single space, and add
# the two pieces to DATA by reference as VAR1 and VAR2.
# The regex backslash must be doubled inside an R string ("\\s", not "\s").
setDT(DATA)[, c("VAR1", "VAR2") := tstrsplit(
  trimws(V1, whitespace = ".*:\\s+"),
  " "
)]
# The split pieces are strings; type.convert() turns them into numbers while
# as.is = TRUE keeps V1 as character instead of making it a factor.
DATA <- type.convert(DATA, as.is = TRUE)
DATA
V1 VAR1 VAR2
<char> <num> <num>
1: SCORE : 9.931 5.092 9.93100 5.09200
2: SCORE : 6.00007 15.1248 6.00007 15.12480
3: SCORE : 1.0002 12.987532 1.00020 12.98753
4: SCORE : 3.1 3.98532 3.10000 3.98532
# Example input: a single character column where each row holds a label,
# a colon, and two numeric scores separated by spaces.
DATA <- data.frame(
  V1 = c(
    "SCORE : 9.931 5.092",
    "SCORE : 6.00007 15.1248",
    "SCORE : 1.0002 12.987532",
    "SCORE : 3.1 3.98532"
  )
)

# Desired result: the two scores from each row as separate numeric columns.
WANT <- data.frame(
  VAR1 = c(9.931, 6.00007, 1.0002, 3.1),
  VAR2 = c(5.092, 15.1248, 12.987532, 3.98532)
)
我有一份学生考试成绩数据,其录入格式如 'DATA' 所示。我希望将其拆分成 'WANT' 数据框所示的形式:第一个数字放入 'VAR1',第二个数字放入 'VAR2',并忽略空格。
我的尝试:
# Asker's original attempt, shown for reference. `:=` is data.table syntax, so
# this needs DATA to be a data.table; trimws() only strips the "SCORE : "
# prefix and does not separate the two numbers.
# The regex backslash must be doubled inside an R string ("\\s", not "\s").
DATA[, c("VAR1", "VAR2") := trimws(V1, whitespace = ".*:\\s+")]
得到的结果:
在 base R 中,我们可以先用 trimws 去掉前缀子串,再用 read.table 读取该列;
read.table 的默认分隔符 sep 为空白字符,因此会返回一个两列的 data.frame
# Drop everything up to and including the colon and the whitespace after it,
# then let read.table() split the remaining text into two columns.
# The regex backslash must be doubled inside an R string ("\\s", not "\s").
read.table(
  text = trimws(DATA$V1, whitespace = ".*:\\s+"),
  header = FALSE,
  col.names = c("VAR1", "VAR2")
)
VAR1 VAR2
1 9.93100 5.09200
2 6.00007 15.12480
3 1.00020 12.98753
4 3.10000 3.98532
或者可以使用 tidyr 中的 extract
library(tidyr)
# Capture the two numbers after the colon into VAR1 and VAR2; convert = TRUE
# asks extract() to convert the captured strings to a suitable type.
# The regex backslashes must be doubled inside an R string ("\\s", not "\s").
extract(
  DATA,
  V1,
  into = c("VAR1", "VAR2"),
  ".*:\\s+([0-9.]+)\\s+([0-9.]+)",
  convert = TRUE
)
VAR1 VAR2
1 9.93100 5.09200
2 6.00007 15.12480
3 1.00020 12.98753
4 3.10000 3.98532
如果想用 data.table,同样可以在去掉前缀子串后用 fread 读取
library(data.table)
# setDT() converts DATA to a data.table by reference; the prefix is stripped
# from V1 and the remaining text is parsed by fread() into two columns.
# The regex backslash must be doubled inside an R string ("\\s", not "\s").
fread(
  text = setDT(DATA)[, trimws(V1, whitespace = ".*:\\s+")],
  col.names = c("VAR1", "VAR2")
)
VAR1 VAR2
<num> <num>
1: 9.93100 5.09200
2: 6.00007 15.12480
3: 1.00020 12.98753
4: 3.10000 3.98532
或使用 fread 中的 select 选项
# Parse the raw strings directly and keep only fields 3 and 4 (the numbers
# that follow "SCORE" and ":"), renaming them on the way in.
fread(
  text = DATA[["V1"]],
  select = c(3, 4),
  col.names = c("VAR1", "VAR2")
)
VAR1 VAR2
<num> <num>
1: 9.93100 5.09200
2: 6.00007 15.12480
3: 1.00020 12.98753
4: 3.10000 3.98532
或者先读成四列,再取子集
# Read every field, then keep the third and fourth columns under new names.
fread(text = DATA[["V1"]])[, list(VAR1 = V3, VAR2 = V4)]
VAR1 VAR2
<num> <num>
1: 9.93100 5.09200
2: 6.00007 15.12480
3: 1.00020 12.98753
4: 3.10000 3.98532
或者可以使用tstrsplit
# Strip the "SCORE : " prefix, split the remainder on a single space, and add
# the two pieces to DATA by reference as VAR1 and VAR2.
# The regex backslash must be doubled inside an R string ("\\s", not "\s").
setDT(DATA)[, c("VAR1", "VAR2") := tstrsplit(
  trimws(V1, whitespace = ".*:\\s+"),
  " "
)]
# The split pieces are strings; type.convert() turns them into numbers while
# as.is = TRUE keeps V1 as character instead of making it a factor.
DATA <- type.convert(DATA, as.is = TRUE)
DATA
V1 VAR1 VAR2
<char> <num> <num>
1: SCORE : 9.931 5.092 9.93100 5.09200
2: SCORE : 6.00007 15.1248 6.00007 15.12480
3: SCORE : 1.0002 12.987532 1.00020 12.98753
4: SCORE : 3.1 3.98532 3.10000 3.98532