将行拆分为 R 中的列
Split rows to columns in R
我有一个这样的文本数据文件(多行对应类别 [A,B,C]):
A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14
当我把它加载到 R 中时,它变成了一个包含 6 个观测值(obs)和 1 个变量的数据框:
# Read the raw text file without a header row; every line becomes one row.
df <- read.delim(file = "file.text", header = FALSE)
v1
1 A=1,2,3,6,
2 7,9
3 10
4 B=3,4,5
5 C=5,7,8,10,11,
6 13,14
如何更改为这种格式?
A B C
1 A
2 A
3 A B
4 B
5 B C
6 A
7 A C
8 C
9 A
10 A C
11 C
13 C
14 C
谢谢!
基本方法是将数据作为字符串读取,按 `=` 和 `,` 进行拆分,然后找出识别每个数字属于哪个组的最佳方法。
在下面的方法中,我有意利用类型转换来创建分组。由于拆分后的值中含有字母,把它们从字符强制转换为数字时会得到 NA,因此预计会出现几条警告消息。
# data.table supplies fread(), the `[` grouping syntax and dcast().
library(data.table)

# Read the sample text so that each input line becomes one row of column V1.
x <- fread("A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14", sep = "\n", header = FALSE)

# Step 1: split every row on "=" or "," to get one token per row.
tokens <- x[
  , unlist(strsplit(V1, "=|,"), use.names = FALSE, recursive = FALSE),
  by = .I
]

# Step 2: a token that cannot be coerced to integer (a letter) starts a new
# group; the first token of each group is the column label for the numbers
# that follow it. The coercion is what triggers the expected warnings.
labelled <- tokens[
  , list(ind = as.integer(V1), col = rep(V1[1], .N)),
  by = cumsum(is.na(as.integer(V1)))
]

# Step 3: drop the NA rows produced by the letters and cast to wide format,
# one column per label, filling absent combinations with "".
labelled[, dcast(na.omit(.SD), ind ~ col, value.var = "col", fill = "")]
# ind A B C
# 1: 1 A
# 2: 2 A
# 3: 3 A B
# 4: 4 B
# 5: 5 B C
# 6: 6 A
# 7: 7 A C
# 8: 8 C
# 9: 9 A
# 10: 10 A C
# 11: 11 C
# 12: 13 C
# 13: 14 C
这是另一种替代方法,使用我的 "splitstackshape" 包中的 cSplit。"x" 与上面使用 fread 读取的数据相同。
library(splitstackshape)

# Collapse the lines of each category into a single string; a line containing
# a capital letter starts a new category.
collapsed <- x[, toString(V1), by = cumsum(grepl("[A-Z]", V1))]

# Split "label=numbers" into two columns (V1_1 and V1_2) ...
by_label <- cSplit(collapsed, "V1", "=")

# ... then spread the comma-separated numbers over rows (long format).
long_form <- cSplit(by_label, "V1_2", ",", "long")

# Cast to wide format: one row per number, one column per label.
long_form[, dcast(.SD, V1_2 ~ V1_1, value.var = "V1_1", fill = "")]
使用 readLines() 的基础 R 方法。我们首先把以非数字字符开头的行的索引存储在向量 ch 中,这有助于找到需要用 paste 拼接在一起、形成 l2 的行序列。将 l2 拆分并用 as.numeric 转换后得到 l3,它已经给出了最终数据框 d 的行索引。我们根据 l3 的维度构造一个全为 NA 的矩阵 d,并根据 l3 的值填充它。
# Base R approach: rebuild one string per category, extract its numbers and
# use them as row indices into a character matrix.

# Raw lines: a category starts with "<letter>=", continuation lines start
# with a digit.
l <- readLines("delim.txt")

# Indices of the lines that start with a non-digit, i.e. where a category
# begins. Backslashes must be doubled inside an R string literal.
ch <- grep("^\\D", l)

# Last line of each category: the line before the next header, or the final
# line of the file for the last category.
ends <- c(ch[-1] - 1L, length(l))

# Paste every category's header and continuation lines into one string.
l2 <- vapply(
  seq_along(ch),
  function(i) paste(l[ch[i]:ends[i]], collapse = " "),
  character(1)
)

# Split on every non-digit and keep only the numbers. Empty tokens become NA
# and are dropped. lapply() keeps this a list even when all categories happen
# to have the same number of tokens.
l3 <- lapply(strsplit(l2, "\\D"), function(tokens) {
  values <- as.numeric(tokens)
  values[!is.na(values)]
})

# One row per possible number (1..max), one column per category.
d <- matrix(NA_character_, max(unlist(l3)), length(l3))

# The column names are whatever precedes the "=" sign.
let <- gsub("(.*)=.*", "\\1", l2)

# Mark, for each category, the rows named by its numbers.
for (n in seq_along(l3)) {
  d[l3[[n]], n] <- let[n]
}

setNames(as.data.frame(d), let)
# A B C
# 1 A <NA> <NA>
# 2 A <NA> <NA>
# 3 A B <NA>
# 4 <NA> B <NA>
# 5 <NA> B C
# 6 A <NA> <NA>
# 7 A <NA> C
# 8 <NA> <NA> C
# 9 A <NA> <NA>
# 10 A <NA> C
# 11 <NA> <NA> C
# 12 <NA> <NA> <NA>
# 13 <NA> <NA> C
# 14 <NA> <NA> C
这是一个 tidyverse 方法。
library(dplyr)
library(tidyr)

# Stage 1: reshape the raw lines into one (let, num) pair per row.
long_form <- df %>%
  # Strip the trailing comma left on lines that continue on the next row.
  mutate(V1 = sub(",$", "", V1)) %>%
  # Split at "="; lines without "=" get NA in `let` (fill = "left").
  separate(V1, into = c("let", "num"), sep = "=", fill = "left") %>%
  # Carry the last seen category letter down onto its continuation lines.
  fill(let) %>%
  # One row per number; convert = TRUE turns `num` into integers.
  separate_rows(num, convert = TRUE)

# Stage 2: one column per category letter, "" where a number is absent,
# ordered by number.
long_form %>%
  pivot_wider(
    names_from = let,
    values_from = let,
    values_fill = list(let = "")
  ) %>%
  arrange(num)
# # A tibble: 13 x 4
# num A B C
# <int> <chr> <chr> <chr>
# 1 1 A "" ""
# 2 2 A "" ""
# 3 3 A B ""
# 4 4 "" B ""
# 5 5 "" B C
# 6 6 A "" ""
# 7 7 A "" C
# 8 8 "" "" C
# 9 9 A "" ""
#10 10 A "" C
#11 11 "" "" C
#12 13 "" "" C
#13 14 "" "" C
如果您希望把 num 列作为行名,请在末尾添加 `%>% column_to_rownames('num')`。
数据
# Sample input kept as one multi-line string, exactly as in the question.
raw_text <- "A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14"

# Parse it into a one-column data frame (V1), one row per input line.
df <- read.table(text = raw_text, header = FALSE)
在 base R 中,你可以用 paste 把各行合并成一行,使用 strsplit 获取各个数字和列名,创建一个带名称的 matrix,并通过下标赋值来填充它。
# Base R approach: join everything into one string, split it back into
# per-category number lists and fill a named character matrix.

# Read the sample text line by line. The connection is closed explicitly so
# it is not left open.
con <- textConnection("A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14")
x <- readLines(con)
close(con)

# Drop the "=" signs and join all lines with commas, so every category letter
# is directly followed by its numbers.
x <- paste(gsub("=", "", x, fixed = TRUE), collapse = ",")

# Splitting on letters yields one chunk of numbers per category. The first
# piece (before the first letter) is empty and is dropped, as are the empty
# tokens left by doubled or trailing commas.
y <- lapply(
  strsplit(strsplit(x, "[[:alpha:]]+")[[1]][-1], ","),
  function(i) i[nzchar(i)]
)

# Splitting on everything that is not a letter yields the category names.
names(y) <- strsplit(x, "[^[:alpha:]]+")[[1]]

# Row names: every distinct number, in ascending numeric order.
z <- sort(unique(as.numeric(unlist(y))))
res <- matrix("", nrow = length(z), ncol = length(y),
              dimnames = list(z, names(y)))

# The numbers are still character strings, so they index rows by name.
for (i in names(y)) {
  res[y[[i]], i] <- i
}
res
# A B C
#1 "A" "" ""
#2 "A" "" ""
#3 "A" "B" ""
#4 "" "B" ""
#5 "" "B" "C"
#6 "A" "" ""
#7 "A" "" "C"
#8 "" "" "C"
#9 "A" "" ""
#10 "A" "" "C"
#11 "" "" "C"
#13 "" "" "C"
#14 "" "" "C"
我有一个这样的文本数据文件(多行对应类别 [A,B,C]):
A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14
当我把它加载到 R 中时,它变成了一个包含 6 个观测值(obs)和 1 个变量的数据框:
# Read the raw text file without a header row; every line becomes one row.
df <- read.delim(file = "file.text", header = FALSE)
v1
1 A=1,2,3,6,
2 7,9
3 10
4 B=3,4,5
5 C=5,7,8,10,11,
6 13,14
如何更改为这种格式?
A B C
1 A
2 A
3 A B
4 B
5 B C
6 A
7 A C
8 C
9 A
10 A C
11 C
13 C
14 C
谢谢!
基本方法是将数据作为字符串读取,按 `=` 和 `,` 进行拆分,然后找出识别每个数字属于哪个组的最佳方法。
在下面的方法中,我有意利用类型转换来创建分组。由于拆分后的值中含有字母,把它们从字符强制转换为数字时会得到 NA,因此预计会出现几条警告消息。
# data.table supplies fread(), the `[` grouping syntax and dcast().
library(data.table)

# Read the sample text so that each input line becomes one row of column V1.
x <- fread("A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14", sep = "\n", header = FALSE)

# Step 1: split every row on "=" or "," to get one token per row.
tokens <- x[
  , unlist(strsplit(V1, "=|,"), use.names = FALSE, recursive = FALSE),
  by = .I
]

# Step 2: a token that cannot be coerced to integer (a letter) starts a new
# group; the first token of each group is the column label for the numbers
# that follow it. The coercion is what triggers the expected warnings.
labelled <- tokens[
  , list(ind = as.integer(V1), col = rep(V1[1], .N)),
  by = cumsum(is.na(as.integer(V1)))
]

# Step 3: drop the NA rows produced by the letters and cast to wide format,
# one column per label, filling absent combinations with "".
labelled[, dcast(na.omit(.SD), ind ~ col, value.var = "col", fill = "")]
# ind A B C
# 1: 1 A
# 2: 2 A
# 3: 3 A B
# 4: 4 B
# 5: 5 B C
# 6: 6 A
# 7: 7 A C
# 8: 8 C
# 9: 9 A
# 10: 10 A C
# 11: 11 C
# 12: 13 C
# 13: 14 C
这是另一种替代方法,使用我的 "splitstackshape" 包中的 cSplit。"x" 与上面使用 fread 读取的数据相同。
library(splitstackshape)

# Collapse the lines of each category into a single string; a line containing
# a capital letter starts a new category.
collapsed <- x[, toString(V1), by = cumsum(grepl("[A-Z]", V1))]

# Split "label=numbers" into two columns (V1_1 and V1_2) ...
by_label <- cSplit(collapsed, "V1", "=")

# ... then spread the comma-separated numbers over rows (long format).
long_form <- cSplit(by_label, "V1_2", ",", "long")

# Cast to wide format: one row per number, one column per label.
long_form[, dcast(.SD, V1_2 ~ V1_1, value.var = "V1_1", fill = "")]
使用 readLines() 的基础 R 方法。我们首先把以非数字字符开头的行的索引存储在向量 ch 中,这有助于找到需要用 paste 拼接在一起、形成 l2 的行序列。将 l2 拆分并用 as.numeric 转换后得到 l3,它已经给出了最终数据框 d 的行索引。我们根据 l3 的维度构造一个全为 NA 的矩阵 d,并根据 l3 的值填充它。
# Base R approach: rebuild one string per category, extract its numbers and
# use them as row indices into a character matrix.

# Raw lines: a category starts with "<letter>=", continuation lines start
# with a digit.
l <- readLines("delim.txt")

# Indices of the lines that start with a non-digit, i.e. where a category
# begins. Backslashes must be doubled inside an R string literal.
ch <- grep("^\\D", l)

# Last line of each category: the line before the next header, or the final
# line of the file for the last category.
ends <- c(ch[-1] - 1L, length(l))

# Paste every category's header and continuation lines into one string.
l2 <- vapply(
  seq_along(ch),
  function(i) paste(l[ch[i]:ends[i]], collapse = " "),
  character(1)
)

# Split on every non-digit and keep only the numbers. Empty tokens become NA
# and are dropped. lapply() keeps this a list even when all categories happen
# to have the same number of tokens.
l3 <- lapply(strsplit(l2, "\\D"), function(tokens) {
  values <- as.numeric(tokens)
  values[!is.na(values)]
})

# One row per possible number (1..max), one column per category.
d <- matrix(NA_character_, max(unlist(l3)), length(l3))

# The column names are whatever precedes the "=" sign.
let <- gsub("(.*)=.*", "\\1", l2)

# Mark, for each category, the rows named by its numbers.
for (n in seq_along(l3)) {
  d[l3[[n]], n] <- let[n]
}

setNames(as.data.frame(d), let)
# A B C
# 1 A <NA> <NA>
# 2 A <NA> <NA>
# 3 A B <NA>
# 4 <NA> B <NA>
# 5 <NA> B C
# 6 A <NA> <NA>
# 7 A <NA> C
# 8 <NA> <NA> C
# 9 A <NA> <NA>
# 10 A <NA> C
# 11 <NA> <NA> C
# 12 <NA> <NA> <NA>
# 13 <NA> <NA> C
# 14 <NA> <NA> C
这是一个 tidyverse 方法。
library(dplyr)
library(tidyr)

# Stage 1: reshape the raw lines into one (let, num) pair per row.
long_form <- df %>%
  # Strip the trailing comma left on lines that continue on the next row.
  mutate(V1 = sub(",$", "", V1)) %>%
  # Split at "="; lines without "=" get NA in `let` (fill = "left").
  separate(V1, into = c("let", "num"), sep = "=", fill = "left") %>%
  # Carry the last seen category letter down onto its continuation lines.
  fill(let) %>%
  # One row per number; convert = TRUE turns `num` into integers.
  separate_rows(num, convert = TRUE)

# Stage 2: one column per category letter, "" where a number is absent,
# ordered by number.
long_form %>%
  pivot_wider(
    names_from = let,
    values_from = let,
    values_fill = list(let = "")
  ) %>%
  arrange(num)
# # A tibble: 13 x 4
# num A B C
# <int> <chr> <chr> <chr>
# 1 1 A "" ""
# 2 2 A "" ""
# 3 3 A B ""
# 4 4 "" B ""
# 5 5 "" B C
# 6 6 A "" ""
# 7 7 A "" C
# 8 8 "" "" C
# 9 9 A "" ""
#10 10 A "" C
#11 11 "" "" C
#12 13 "" "" C
#13 14 "" "" C
如果您希望把 num 列作为行名,请在末尾添加 `%>% column_to_rownames('num')`。
数据
# Sample input kept as one multi-line string, exactly as in the question.
raw_text <- "A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14"

# Parse it into a one-column data frame (V1), one row per input line.
df <- read.table(text = raw_text, header = FALSE)
在 base R 中,你可以用 paste 把各行合并成一行,使用 strsplit 获取各个数字和列名,创建一个带名称的 matrix,并通过下标赋值来填充它。
# Base R approach: join everything into one string, split it back into
# per-category number lists and fill a named character matrix.

# Read the sample text line by line. The connection is closed explicitly so
# it is not left open.
con <- textConnection("A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14")
x <- readLines(con)
close(con)

# Drop the "=" signs and join all lines with commas, so every category letter
# is directly followed by its numbers.
x <- paste(gsub("=", "", x, fixed = TRUE), collapse = ",")

# Splitting on letters yields one chunk of numbers per category. The first
# piece (before the first letter) is empty and is dropped, as are the empty
# tokens left by doubled or trailing commas.
y <- lapply(
  strsplit(strsplit(x, "[[:alpha:]]+")[[1]][-1], ","),
  function(i) i[nzchar(i)]
)

# Splitting on everything that is not a letter yields the category names.
names(y) <- strsplit(x, "[^[:alpha:]]+")[[1]]

# Row names: every distinct number, in ascending numeric order.
z <- sort(unique(as.numeric(unlist(y))))
res <- matrix("", nrow = length(z), ncol = length(y),
              dimnames = list(z, names(y)))

# The numbers are still character strings, so they index rows by name.
for (i in names(y)) {
  res[y[[i]], i] <- i
}
res
res
# A B C
#1 "A" "" ""
#2 "A" "" ""
#3 "A" "B" ""
#4 "" "B" ""
#5 "" "B" "C"
#6 "A" "" ""
#7 "A" "" "C"
#8 "" "" "C"
#9 "A" "" ""
#10 "A" "" "C"
#11 "" "" "C"
#13 "" "" "C"
#14 "" "" "C"