将行拆分为 R 中的列

Split rows to columns in R

我有一个这样的文本数据文件(多行对应类别 [A,B,C]):

A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14

当我把它加载到 R 中时,它变成了一个包含 6 个观测值(obs)和 1 个变量的数据框:

# Load the raw file without a header row, so every physical line becomes one
# row of the single column V1.
df <- read.delim(file = "file.text", header = FALSE)

    v1
1   A=1,2,3,6,
2   7,9
3   10
4   B=3,4,5
5   C=5,7,8,10,11,
6   13,14

如何更改为这种格式?

   A  B  C
1  A
2  A
3  A  B
4     B
5     B  C
6  A
7  A     C
8        C
9  A
10 A     C
11       C
13       C
14       C

谢谢!

基本方法是将数据作为字符串读取,按 = 和 , 进行拆分,然后找出判断每个数字属于哪个组的最佳方法。

在下面的方法中,我有意使用类型转换来创建组。由于这将涉及从字符到数字的强制转换,因为拆分值中有字符,所以预计会有几条警告消息。

# Load the data.table package.
library(data.table)

# Read the sample so that each physical line is one row of column V1.
x <- fread("A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14", sep = "\n", header = FALSE)

# Step 1: split every row on "=" or "," and stack the tokens, one per row.
tokens <- x[
  , unlist(strsplit(V1, "=|,"), use.names = FALSE, recursive = FALSE),
  by = .I
]

# Step 2: non-numeric tokens (the group letters) turn into NA under
# as.integer() -- this is the deliberate coercion that raises warnings --
# so the running count of NAs identifies the group each token belongs to.
# The first token of each group is its letter, repeated across the group.
labelled <- tokens[
  , list(ind = as.integer(V1), col = rep(V1[1], .N)),
  by = cumsum(is.na(as.integer(V1)))
]

# Step 3: drop the rows holding the letters themselves (ind is NA) and spread
# to one column per group, leaving absent combinations as empty strings.
labelled[, dcast(na.omit(.SD), ind ~ col, value.var = "col", fill = "")]
#     ind A B C
#  1:   1 A    
#  2:   2 A    
#  3:   3 A B  
#  4:   4   B  
#  5:   5   B C
#  6:   6 A    
#  7:   7 A   C
#  8:   8     C
#  9:   9 A    
# 10:  10 A   C
# 11:  11     C
# 12:  13     C
# 13:  14     C

这是另一个替代方法,使用我的 "splitstackshape" 包中的 cSplit。这里的 "x" 与上面用 fread 读取的数据相同。

library(splitstackshape)
# Collapse the lines of each group into one string; a new group starts on
# every line that contains a capital letter.
grouped <- x[, toString(V1), by = cumsum(grepl("[A-Z]", V1))]

# Separate the group letter (V1_1) from its value list (V1_2).
letter_and_values <- cSplit(grouped, "V1", "=")

# One row per individual value.
long_values <- cSplit(letter_and_values, "V1_2", ",", "long")

# Spread to one column per group letter, empty string where absent.
long_values[, dcast(.SD, V1_2 ~ V1_1, value.var = "V1_1", fill = "")]

使用 readLines() 的基础 R 方法。我们首先把以非数字字符开头的行的索引存储在向量 ch 中,这有助于找出需要 paste 到一起的行序列,得到 l2。把 l2 拆分并用 as.numeric 转换后得到 l3,它已经给出了最终数据框 d 的行索引。我们根据 l3 的维度构造一个全为 NA 的矩阵 d,并根据 l3 的值填充它。

# Read the raw lines. A group starts on a line of the form "<name>=..." and
# its comma-separated values may continue on the following lines.
l <- readLines("delim.txt")

# Indices of the lines that open a group (first character is not a digit).
# Backslashes must be doubled inside R string literals ("\\D", not "\D").
ch <- grep("^\\D", l)
stopifnot(length(ch) > 0L)

# Last line index of every group: the line before the next header, or the
# final line of the file for the last group.
ends <- c(ch[-1] - 1, length(l))

# Paste the lines of each group into one string, e.g. "A=1,2,3,6, 7,9 10".
l2 <- vapply(
  seq_along(ch),
  function(k) paste(l[ch[k]:ends[k]], collapse = " "),
  character(1)
)

# Numeric values of each group. Splitting on runs of non-digits and dropping
# empty tokens avoids NA coercion; lapply() always returns a list, whereas
# sapply() would collapse to a matrix when all groups have equal length.
l3 <- lapply(
  strsplit(l2, "\\D+"),
  function(tok) as.numeric(tok[nzchar(tok)])
)

# One row per possible value (1..max) and one column per group.
d <- matrix(NA_character_, max(unlist(l3)), length(l3))
let <- sub("^(.*)=.*$", "\\1", l2)  # the col-names

# Mark the rows listed for each group with that group's name.
for (n in seq_along(l3)) {
  d[l3[[n]], n] <- let[n]
}
setNames(as.data.frame(d), let)
#      A    B    C
# 1     A <NA> <NA>
# 2     A <NA> <NA>
# 3     A    B <NA>
# 4  <NA>    B <NA>
# 5  <NA>    B    C
# 6     A <NA> <NA>
# 7     A <NA>    C
# 8  <NA> <NA>    C
# 9     A <NA> <NA>
# 10    A <NA>    C
# 11 <NA> <NA>    C
# 12 <NA> <NA> <NA>
# 13 <NA> <NA>    C
# 14 <NA> <NA>    C

这是一个tidyverse方法。

library(dplyr)
library(tidyr)

# Stage 1: tidy the raw lines into one (let, num) pair per row.
long_form <- df %>%
  # strip the dangling comma left at the end of continued lines
  mutate(V1 = sub(",$", "", V1)) %>%
  # "<letter>=<values>" -> let / num; lines without "=" get let = NA
  separate(V1, into = c("let", "num"), sep = "=", fill = "left") %>%
  # carry each group letter down onto its continuation lines
  fill(let) %>%
  # one row per individual number, converted to integer
  separate_rows(num, convert = TRUE)

# Stage 2: one column per group letter, "" where a number is not in the
# group, ordered by the number itself.
long_form %>%
  pivot_wider(names_from = let, values_from = let, values_fill = list(let = "")) %>%
  arrange(num)

#  # A tibble: 13 x 4
#     num A     B     C    
#   <int> <chr> <chr> <chr>
# 1     1 A     ""    ""   
# 2     2 A     ""    ""   
# 3     3 A     B     ""   
# 4     4 ""    B     ""   
# 5     5 ""    B     C    
# 6     6 A     ""    ""   
# 7     7 A     ""    C    
# 8     8 ""    ""    C    
# 9     9 A     ""    ""   
#10    10 A     ""    C    
#11    11 ""    ""    C    
#12    13 ""    ""    C    
#13    14 ""    ""    C  

如果您希望 num 列作为行名,请添加 %>% column_to_rownames('num').

数据

# Sample input kept as a literal; every physical line holds a single
# whitespace-free field, so read.table() returns one column named V1.
sample_text <- "A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14"
df <- read.table(text = sample_text)

在 base R 中,你可以把所有行 paste 成一行,使用 strsplit 取出各个数字和列名,创建一个带行列名的 matrix,然后通过下标赋值来填充它。

# Read the sample lines. textConnection() returns an already-open connection
# that readLines() does not destroy, so it is closed explicitly afterwards.
con <- textConnection("A=1,2,3,6,
7,9
10
B=3,4,5
C=5,7,8,10,11,
13,14")
x <- readLines(con = con)
close(con)

# Join everything into a single comma-separated string with the "=" removed,
# e.g. "A1,2,3,6,,7,9,10,B3,4,5,...".
x <- paste(gsub("=", "", x, fixed = TRUE), collapse = ",")

# Values per group: split at the group names, then at commas, dropping the
# empty tokens produced by trailing or doubled commas.
y <- lapply(
  strsplit(strsplit(x, "[[:alpha:]]+")[[1]][-1], ","),
  function(i) i[nzchar(i)]
)
# Group names are the alphabetic runs between the number lists.
names(y) <- strsplit(x, "[^[:alpha:]]+")[[1]]

# Distinct numbers in ascending order become the row names.
z <- sort(unique(as.numeric(unlist(y))))
res <- matrix("", nrow = length(z), ncol = length(y),
              dimnames = list(z, names(y)))

# Mark each group's numbers in its own column. Rows are looked up by name via
# the same numeric -> character conversion used for the dimnames, so a token
# such as "07" still finds row "7".
for (grp in names(y)) {
  res[as.character(as.numeric(y[[grp]])), grp] <- grp
}
res
#   A   B   C  
#1  "A" ""  "" 
#2  "A" ""  "" 
#3  "A" "B" "" 
#4  ""  "B" "" 
#5  ""  "B" "C"
#6  "A" ""  "" 
#7  "A" ""  "C"
#8  ""  ""  "C"
#9  "A" ""  "" 
#10 "A" ""  "C"
#11 ""  ""  "C"
#13 ""  ""  "C"
#14 ""  ""  "C"