解析混乱的数据框以重塑数据

Parsing messy dataframes to reshaped data

我有一个看起来像这样的 df:

# Raw course listing, one catalogue line per element. Blank ("") and
# whitespace-only (" ") entries are kept on purpose: they are part of the
# messy input that the parsing code has to cope with.
. <- c(
  "AXX 101",
  "",
  "Introduction to AXX",
  " ",
  "Prereq: BXX102, BXX101, not open to CXX program",
  "Antireq: BXX103",
  "",
  "AXX 102",
  "AXX Part II",
  "",
  "Antireq: BXX101",
  "",
  " "
)
# The column name is taken from the argument expression, so it is ".".
df <- data.frame(.)
df
                                                 .
1                                          AXX 101
2                                                 
3                              Introduction to AXX
4                                                 
5  Prereq: BXX102, BXX101, not open to CXX program
6                                  Antireq: BXX103
7                                                 
8                                          AXX 102
9                                      AXX Part II
10                                                
11                                 Antireq: BXX101
12                                                
13                                                

我想把这个糟糕透顶的数据框解析成下面这样的形式:

title    prereq                                   antireq 
AXX101   BXX102, BXX101, not open to CXX program  BXX103
AXX102                                            BXX101 

一种做法是(这里我用 V1 代替 . 作为列名):

# Build one output row per course from the single text column df$V1.
#
# Rows are grouped by a running count of course-code lines ("AXX <digits>"),
# so every group starts at a course code and runs up to the next one.
# Within a group only the course-code, "Prereq" and "Antireq" lines are kept;
# a missing Prereq/Antireq line yields NA in the corresponding column.
#
# Note: the backslash in "\\d" must be doubled inside an R string literal
# (a bare "\d" is a parse error). The grouping pattern is anchored with "^"
# so that a course code mentioned inside another line (for example
# "Prereq: AXX 101") does not start a new group.
res <- do.call(
  rbind,
  lapply(split(df, cumsum(grepl("^AXX \\d+", df$V1))), function(x) {
    # Lines of interest for this course.
    x1 <- x$V1[grep("^(AXX \\d+|Prereq|Antireq)", x$V1)]
    # Leading keyword of each line: "AXX", "Prereq" or "Antireq".
    x2 <- sub(":? .*", "", x1)
    # Reorder to title/prereq/antireq and strip the "<keyword>: " prefix;
    # the title line has no ": " and is therefore left unchanged.
    x3 <- sub(
      ".*: ", "",
      x1[match(c("AXX", "Prereq", "Antireq"), x2)],
      perl = TRUE
    )
    data.frame(title = x3[1], prereq = x3[2], antireq = x3[3])
  })
)
res 
#    title                                  prereq antireq
#1 AXX 101 BXX102, BXX101, not open to CXX program  BXX103
#2 AXX 102                                    <NA>  BXX101

也许您可以尝试以下操作:

library(splitstackshape)
library(dplyr)
library(zoo)
library(tidyr)

# Split each V1 entry at ":" (the pieces are referred to below as V1_1 and
# V1_2), tag every row with its course code, then turn the Prereq/Antireq
# rows into one column each.
cSplit(df, "V1", ":") %>%
  # V2 is the entry itself when it ends in a digit (e.g. "AXX 101"), else NA.
  .[, V2 := ifelse(grepl("[0-9]$", V1_1), as.character(V1_1), NA)] %>%
  # Fill each NA with the most recent non-NA V2 value above it.
  # NOTE(review): na.locf() drops leading NAs by default; this presumably
  # relies on the first row being a course code -- confirm for other inputs.
  .[, V2 := na.locf(V2)] %>%
  # Keep only the rows labelled Prereq or Antireq.
  .[V1_1 %in% c("Prereq", "Antireq")] %>%
  # One row per V2, with the V1_1 labels as columns and V1_2 as values.
  # NOTE(review): tidyr::spread() is superseded by pivot_wider().
  spread(V1_1, V1_2)
#         V2 Antireq                                  Prereq
# 1: AXX 101  BXX103 BXX102, BXX101, not open to CXX program
# 2: AXX 102  BXX101                                      NA

在每个 AXX <number> 行前加上 \nTitle:,再选出含有冒号的行,然后用 read.dcf 读取结果。如果列名首字母大写也可以接受,则可以省略标记为 ## 的行。不需要任何额外的软件包:

# Reshape the single text column of df into a title/prereq/antireq matrix
# by rewriting the lines as "Field: value" records and parsing them with
# read.dcf(). Base R only.
s <- as.character(df[[1]])

# Course-code lines become "Title: <code>" fields. The leading "\n" inserts
# a blank line before each one, which starts a new record.
# The backslash in "\\d" must be doubled inside an R string literal, and the
# "^" anchor keeps lines that merely mention a course code (for example
# "Prereq: AXX 101") from being treated as titles.
ix <- grep("^AXX \\d", s)
s[ix] <- paste("\nTitle:", s[ix])
# Keep only the lines that contain a colon.
s <- grep(":", s, value = TRUE)

# Parse the records; the connection is closed explicitly so it is not leaked.
con <- textConnection(s)
out <- read.dcf(con)
close(con)
colnames(out) <- tolower(colnames(out)) ##

结果为:

> out
     title     prereq                                    antireq 
[1,] "AXX 101" "BXX102, BXX101, not open to CXX program" "BXX103"
[2,] "AXX 102" NA                                        "BXX101" 

更新: 一些简化。