解析混乱的数据帧以重塑数据
Parsing messy dataframes to reshaped data
我有一个看起来像这样的 df:
# Sample input: one character column in which each course record is a run of
# lines (course code, description, optional "Prereq:" / "Antireq:" lines)
# interleaved with empty or whitespace-only rows.
. <- c(
  "AXX 101",
  "",
  "Introduction to AXX",
  " ",
  "Prereq: BXX102, BXX101, not open to CXX program",
  "Antireq: BXX103",
  "",
  "AXX 102",
  "AXX Part II",
  "",
  "Antireq: BXX101",
  "",
  " "
)
# The column takes its name from the variable, so it is called ".".
df <- data.frame(.)
df
.
1 AXX 101
2
3 Introduction to AXX
4
5 Prereq: BXX102, BXX101, not open to CXX program
6 Antireq: BXX103
7
8 AXX 102
9 AXX Part II
10
11 Antireq: BXX101
12
13
我想把这个糟糕透顶的数据框解析成如下形式:
title prereq antireq
AXX101 BXX102, BXX101, not open to CXX program BXX103
AXX102 BXX101
一种做法如下(这里我把列名 . 换成了 V1):
# Parse the one-column data frame `df` (column `V1`) into one row per course.
# A new record starts at every line that begins with "AXX <number>"; within a
# record the "Prereq:" and "Antireq:" lines supply the remaining fields.
# Fields that a record does not contain come out as NA.
course_lines <- as.character(df[["V1"]])
# Backslashes must be doubled inside R string literals: a single backslash
# before "d" is an unrecognized escape and the script would not even parse.
# The pattern is anchored so that a course code mentioned inside a
# Prereq/Antireq line cannot start a spurious record.
record_id <- cumsum(grepl("^AXX \\d+", course_lines))
res <- do.call(rbind,
  lapply(split(course_lines, record_id), function(x) {
    # Keep only the lines that carry a field of interest.
    x1 <- x[grep("^(AXX \\d+|Prereq|Antireq)", x)]
    # Field key = text before the first space, minus an optional colon.
    x2 <- sub(":? .*", "", x1)
    # match() aligns the lines to the fixed field order (NA when missing).
    # Only the leading "Key: " label is stripped, so a value that itself
    # contains ": " is kept intact.
    x3 <- sub("^[^:]*: ", "", x1[match(c("AXX", "Prereq", "Antireq"), x2)])
    data.frame(title = x3[1], prereq = x3[2], antireq = x3[3])
  }))
res
# title prereq antireq
#1 AXX 101 BXX102, BXX101, not open to CXX program BXX103
#2 AXX 102 <NA> BXX101
也许您可以尝试以下操作:
library(splitstackshape)
library(dplyr)
library(zoo)
library(tidyr)
# Split V1 at ":"; the later steps use the resulting columns V1_1 (the part
# before the colon) and V1_2 (the part after it).
cSplit(df, "V1", ":") %>%
# Rows whose first part ends in a digit are treated as course-code lines:
# copy that text into a new column V2 and leave NA everywhere else.
.[, V2 := ifelse(grepl("[0-9]$", V1_1), as.character(V1_1), NA)] %>%
# Carry the last non-NA course code forward so every row is tagged with the
# course it belongs to.
.[, V2 := na.locf(V2)] %>%
# Keep only the Prereq / Antireq rows.
.[V1_1 %in% c("Prereq", "Antireq")] %>%
# Go wide: one column per label (V1_1), filled with its value (V1_2).
# NOTE(review): spread() is superseded by pivot_wider() in current tidyr.
spread(V1_1, V1_2)
# V2 Antireq Prereq
# 1: AXX 101 BXX103 BXX102, BXX101, not open to CXX program
# 2: AXX 102 BXX101 NA
在每个 AXX <number> 行前加上 \nTitle: ,只保留含有冒号的行,然后用 read.dcf 读取结果。如果列名首字母大写也可以接受,则可以省略标记为 ## 的行。不需要任何额外的软件包:
# Base-R approach: rewrite the lines as "Key: value" records and let
# read.dcf() do the parsing (one matrix row per record, one column per key).
s <- as.character(df[[1]])
# Backslashes must be doubled inside R string literals: a single backslash
# before "d" is an unrecognized escape and the script would not parse.
# Anchored so that only lines starting with a course code open a record.
ix <- grep("^AXX \\d", s)
# The embedded newline puts a blank line before each "Title:" field; blank
# lines are what separate records for read.dcf().
s[ix] <- paste("\nTitle:", s[ix])
# Drop everything that is not a "Key: value" line.
s <- grep(":", s, value = TRUE)
# Close the text connection explicitly instead of leaking it.
con <- textConnection(s)
out <- read.dcf(con)
close(con)
colnames(out) <- tolower(colnames(out)) ##
结果如下:
> out
title prereq antireq
[1,] "AXX 101" "BXX102, BXX101, not open to CXX program" "BXX103"
[2,] "AXX 102" NA "BXX101"
更新: 一些简化。
我有一个看起来像这样的 df:
# Sample input: one character column in which each course record is a run of
# lines (course code, description, optional "Prereq:" / "Antireq:" lines)
# interleaved with empty or whitespace-only rows.
. <- c(
  "AXX 101",
  "",
  "Introduction to AXX",
  " ",
  "Prereq: BXX102, BXX101, not open to CXX program",
  "Antireq: BXX103",
  "",
  "AXX 102",
  "AXX Part II",
  "",
  "Antireq: BXX101",
  "",
  " "
)
# The column takes its name from the variable, so it is called ".".
df <- data.frame(.)
df
.
1 AXX 101
2
3 Introduction to AXX
4
5 Prereq: BXX102, BXX101, not open to CXX program
6 Antireq: BXX103
7
8 AXX 102
9 AXX Part II
10
11 Antireq: BXX101
12
13
我想把这个糟糕透顶的数据框解析成如下形式:
title prereq antireq
AXX101 BXX102, BXX101, not open to CXX program BXX103
AXX102 BXX101
一种做法如下(这里我把列名 . 换成了 V1):
# Parse the one-column data frame `df` (column `V1`) into one row per course.
# A new record starts at every line that begins with "AXX <number>"; within a
# record the "Prereq:" and "Antireq:" lines supply the remaining fields.
# Fields that a record does not contain come out as NA.
course_lines <- as.character(df[["V1"]])
# Backslashes must be doubled inside R string literals: a single backslash
# before "d" is an unrecognized escape and the script would not even parse.
# The pattern is anchored so that a course code mentioned inside a
# Prereq/Antireq line cannot start a spurious record.
record_id <- cumsum(grepl("^AXX \\d+", course_lines))
res <- do.call(rbind,
  lapply(split(course_lines, record_id), function(x) {
    # Keep only the lines that carry a field of interest.
    x1 <- x[grep("^(AXX \\d+|Prereq|Antireq)", x)]
    # Field key = text before the first space, minus an optional colon.
    x2 <- sub(":? .*", "", x1)
    # match() aligns the lines to the fixed field order (NA when missing).
    # Only the leading "Key: " label is stripped, so a value that itself
    # contains ": " is kept intact.
    x3 <- sub("^[^:]*: ", "", x1[match(c("AXX", "Prereq", "Antireq"), x2)])
    data.frame(title = x3[1], prereq = x3[2], antireq = x3[3])
  }))
res
# title prereq antireq
#1 AXX 101 BXX102, BXX101, not open to CXX program BXX103
#2 AXX 102 <NA> BXX101
也许您可以尝试以下操作:
library(splitstackshape)
library(dplyr)
library(zoo)
library(tidyr)
# Split V1 at ":"; the later steps use the resulting columns V1_1 (the part
# before the colon) and V1_2 (the part after it).
cSplit(df, "V1", ":") %>%
# Rows whose first part ends in a digit are treated as course-code lines:
# copy that text into a new column V2 and leave NA everywhere else.
.[, V2 := ifelse(grepl("[0-9]$", V1_1), as.character(V1_1), NA)] %>%
# Carry the last non-NA course code forward so every row is tagged with the
# course it belongs to.
.[, V2 := na.locf(V2)] %>%
# Keep only the Prereq / Antireq rows.
.[V1_1 %in% c("Prereq", "Antireq")] %>%
# Go wide: one column per label (V1_1), filled with its value (V1_2).
# NOTE(review): spread() is superseded by pivot_wider() in current tidyr.
spread(V1_1, V1_2)
# V2 Antireq Prereq
# 1: AXX 101 BXX103 BXX102, BXX101, not open to CXX program
# 2: AXX 102 BXX101 NA
在每个 AXX <number> 行前加上 \nTitle: ,只保留含有冒号的行,然后用 read.dcf 读取结果。如果列名首字母大写也可以接受,则可以省略标记为 ## 的行。不需要任何额外的软件包:
# Base-R approach: rewrite the lines as "Key: value" records and let
# read.dcf() do the parsing (one matrix row per record, one column per key).
s <- as.character(df[[1]])
# Backslashes must be doubled inside R string literals: a single backslash
# before "d" is an unrecognized escape and the script would not parse.
# Anchored so that only lines starting with a course code open a record.
ix <- grep("^AXX \\d", s)
# The embedded newline puts a blank line before each "Title:" field; blank
# lines are what separate records for read.dcf().
s[ix] <- paste("\nTitle:", s[ix])
# Drop everything that is not a "Key: value" line.
s <- grep(":", s, value = TRUE)
# Close the text connection explicitly instead of leaking it.
con <- textConnection(s)
out <- read.dcf(con)
close(con)
colnames(out) <- tolower(colnames(out)) ##
结果如下:
> out
title prereq antireq
[1,] "AXX 101" "BXX102, BXX101, not open to CXX program" "BXX103"
[2,] "AXX 102" NA "BXX101"
更新: 一些简化。