在 R 中解析这个结构化文本文件
Parsing This Structured Text File in R
我想解析下面的 attributes.txt 文件(来自 Sawtooth 调查研究的输出),以便生成如下所示的输出。你可以在下面看到我的尝试。它能用,但非常丑陋。一定有更好的方法,对吧?(如果可以的话,我更喜欢 tidyverse 解决方案)
attributes.txt:
================================================================================
ATTRIBUTES AND LEVELS
================================================================================
========================================
Display Text
========================================
<Same structure as shown below. But I do not want to extract any of this text>
========================================
Internal Labels
========================================
[Attribute List]:
1 brand
2 rating
3 price
---------------------------
Attribute 1:
brand
Levels:
1 brand01
2 brand02
3 brand03
4 otherbrand
---------------------------
Attribute 2:
rating
Levels:
1 1
2 2
3 3
4 4
5 5
---------------------------
Attribute 3:
price
Levels:
1 99
2 199
3 299
所需的解析输出:
attribute,level,label
1,1,brand01
1,2,brand02
1,3,brand03
1,4,otherbrand
2,1,1
2,2,2
2,3,3
2,4,4
2,5,5
3,1,99
3,2,199
3,3,299
我的尝试:
library(stringr)
#' Parse the "Internal Labels" section of a Sawtooth attributes file.
#'
#' Everything up to and including the first line that contains
#' "Internal Labels" is ignored. After that marker, a line matching
#' "Attribute <digits>: " sets the current attribute number, and every
#' other line containing "<digits><TAB><text>" is recorded as one level of
#' the current attribute. Lines seen before the first attribute header are
#' skipped.
#'
#' @param ATTRIBUTES_FILE_PATH Path to the attributes text file.
#' @return A character matrix with columns "attribute", "level" and
#'   "label", one row per level line. A matrix with zero rows (and the same
#'   columns) is returned when no level line is found.
parse_attributes_file <- function(ATTRIBUTES_FILE_PATH) {
  column_names <- c("attribute", "level", "label")

  # readLines() on a path opens and closes its own connection, so nothing
  # is left open if reading fails.
  lines <- readLines(ATTRIBUTES_FILE_PATH)

  # Keep only the lines that follow the "Internal Labels" marker.
  marker <- grep("Internal Labels", lines, fixed = TRUE)
  if (length(marker) > 0) {
    body <- lines[-seq_len(marker[1])]
  } else {
    body <- character(0)
  }

  # Each list element is character(0) for "no match", otherwise the full
  # match followed by its capture groups.
  attr_match <- regmatches(body, regexec("Attribute ([[:digit:]]+): ", body))
  level_match <- regmatches(body, regexec("([[:digit:]]+)\t(.*)", body))

  # Carry the most recent attribute number forward onto every line; lines
  # before the first header get NA.
  is_attr <- lengths(attr_match) > 0
  attr_num <- vapply(attr_match[is_attr], function(m) m[2], character(1))
  current_attr <- c(NA_character_, attr_num)[cumsum(is_attr) + 1L]

  # An attribute header takes priority over a level match on the same line.
  is_level <- lengths(level_match) > 0 & !is_attr & !is.na(current_attr)
  hits <- level_match[is_level]

  matrix(
    c(
      current_attr[is_level],
      vapply(hits, function(m) m[2], character(1)),
      vapply(hits, function(m) m[3], character(1))
    ),
    ncol = 3,
    dimnames = list(NULL, column_names)
  )
}
这里有一些代码可以使用 tidyverse
函数来完成同样的事情。首先,加载一些样本数据
# For a real file you would read it into a character vector of lines:
# lines <- readLines("yourtextfile")
# For this sample the file content is embedded and split on newlines.
# The vector is named `lines` because every later step refers to `lines`.
lines <- strsplit("================================================================================\nATTRIBUTES AND LEVELS\n================================================================================\n\n========================================\nDisplay Text\n========================================\n\n<Same structure as shown below. But I do not want to extract any of this text>\n\n========================================\nInternal Labels\n========================================\n\n[Attribute List]:\n\n1 brand\n2 rating\n3 price\n\n---------------------------\nAttribute 1: \nbrand\n\nLevels: \n1 brand01\n2 brand02\n3 brand03\n4 otherbrand\n\n---------------------------\nAttribute 2: \nrating\n\nLevels: \n1 1\n2 2\n3 3\n4 4\n5 5\n\n---------------------------\nAttribute 3: \nprice\n\nLevels: \n1 99\n2 199\n3 299", "\n")[[1]]
现在我们解析文件。首先,为每一行找到正确的属性
library(tidyverse)
# Pull the attribute number out of every "Attribute <n>" header line (NA on
# all other lines), then carry the most recent number forward so each line
# is tagged with the attribute it belongs to. The backslash must be doubled
# inside an R string literal, and `\\d+` keeps multi-digit attribute numbers
# intact.
attributes <- str_match(lines, "Attribute (\\d+)")[, 2] %>%
  accumulate(function(a, b) coalesce(b, a))
然后通过查找带有 "Levels:" 的行并在空白行处停止来找到 "levels" 块
# Classify every line: 2 = "Levels:" header (opens a block),
# 1 = blank line (closes a block), 0 = any other line.
markers <- case_when(
  str_detect(lines, "^Levels:") ~ 2,
  str_detect(lines, "^$") ~ 1,
  TRUE ~ 0
)
# Track, after each line, whether we are inside a "Levels:" block. The first
# accumulated element is the `.init` seed, so it is dropped to realign the
# result with `lines`.
inside_block <- markers %>%
  accumulate(
    function(state, marker) {
      if (marker == 2) TRUE else if (marker == 1) FALSE else state
    },
    .init = FALSE
  ) %>%
  tail(-1) %>%
  unlist()
# Keep only ordinary lines inside a block: the "Levels:" header and the blank
# line that ends the block are markers, not data.
levels <- inside_block & markers == 0
现在我们只需将属性和水平(level)数据组合成一个表格,然后让 readr
将其解析成一个 tibble
# Prefix each selected level line with its attribute number and let readr
# parse the whitespace-separated text. I() marks the string as literal data,
# so a result with a single row (no newline) is not mistaken for a file path.
read_table(
  I(paste(attributes[levels], lines[levels], collapse = "\n")),
  col_names = c("attribute", "level", "label")
)
返回结果如下:
# A tibble: 12 x 3
attribute level label
<int> <int> <chr>
1 1 1 brand01
2 1 2 brand02
3 1 3 brand03
4 1 4 otherbrand
5 2 1 1
6 2 2 2
7 2 3 3
8 2 4 4
9 2 5 5
10 3 1 99
11 3 2 199
12 3 3 299
我想解析下面的 attributes.txt 文件(来自 Sawtooth 调查研究的输出),以便生成如下所示的输出。你可以在下面看到我的尝试。它能用,但非常丑陋。一定有更好的方法,对吧?(如果可以的话,我更喜欢 tidyverse 解决方案)
attributes.txt:
================================================================================
ATTRIBUTES AND LEVELS
================================================================================
========================================
Display Text
========================================
<Same structure as shown below. But I do not want to extract any of this text>
========================================
Internal Labels
========================================
[Attribute List]:
1 brand
2 rating
3 price
---------------------------
Attribute 1:
brand
Levels:
1 brand01
2 brand02
3 brand03
4 otherbrand
---------------------------
Attribute 2:
rating
Levels:
1 1
2 2
3 3
4 4
5 5
---------------------------
Attribute 3:
price
Levels:
1 99
2 199
3 299
所需的解析输出:
attribute,level,label
1,1,brand01
1,2,brand02
1,3,brand03
1,4,otherbrand
2,1,1
2,2,2
2,3,3
2,4,4
2,5,5
3,1,99
3,2,199
3,3,299
我的尝试:
library(stringr)
#' Parse the "Internal Labels" section of a Sawtooth attributes file.
#'
#' Everything up to and including the first line that contains
#' "Internal Labels" is ignored. After that marker, a line matching
#' "Attribute <digits>: " sets the current attribute number, and every
#' other line containing "<digits><TAB><text>" is recorded as one level of
#' the current attribute. Lines seen before the first attribute header are
#' skipped.
#'
#' @param ATTRIBUTES_FILE_PATH Path to the attributes text file.
#' @return A character matrix with columns "attribute", "level" and
#'   "label", one row per level line. A matrix with zero rows (and the same
#'   columns) is returned when no level line is found.
parse_attributes_file <- function(ATTRIBUTES_FILE_PATH) {
  column_names <- c("attribute", "level", "label")

  # readLines() on a path opens and closes its own connection, so nothing
  # is left open if reading fails.
  lines <- readLines(ATTRIBUTES_FILE_PATH)

  # Keep only the lines that follow the "Internal Labels" marker.
  marker <- grep("Internal Labels", lines, fixed = TRUE)
  if (length(marker) > 0) {
    body <- lines[-seq_len(marker[1])]
  } else {
    body <- character(0)
  }

  # Each list element is character(0) for "no match", otherwise the full
  # match followed by its capture groups.
  attr_match <- regmatches(body, regexec("Attribute ([[:digit:]]+): ", body))
  level_match <- regmatches(body, regexec("([[:digit:]]+)\t(.*)", body))

  # Carry the most recent attribute number forward onto every line; lines
  # before the first header get NA.
  is_attr <- lengths(attr_match) > 0
  attr_num <- vapply(attr_match[is_attr], function(m) m[2], character(1))
  current_attr <- c(NA_character_, attr_num)[cumsum(is_attr) + 1L]

  # An attribute header takes priority over a level match on the same line.
  is_level <- lengths(level_match) > 0 & !is_attr & !is.na(current_attr)
  hits <- level_match[is_level]

  matrix(
    c(
      current_attr[is_level],
      vapply(hits, function(m) m[2], character(1)),
      vapply(hits, function(m) m[3], character(1))
    ),
    ncol = 3,
    dimnames = list(NULL, column_names)
  )
}
这里有一些代码可以使用 tidyverse
函数来完成同样的事情。首先,加载一些样本数据
# For a real file you would read it into a character vector of lines:
# lines <- readLines("yourtextfile")
# For this sample the file content is embedded and split on newlines.
# The vector is named `lines` because every later step refers to `lines`.
lines <- strsplit("================================================================================\nATTRIBUTES AND LEVELS\n================================================================================\n\n========================================\nDisplay Text\n========================================\n\n<Same structure as shown below. But I do not want to extract any of this text>\n\n========================================\nInternal Labels\n========================================\n\n[Attribute List]:\n\n1 brand\n2 rating\n3 price\n\n---------------------------\nAttribute 1: \nbrand\n\nLevels: \n1 brand01\n2 brand02\n3 brand03\n4 otherbrand\n\n---------------------------\nAttribute 2: \nrating\n\nLevels: \n1 1\n2 2\n3 3\n4 4\n5 5\n\n---------------------------\nAttribute 3: \nprice\n\nLevels: \n1 99\n2 199\n3 299", "\n")[[1]]
现在我们解析文件。首先,为每一行找到正确的属性
library(tidyverse)
# Pull the attribute number out of every "Attribute <n>" header line (NA on
# all other lines), then carry the most recent number forward so each line
# is tagged with the attribute it belongs to. The backslash must be doubled
# inside an R string literal, and `\\d+` keeps multi-digit attribute numbers
# intact.
attributes <- str_match(lines, "Attribute (\\d+)")[, 2] %>%
  accumulate(function(a, b) coalesce(b, a))
然后通过查找带有 "Levels:" 的行并在空白行处停止来找到 "levels" 块
# Classify every line: 2 = "Levels:" header (opens a block),
# 1 = blank line (closes a block), 0 = any other line.
markers <- case_when(
  str_detect(lines, "^Levels:") ~ 2,
  str_detect(lines, "^$") ~ 1,
  TRUE ~ 0
)
# Track, after each line, whether we are inside a "Levels:" block. The first
# accumulated element is the `.init` seed, so it is dropped to realign the
# result with `lines`.
inside_block <- markers %>%
  accumulate(
    function(state, marker) {
      if (marker == 2) TRUE else if (marker == 1) FALSE else state
    },
    .init = FALSE
  ) %>%
  tail(-1) %>%
  unlist()
# Keep only ordinary lines inside a block: the "Levels:" header and the blank
# line that ends the block are markers, not data.
levels <- inside_block & markers == 0
现在我们只需将属性和水平(level)数据组合成一个表格,然后让 readr
将其解析成一个 tibble
# Prefix each selected level line with its attribute number and let readr
# parse the whitespace-separated text. I() marks the string as literal data,
# so a result with a single row (no newline) is not mistaken for a file path.
read_table(
  I(paste(attributes[levels], lines[levels], collapse = "\n")),
  col_names = c("attribute", "level", "label")
)
返回结果如下:
# A tibble: 12 x 3
attribute level label
<int> <int> <chr>
1 1 1 brand01
2 1 2 brand02
3 1 3 brand03
4 1 4 otherbrand
5 2 1 1
6 2 2 2
7 2 3 3
8 2 4 4
9 2 5 5
10 3 1 99
11 3 2 199
12 3 3 299