如何把一个data.frame解析成一棵树?
How to parse a data.frame into a tree?
这是一个简单的分类法(标签和 ID):
test_data <- data.frame(
cat_id = c(661, 197, 228, 650, 126, 912, 949, 428),
cat_h1 = c(rep("Animals", 5), rep("Plants", 3)),
cat_h2 = c(rep("Mammals", 3), rep("Birds", 2), c("Wheat", "Grass", "Other")),
cat_h3 = c("Dogs", "Dogs", "Other", "Hawks", "Other", rep(NA, 3)),
cat_h4 = c("Big", "Little", rep(NA, 6)))
解析后的结构应匹配以下内容:
list(
Animals = list(Mammals = list(Dogs = list(Big = 661, Little = 197), Other = 228),
Birds = list(Hawks = 650, Other = 126)),
Plants = list(Wheat = 912, Grass = 949, Other = 428))
也许不是最有效的,但也不算太难:
创建数据:
test_data <- data.frame(
cat_id = c(661, 197, 228, 650, 126, 912, 949, 428),
cat_h1 = c(rep("Animals", 5), rep("Plants", 3)),
cat_h2 = c(rep("Mammals", 3), rep("Birds", 2), c("Wheat", "Grass", "Other")),
cat_h3 = c("Dogs", "Dogs", "Other", "Hawks", "Other", rep(NA, 3)),
cat_h4 = c("Big", "Little", rep(NA, 6)))
遍历数据框并构建 list/tree:
tax <- list() ## initialize
for (i in 1:nrow(test_data)) {
## convert data.frame row to character vector
taxdat <- sapply(test_data[i,-1],as.character)
taxstr <- character(0) ## initialize taxon string
ntax <- length(na.omit(taxdat))
for (j in 1:ntax) {
taxstr <- c(taxstr,taxdat[j]) ## build string
if (is.null(tax[[taxstr]])) {
tax[[taxstr]] <- list() ## initialize if necessary
}
}
tax[[taxstr]] <- test_data$cat_id[i] ## assign value to tip
}
将结果与期望值进行比较:
res <- list(
Animals = list(Mammals = list(Dogs = list(Big = 661, Little = 197),
Other = 228),
Birds = list(Hawks = 650, Other = 126)),
Plants = list(Wheat = 912, Grass = 949, Other = 428))
all.equal(res,tax) ## TRUE
我会避免使用列表结构而不是整理数据。这是一种减少数据冗余的方法。
library(dplyr)
h1_h2 =
test_data %>%
select(cat_h1, cat_h2) %>%
distinct %>%
filter(cat_h2 %>% is.na %>% `!`)
h2_h3 =
test_data %>%
select(cat_h2, cat_h3) %>%
distinct %>%
filter(cat_h3 %>% is.na %>% `!`)
h3_h4 =
test_data %>%
select(cat_h3, cat_h4) %>%
distinct %>%
filter(cat_h4 %>% is.na %>% `!`)
原来的很容易重构:
h1_h2 %>%
left_join(h2_h3) %>%
left_join(h3_h4)
编辑:这是一种自动化整个过程的方法。
library(dplyr)
library(lazyeval)
adjacency = function(data) {
adjacency_table = function(data, larger_name, smaller_name)
lazy(data %>%
select(larger_name, smaller_name) %>%
distinct %>%
filter(smaller_name %>% is.na %>% `!`) ) %>%
interp(larger_name = larger_name %>% as.name,
smaller_name = smaller_name %>% as.name) %>%
lazy_eval %>%
setNames(c("larger", "smaller"))
data_frame(smaller_name = data %>% names) %>%
mutate(larger_name = smaller_name %>% lag) %>%
slice(-1) %>%
group_by(larger_name, smaller_name) %>%
do(adjacency_table(data, .$larger_name, .$smaller_name) )
}
result =
test_data %>%
select(-cat_id) %>%
adjacency
如果您可以接受顺序的轻微变化,这是一个按列处理的递归解决方案:
f <- function(x, d=cbind(x,NA)) {
c(
# call f by branch
if(ncol(d) > 3) local({
x <- d[!is.na(d[[3]]),]
by( x[-2], droplevels(x[2]), f, x=NA, simplify=FALSE)
}),
# leaf nodes
setNames(as.list(d[[1]]), d[[2]])[is.na(d[[3]])]
)
}
这将给出:
> str(f(test_data))
List of 2
$ Animals:List of 2
..$ Birds :List of 2
.. ..$ Hawks: num 650
.. ..$ Other: num 126
..$ Mammals:List of 2
.. ..$ Dogs :List of 2
.. .. ..$ Big : num 661
.. .. ..$ Little: num 197
.. ..$ Other: num 228
$ Plants :List of 3
..$ Wheat: num 912
..$ Grass: num 949
..$ Other: num 428
这是一个简单的分类法(标签和 ID):
test_data <- data.frame(
cat_id = c(661, 197, 228, 650, 126, 912, 949, 428),
cat_h1 = c(rep("Animals", 5), rep("Plants", 3)),
cat_h2 = c(rep("Mammals", 3), rep("Birds", 2), c("Wheat", "Grass", "Other")),
cat_h3 = c("Dogs", "Dogs", "Other", "Hawks", "Other", rep(NA, 3)),
cat_h4 = c("Big", "Little", rep(NA, 6)))
解析后的结构应匹配以下内容:
list(
Animals = list(Mammals = list(Dogs = list(Big = 661, Little = 197), Other = 228),
Birds = list(Hawks = 650, Other = 126)),
Plants = list(Wheat = 912, Grass = 949, Other = 428))
也许不是最有效的,但也不算太难:
创建数据:
test_data <- data.frame(
cat_id = c(661, 197, 228, 650, 126, 912, 949, 428),
cat_h1 = c(rep("Animals", 5), rep("Plants", 3)),
cat_h2 = c(rep("Mammals", 3), rep("Birds", 2), c("Wheat", "Grass", "Other")),
cat_h3 = c("Dogs", "Dogs", "Other", "Hawks", "Other", rep(NA, 3)),
cat_h4 = c("Big", "Little", rep(NA, 6)))
遍历数据框并构建 list/tree:
tax <- list() ## initialize
for (i in 1:nrow(test_data)) {
## convert data.frame row to character vector
taxdat <- sapply(test_data[i,-1],as.character)
taxstr <- character(0) ## initialize taxon string
ntax <- length(na.omit(taxdat))
for (j in 1:ntax) {
taxstr <- c(taxstr,taxdat[j]) ## build string
if (is.null(tax[[taxstr]])) {
tax[[taxstr]] <- list() ## initialize if necessary
}
}
tax[[taxstr]] <- test_data$cat_id[i] ## assign value to tip
}
将结果与期望值进行比较:
res <- list(
Animals = list(Mammals = list(Dogs = list(Big = 661, Little = 197),
Other = 228),
Birds = list(Hawks = 650, Other = 126)),
Plants = list(Wheat = 912, Grass = 949, Other = 428))
all.equal(res,tax) ## TRUE
我会避免使用列表结构而不是整理数据。这是一种减少数据冗余的方法。
library(dplyr)
h1_h2 =
test_data %>%
select(cat_h1, cat_h2) %>%
distinct %>%
filter(cat_h2 %>% is.na %>% `!`)
h2_h3 =
test_data %>%
select(cat_h2, cat_h3) %>%
distinct %>%
filter(cat_h3 %>% is.na %>% `!`)
h3_h4 =
test_data %>%
select(cat_h3, cat_h4) %>%
distinct %>%
filter(cat_h4 %>% is.na %>% `!`)
原来的很容易重构:
h1_h2 %>%
left_join(h2_h3) %>%
left_join(h3_h4)
编辑:这是一种自动化整个过程的方法。
library(dplyr)
library(lazyeval)
adjacency = function(data) {
adjacency_table = function(data, larger_name, smaller_name)
lazy(data %>%
select(larger_name, smaller_name) %>%
distinct %>%
filter(smaller_name %>% is.na %>% `!`) ) %>%
interp(larger_name = larger_name %>% as.name,
smaller_name = smaller_name %>% as.name) %>%
lazy_eval %>%
setNames(c("larger", "smaller"))
data_frame(smaller_name = data %>% names) %>%
mutate(larger_name = smaller_name %>% lag) %>%
slice(-1) %>%
group_by(larger_name, smaller_name) %>%
do(adjacency_table(data, .$larger_name, .$smaller_name) )
}
result =
test_data %>%
select(-cat_id) %>%
adjacency
如果您可以接受顺序的轻微变化,这是一个按列处理的递归解决方案:
f <- function(x, d=cbind(x,NA)) {
c(
# call f by branch
if(ncol(d) > 3) local({
x <- d[!is.na(d[[3]]),]
by( x[-2], droplevels(x[2]), f, x=NA, simplify=FALSE)
}),
# leaf nodes
setNames(as.list(d[[1]]), d[[2]])[is.na(d[[3]])]
)
}
这将给出:
> str(f(test_data))
List of 2
$ Animals:List of 2
..$ Birds :List of 2
.. ..$ Hawks: num 650
.. ..$ Other: num 126
..$ Mammals:List of 2
.. ..$ Dogs :List of 2
.. .. ..$ Big : num 661
.. .. ..$ Little: num 197
.. ..$ Other: num 228
$ Plants :List of 3
..$ Wheat: num 912
..$ Grass: num 949
..$ Other: num 428