如何把一个data.frame解析成一棵树？

Question

这是一个简单的分类法（标签和 ID）：

# Example taxonomy: one row per category id, with up to four hierarchy
# levels (cat_h1 .. cat_h4); NA marks levels that a category does not use.
test_data <- data.frame(
  cat_id = c(661, 197, 228, 650, 126, 912, 949, 428),
  cat_h1 = rep(c("Animals", "Plants"), times = c(5, 3)),
  cat_h2 = c(rep(c("Mammals", "Birds"), times = c(3, 2)),
             "Wheat", "Grass", "Other"),
  cat_h3 = c("Dogs", "Dogs", "Other", "Hawks", "Other", NA, NA, NA),
  cat_h4 = c("Big", "Little", rep(NA_character_, 6))
)

解析后的结构应匹配以下内容：

list(
  Animals = list(Mammals = list(Dogs  = list(Big = 661, Little = 197), Other = 228),
                 Birds   = list(Hawks = 650, Other = 126)),
  Plants  = list(Wheat = 912, Grass = 949, Other = 428))

Answer 1

这也许不是最高效的做法，但实现起来并不难：

创建数据：

# Sample taxonomy used by the tree-building loop: an id column followed by
# four label columns, where NA means "no label at this level".
test_data <- data.frame(
  cat_id = c(661, 197, 228, 650, 126, 912, 949, 428),
  cat_h1 = c("Animals", "Animals", "Animals", "Animals", "Animals",
             "Plants", "Plants", "Plants"),
  cat_h2 = c("Mammals", "Mammals", "Mammals", "Birds", "Birds",
             "Wheat", "Grass", "Other"),
  cat_h3 = c("Dogs", "Dogs", "Other", "Hawks", "Other",
             NA, NA, NA),
  cat_h4 = c("Big", "Little",
             NA, NA, NA, NA, NA, NA)
)

遍历数据框并构建 list/tree:

## Build the tree as a nested list: the non-NA labels of each row form the
## path from the root, and the row's cat_id is stored at the end of that path.
tax <- list()  ## initialize
## every column except the id holds one hierarchy level (selected by name so
## the position of cat_id does not matter)
level_cols <- setdiff(names(test_data), "cat_id")
for (i in seq_len(nrow(test_data))) {  ## seq_len(): no iterations for 0 rows
    ## convert data.frame row to character vector (one label per level);
    ## vapply guarantees a character result
    taxdat <- vapply(test_data[i, level_cols, drop = FALSE],
                     as.character, character(1))
    ## keep only the levels that are present; unname() leaves a plain
    ## character path
    taxstr <- unname(taxdat[!is.na(taxdat)])
    if (length(taxstr) == 0L) next  ## row without any label: nothing to place
    for (j in seq_along(taxstr)) {
        node <- taxstr[seq_len(j)]  ## path from the root down to level j
        ## a character vector inside [[ ]] indexes the nested list recursively
        if (is.null(tax[[node]])) {
            tax[[node]] <- list()  ## initialize if necessary
        }
    }
    tax[[taxstr]] <- test_data$cat_id[i]  ## assign value to tip
}

将结果与期望值进行比较：

## Expected tree, written out by hand: inner nodes are named lists, tips are
## the numeric category ids.
res <- list(
  Animals = list(
    Mammals = list(
      Dogs  = as.list(c(Big = 661, Little = 197)),
      Other = 228
    ),
    Birds = as.list(c(Hawks = 650, Other = 126))
  ),
  Plants = as.list(c(Wheat = 912, Grass = 949, Other = 428))
)

## compare the hand-written tree (target) with the one built by the loop
all.equal(target = res, current = tax)  ## TRUE

Answer 2

我会避免使用列表结构，而是把数据整理成规整（tidy）的形式。下面是一种减少数据冗余的方法。

library(dplyr)

# Unique (level 1, level 2) label pairs; rows without a level-2 label are
# dropped.
h1_h2 <- test_data %>%
  select(cat_h1, cat_h2) %>%
  distinct() %>%
  filter(!is.na(cat_h2))

# Unique (level 2, level 3) label pairs; rows without a level-3 label are
# dropped.
h2_h3 <- test_data %>%
  select(cat_h2, cat_h3) %>%
  distinct() %>%
  filter(!is.na(cat_h3))

# Unique (level 3, level 4) label pairs; rows without a level-4 label are
# dropped.
h3_h4 <- test_data %>%
  select(cat_h3, cat_h4) %>%
  distinct() %>%
  filter(!is.na(cat_h4))

原始的数据表很容易由这些表重新构造出来：

# Rebuild the wide table by chaining the pair tables level by level.
# The join keys are spelled out so each join uses exactly the shared level
# column and does not depend on whichever column names happen to coincide.
h1_h2 %>%
  left_join(h2_h3, by = "cat_h2") %>%
  left_join(h3_h4, by = "cat_h3")

编辑：这是一种自动化整个过程的方法。

library(dplyr)
library(lazyeval)

# Build the parent/child pair table for every pair of neighbouring columns of
# `data` (column k is the "larger" level, column k + 1 the "smaller" one).
# The result is grouped by the two column names and holds the pairs in the
# columns `larger` and `smaller`.
adjacency <- function(data) {
  # Pair table for a single column pair, given the two column names as
  # strings. The pipeline is captured unevaluated, the placeholder symbols
  # are replaced by the real column names, and only then is it evaluated.
  adjacency_table <- function(data, larger_name, smaller_name) {
    template <- lazy(
      data %>%
        select(larger_name, smaller_name) %>%
        distinct() %>%
        filter(!is.na(smaller_name))
    )
    filled <- interp(
      template,
      larger_name = as.name(larger_name),
      smaller_name = as.name(smaller_name)
    )
    pairs <- lazy_eval(filled)
    setNames(pairs, c("larger", "smaller"))
  }

  # One row per neighbouring column pair: lag() shifts the names by one, and
  # the first row (which has no left neighbour) is removed.
  data_frame(smaller_name = names(data)) %>%
    mutate(larger_name = lag(smaller_name)) %>%
    slice(-1) %>%
    group_by(larger_name, smaller_name) %>%
    do(adjacency_table(data, .$larger_name, .$smaller_name))
}

# Drop the id column so only the hierarchy levels remain, then build the
# pair tables for all neighbouring levels.
result <- adjacency(select(test_data, -cat_id))

Answer 3

如果您可以接受结果中元素顺序的轻微变化（同一层的分支会按名称排序，例如 Birds 排在 Mammals 之前），这是一个按列处理的递归解决方案：

# Recursively turn a taxonomy table into a nested list.
#
# x: data.frame whose first column holds the ids and whose remaining columns
#    hold the labels, one column per level (NA = level not used).
# d: working table; by default `x` with an all-NA column appended, so that
#    every row ends in an NA. Recursive calls pass the sub-table here and
#    set x = NA.
#
# Returns a named list: sub-trees (one per label in column 2 that has
# further levels) come first, followed by the leaves (rows whose column 3
# is NA), named by column 2 and holding the id from column 1.
f <- function(x, d = cbind(x, NA)) {
  ids <- d[[1]]
  labels <- d[[2]]
  goes_deeper <- !is.na(d[[3]])

  # sub-trees: split the rows that continue below this level by their label
  # and recurse on each part with the label column removed
  branches <- NULL
  if (ncol(d) > 3) {
    inner <- d[goes_deeper, ]
    branches <- by(inner[-2], droplevels(inner[2]), f, x = NA,
                   simplify = FALSE)
  }

  # leaf nodes: rows that stop at this level
  leaves <- setNames(as.list(ids), labels)[!goes_deeper]

  c(branches, leaves)
}

这将给出：

> str(f(test_data))
List of 2
 $ Animals:List of 2
  ..$ Birds  :List of 2
  .. ..$ Hawks: num 650
  .. ..$ Other: num 126
  ..$ Mammals:List of 2
  .. ..$ Dogs :List of 2
  .. .. ..$ Big   : num 661
  .. .. ..$ Little: num 197
  .. ..$ Other: num 228
 $ Plants :List of 3
  ..$ Wheat: num 912
  ..$ Grass: num 949
  ..$ Other: num 428

如何把一个data.frame解析成一棵树？

How to parse a data.frame into a tree?

tree

parsing

r