R 中的交叉表与 data.tables

Cross-tab in R with data.tables

抱歉,如果有人问过这个问题,我玩玩具数据来学习操作data.tables。我的目标来自这些数据:

toy_data = data.table(from=c("A","A","A","C","E","E","A","A","A","C","E","E"),
                        to=c("B","C","A","D","F","E","E","A","A","A","C",NA))

得出这个结果:

final_matrix
     L    A    B    C    D    E    F
1:   A    3    1    2 <NA>    1 <NA>
2:   B    1    0 <NA> <NA> <NA> <NA>
3:   C    2 <NA>    0    1    1 <NA>
4:   D <NA> <NA>    1    0 <NA> <NA>
5:   E    1 <NA>    1 <NA>    1    1
6:   F <NA> <NA> <NA> <NA>    1    0
7: tot    7    1    4    1    4    1

(最终也用零代替 NA,但感到无聊)。我想在 STATA 中这将是一个简单的交叉表,我已经构建了一个函数,然后遍历 cols 中的唯一值(叹气:/)合并表格,然后添加带有总计的最后一行。现在虽然我学到了很多东西,但我想知道获得此类交叉表的干净 R 方法是什么?因为以下不起作用:

table(toy_data$from,toy_data$to)
   
    A B C D E F
  A 3 1 1 0 1 0
  C 1 0 0 1 0 0
  E 0 0 1 0 1 1

谢谢。我的功能如果你有一般的改进或最佳实践我非常高兴:

create_edge_cols<- function(dt,column){
  #this function takes a df and a column, 
  #computes the number of edges among this column and all the other in dt
  #returns a column (list) with the cross-tabulation of columns
  tot_edges_i = dim(dt[from==column|to==column][,.(to=na.omit(to))])[1] # E better! without NAs
  print(tot_edges_i)
  # now tabulate links of column
  tab = data.table(table(unlist(dt[(from==column&to!=column)|
                                           (from!=column&to==column)])))
  setnames(tab, "V1", "L")
  setnames(tab, "N", column)
  setorder(tab,"L")
  tab[L==column,column] = length(dt[to==column & to == from,from])
  #tab[,`:=`(L=L,column=column/as.numeric(tot_edges_i))]
  return(tab)
}

#this should be the first column of our table
first_column = data.table("L"=unique(toy_data[,c(to[!is.na(to)],from)]))

#loop through the values of the columns and merge to a unique df
for (col in sort(unique(toy_data[!is.na(to),c(to,from)]))){
  info_column = copy(create_edge_cols(toy_data,col))
  first_column = merge.data.table(first_column,info_column,all.x = TRUE,all.y = TRUE)
}

## function to set first row as name
header.true <- function(df) {
  names(df) <- as.character(unlist(df[1,]))
  df[-1,]
}
# this should be the last row of our matrix:
last_row = transpose(data.table(table(unlist(toy_data[!is.na(toy_data$to),c(from,to[to!=from])]))))
last_row = cbind(data.table(matrix(c("L","tot"), ncol=1)),last_row)
last_row = header.true(last_row)
last_row

# let's concatenate
final_matrix = rbind(first_column,last_row)
final_matrix

编辑:先前答案建议的解决方案现已删除:

library(igraph)
g <- graph_from_data_frame(na.omit(toy_data), directed = F)
am <- as_adjacency_matrix(g, type = "both")
addmargins(as.matrix(am[order(rownames(am)), order(colnames(am))]), 1)

这是一个方法。问题的 table 语句中缺少的是因子水平,table 仅处理数据中的内容。将列强制转换为具有相同水平的因子,并将 NA 分配给等于零的计数。

还有一个print问题,看最后两条说明。 S# class "table" 方法打印的默认设置是不打印 NA 的。这可以手动更改。

library(data.table)

toy_data = data.table(from=c("A","A","A","C","E","E","A","A","A","C","E","E"),
                      to=c("B","C","A","D","F","E","E","A","A","A","C",NA))

levels <- sort(unique(unlist(toy_data)))
levels <- levels[!is.na(levels)]
toy_data[, c("from", "to") := lapply(.SD, factor, levels = levels)]
tbl <- table(toy_data)
is.na(tbl) <- tbl == 0
tbl
#>     to
#> from  A  B  C  D  E  F
#>    A  3  1  1     1   
#>    B                  
#>    C  1        1      
#>    D                  
#>    E        1     1  1
#>    F

print(tbl, na.print = NA)
#>     to
#> from    A    B    C    D    E    F
#>    A    3    1    1 <NA>    1 <NA>
#>    B <NA> <NA> <NA> <NA> <NA> <NA>
#>    C    1 <NA> <NA>    1 <NA> <NA>
#>    D <NA> <NA> <NA> <NA> <NA> <NA>
#>    E <NA> <NA>    1 <NA>    1    1
#>    F <NA> <NA> <NA> <NA> <NA> <NA>

reprex package (v2.0.1)

于 2022-03-28 创建

编辑

要在交叉 table 的底部添加列总和行,rbind 上面的结果用 colSums。请注意,不再需要 print(tbl, na.print = NA),调用的方法 print(自动打印)现在是矩阵方法。

library(data.table)

toy_data = data.table(from=c("A","A","A","C","E","E","A","A","A","C","E","E"),
                      to=c("B","C","A","D","F","E","E","A","A","A","C",NA))

levels <- sort(unique(unlist(toy_data)))
levels <- levels[!is.na(levels)]
toy_data[, c("from", "to") := lapply(.SD, factor, levels = levels)]
tbl <- table(toy_data)

class(tbl)  # check the output object class
#> [1] "table"

tbl <- rbind(tbl, tot = colSums(tbl, na.rm = TRUE))
is.na(tbl) <- tbl == 0

class(tbl)  # check the output object class, it's no longer "table"
#> [1] "matrix" "array"

tbl
#>      A  B  C  D  E  F
#> A    3  1  1 NA  1 NA
#> B   NA NA NA NA NA NA
#> C    1 NA NA  1 NA NA
#> D   NA NA NA NA NA NA
#> E   NA NA  1 NA  1  1
#> F   NA NA NA NA NA NA
#> tot  4  1  2  1  2  1

reprex package (v2.0.1)

创建于 2022-03-29