R 中的交叉表与 data.tables
Cross-tab in R with data.tables
抱歉,如果有人问过这个问题,我玩玩具数据来学习操作data.tables。我的目标来自这些数据:
toy_data = data.table(from=c("A","A","A","C","E","E","A","A","A","C","E","E"),
to=c("B","C","A","D","F","E","E","A","A","A","C",NA))
得出这个结果:
final_matrix
L A B C D E F
1: A 3 1 2 <NA> 1 <NA>
2: B 1 0 <NA> <NA> <NA> <NA>
3: C 2 <NA> 0 1 1 <NA>
4: D <NA> <NA> 1 0 <NA> <NA>
5: E 1 <NA> 1 <NA> 1 1
6: F <NA> <NA> <NA> <NA> 1 0
7: tot 7 1 4 1 4 1
(最终也用零代替 NA,但感到无聊)。我想在 STATA 中这将是一个简单的交叉表,我已经构建了一个函数,然后遍历 cols 中的唯一值(叹气:/)合并表格,然后添加带有总计的最后一行。现在虽然我学到了很多东西,但我想知道获得此类交叉表的干净 R 方法是什么?因为以下不起作用:
table(toy_data$from,toy_data$to)
A B C D E F
A 3 1 1 0 1 0
C 1 0 0 1 0 0
E 0 0 1 0 1 1
谢谢。我的功能如果你有一般的改进或最佳实践我非常高兴:
create_edge_cols<- function(dt,column){
#this function takes a df and a column,
#computes the number of edges among this column and all the other in dt
#returns a column (list) with the cross-tabulation of columns
tot_edges_i = dim(dt[from==column|to==column][,.(to=na.omit(to))])[1] # E better! without NAs
print(tot_edges_i)
# now tabulate links of column
tab = data.table(table(unlist(dt[(from==column&to!=column)|
(from!=column&to==column)])))
setnames(tab, "V1", "L")
setnames(tab, "N", column)
setorder(tab,"L")
tab[L==column,column] = length(dt[to==column & to == from,from])
#tab[,`:=`(L=L,column=column/as.numeric(tot_edges_i))]
return(tab)
}
#this should be the first column of our table
first_column = data.table("L"=unique(toy_data[,c(to[!is.na(to)],from)]))
#loop through the values of the columns and merge to a unique df
for (col in sort(unique(toy_data[!is.na(to),c(to,from)]))){
info_column = copy(create_edge_cols(toy_data,col))
first_column = merge.data.table(first_column,info_column,all.x = TRUE,all.y = TRUE)
}
## function to set first row as name
header.true <- function(df) {
names(df) <- as.character(unlist(df[1,]))
df[-1,]
}
# this should be the last row of our matrix:
last_row = transpose(data.table(table(unlist(toy_data[!is.na(toy_data$to),c(from,to[to!=from])]))))
last_row = cbind(data.table(matrix(c("L","tot"), ncol=1)),last_row)
last_row = header.true(last_row)
last_row
# let's concatenate
final_matrix = rbind(first_column,last_row)
final_matrix
编辑:先前答案建议的解决方案现已删除:
library(igraph)
g <- graph_from_data_frame(na.omit(toy_data), directed = F)
am <- as_adjacency_matrix(g, type = "both")
addmargins(as.matrix(am[order(rownames(am)), order(colnames(am))]), 1)
这是一个方法。问题的 table
语句中缺少的是因子水平,table
仅处理数据中的内容。将列强制转换为具有相同水平的因子,并将 NA
分配给等于零的计数。
还有一个print
问题,看最后两条说明。 S# class "table"
方法打印的默认设置是不打印 NA
的。这可以手动更改。
library(data.table)
toy_data = data.table(from=c("A","A","A","C","E","E","A","A","A","C","E","E"),
to=c("B","C","A","D","F","E","E","A","A","A","C",NA))
levels <- sort(unique(unlist(toy_data)))
levels <- levels[!is.na(levels)]
toy_data[, c("from", "to") := lapply(.SD, factor, levels = levels)]
tbl <- table(toy_data)
is.na(tbl) <- tbl == 0
tbl
#> to
#> from A B C D E F
#> A 3 1 1 1
#> B
#> C 1 1
#> D
#> E 1 1 1
#> F
print(tbl, na.print = NA)
#> to
#> from A B C D E F
#> A 3 1 1 <NA> 1 <NA>
#> B <NA> <NA> <NA> <NA> <NA> <NA>
#> C 1 <NA> <NA> 1 <NA> <NA>
#> D <NA> <NA> <NA> <NA> <NA> <NA>
#> E <NA> <NA> 1 <NA> 1 1
#> F <NA> <NA> <NA> <NA> <NA> <NA>
由 reprex package (v2.0.1)
于 2022-03-28 创建
编辑
要在交叉 table 的底部添加列总和行,rbind
上面的结果用 colSums
。请注意,不再需要 print(tbl, na.print = NA)
,调用的方法 print
(自动打印)现在是矩阵方法。
library(data.table)
toy_data = data.table(from=c("A","A","A","C","E","E","A","A","A","C","E","E"),
to=c("B","C","A","D","F","E","E","A","A","A","C",NA))
levels <- sort(unique(unlist(toy_data)))
levels <- levels[!is.na(levels)]
toy_data[, c("from", "to") := lapply(.SD, factor, levels = levels)]
tbl <- table(toy_data)
class(tbl) # check the output object class
#> [1] "table"
tbl <- rbind(tbl, tot = colSums(tbl, na.rm = TRUE))
is.na(tbl) <- tbl == 0
class(tbl) # check the output object class, it's no longer "table"
#> [1] "matrix" "array"
tbl
#> A B C D E F
#> A 3 1 1 NA 1 NA
#> B NA NA NA NA NA NA
#> C 1 NA NA 1 NA NA
#> D NA NA NA NA NA NA
#> E NA NA 1 NA 1 1
#> F NA NA NA NA NA NA
#> tot 4 1 2 1 2 1
由 reprex package (v2.0.1)
创建于 2022-03-29
抱歉,如果有人问过这个问题,我玩玩具数据来学习操作data.tables。我的目标来自这些数据:
toy_data = data.table(from=c("A","A","A","C","E","E","A","A","A","C","E","E"),
to=c("B","C","A","D","F","E","E","A","A","A","C",NA))
得出这个结果:
final_matrix
L A B C D E F
1: A 3 1 2 <NA> 1 <NA>
2: B 1 0 <NA> <NA> <NA> <NA>
3: C 2 <NA> 0 1 1 <NA>
4: D <NA> <NA> 1 0 <NA> <NA>
5: E 1 <NA> 1 <NA> 1 1
6: F <NA> <NA> <NA> <NA> 1 0
7: tot 7 1 4 1 4 1
(最终也用零代替 NA,但感到无聊)。我想在 STATA 中这将是一个简单的交叉表,我已经构建了一个函数,然后遍历 cols 中的唯一值(叹气:/)合并表格,然后添加带有总计的最后一行。现在虽然我学到了很多东西,但我想知道获得此类交叉表的干净 R 方法是什么?因为以下不起作用:
table(toy_data$from,toy_data$to)
A B C D E F
A 3 1 1 0 1 0
C 1 0 0 1 0 0
E 0 0 1 0 1 1
谢谢。我的功能如果你有一般的改进或最佳实践我非常高兴:
create_edge_cols<- function(dt,column){
#this function takes a df and a column,
#computes the number of edges among this column and all the other in dt
#returns a column (list) with the cross-tabulation of columns
tot_edges_i = dim(dt[from==column|to==column][,.(to=na.omit(to))])[1] # E better! without NAs
print(tot_edges_i)
# now tabulate links of column
tab = data.table(table(unlist(dt[(from==column&to!=column)|
(from!=column&to==column)])))
setnames(tab, "V1", "L")
setnames(tab, "N", column)
setorder(tab,"L")
tab[L==column,column] = length(dt[to==column & to == from,from])
#tab[,`:=`(L=L,column=column/as.numeric(tot_edges_i))]
return(tab)
}
#this should be the first column of our table
first_column = data.table("L"=unique(toy_data[,c(to[!is.na(to)],from)]))
#loop through the values of the columns and merge to a unique df
for (col in sort(unique(toy_data[!is.na(to),c(to,from)]))){
info_column = copy(create_edge_cols(toy_data,col))
first_column = merge.data.table(first_column,info_column,all.x = TRUE,all.y = TRUE)
}
## function to set first row as name
header.true <- function(df) {
names(df) <- as.character(unlist(df[1,]))
df[-1,]
}
# this should be the last row of our matrix:
last_row = transpose(data.table(table(unlist(toy_data[!is.na(toy_data$to),c(from,to[to!=from])]))))
last_row = cbind(data.table(matrix(c("L","tot"), ncol=1)),last_row)
last_row = header.true(last_row)
last_row
# let's concatenate
final_matrix = rbind(first_column,last_row)
final_matrix
编辑:先前答案建议的解决方案现已删除:
library(igraph)
g <- graph_from_data_frame(na.omit(toy_data), directed = F)
am <- as_adjacency_matrix(g, type = "both")
addmargins(as.matrix(am[order(rownames(am)), order(colnames(am))]), 1)
这是一个方法。问题的 table
语句中缺少的是因子水平,table
仅处理数据中的内容。将列强制转换为具有相同水平的因子,并将 NA
分配给等于零的计数。
还有一个print
问题,看最后两条说明。 S# class "table"
方法打印的默认设置是不打印 NA
的。这可以手动更改。
library(data.table)
toy_data = data.table(from=c("A","A","A","C","E","E","A","A","A","C","E","E"),
to=c("B","C","A","D","F","E","E","A","A","A","C",NA))
levels <- sort(unique(unlist(toy_data)))
levels <- levels[!is.na(levels)]
toy_data[, c("from", "to") := lapply(.SD, factor, levels = levels)]
tbl <- table(toy_data)
is.na(tbl) <- tbl == 0
tbl
#> to
#> from A B C D E F
#> A 3 1 1 1
#> B
#> C 1 1
#> D
#> E 1 1 1
#> F
print(tbl, na.print = NA)
#> to
#> from A B C D E F
#> A 3 1 1 <NA> 1 <NA>
#> B <NA> <NA> <NA> <NA> <NA> <NA>
#> C 1 <NA> <NA> 1 <NA> <NA>
#> D <NA> <NA> <NA> <NA> <NA> <NA>
#> E <NA> <NA> 1 <NA> 1 1
#> F <NA> <NA> <NA> <NA> <NA> <NA>
由 reprex package (v2.0.1)
于 2022-03-28 创建编辑
要在交叉 table 的底部添加列总和行,rbind
上面的结果用 colSums
。请注意,不再需要 print(tbl, na.print = NA)
,调用的方法 print
(自动打印)现在是矩阵方法。
library(data.table)
toy_data = data.table(from=c("A","A","A","C","E","E","A","A","A","C","E","E"),
to=c("B","C","A","D","F","E","E","A","A","A","C",NA))
levels <- sort(unique(unlist(toy_data)))
levels <- levels[!is.na(levels)]
toy_data[, c("from", "to") := lapply(.SD, factor, levels = levels)]
tbl <- table(toy_data)
class(tbl) # check the output object class
#> [1] "table"
tbl <- rbind(tbl, tot = colSums(tbl, na.rm = TRUE))
is.na(tbl) <- tbl == 0
class(tbl) # check the output object class, it's no longer "table"
#> [1] "matrix" "array"
tbl
#> A B C D E F
#> A 3 1 1 NA 1 NA
#> B NA NA NA NA NA NA
#> C 1 NA NA 1 NA NA
#> D NA NA NA NA NA NA
#> E NA NA 1 NA 1 1
#> F NA NA NA NA NA NA
#> tot 4 1 2 1 2 1
由 reprex package (v2.0.1)
创建于 2022-03-29