用于在一行中列出非零列的 dplyr 习语
dplyr idiom for listing nonzero columns in a row
假设我有一个稀疏数据集,其中大多数值为零,但有些更大。我如何使用 dplyr
创建一个列来告诉我哪些列名不为零?
举个例子,
# Build a 10 x 10 sparse example data frame: each column is 10 draws (with
# replacement) from a pool of 80 zeros and 20 uniform random numbers.
# as.data.frame() is called directly instead of through %>% because no
# package providing the pipe has been attached yet at this point.
d <- as.data.frame(
  replicate(
    n = 10,
    expr = sample(x = c(rep(0, 80), runif(20)), size = 10, replace = TRUE)
  )
)
# Label the columns A..J.
names(d) <- LETTERS[1:10]
如果某一行中只有 A、B 和 J 列具有非零值,则新列在该行的值应为 "A,B,J"。
你可以试试
library(dplyr)
# Row by row, append `newcol`: the names of that row's nonzero columns
# joined with ", ". The result is converted back to a plain data.frame.
d %>%
  rowwise() %>%
  do(
    data.frame(
      .,
      newcol = paste(names(d)[which(. != 0)], collapse = ", ")
    )
  ) %>%
  as.data.frame()
在 base R 中,可以使用 apply 来完成:
# For each row of the logical matrix `d != 0`, join the names of the TRUE
# columns with ", " and store the resulting strings in `newCol`.
d$newCol <- apply(
  d != 0,
  MARGIN = 1,
  FUN = function(is_nonzero) paste(names(d)[is_nonzero], collapse = ", ")
)
更新
如果您需要根据值对字符串中的名称重新排序
# For each row, list the names of the nonzero columns ordered by increasing
# cell value, joined with ", ", and keep the result in `res`.
res <- d %>%
  rowwise() %>%
  do(
    data.frame(
      .,
      newcol = paste(
        names(d)[which(. != 0)[order(unlist(.[. != 0]))]],
        collapse = ", "
      )
    )
  ) %>%
  as.data.frame()
# Show the generated strings.
res$newcol
#[1] "G, J" "J, E, H, C, A" "I" "G"
#[5] "D" "A, B, I" "C" "H, J, G"
#[9] "F, I" "H, A"
与以下 base R 写法的结果相同:
# Base R version: per row, take the nonzero entries, order them by
# increasing value, and join their column names with ", ".
apply(d, 1, function(row) {
  nonzero <- row != 0
  hits <- which(nonzero)
  paste(names(hits[order(row[nonzero])]), collapse = ", ")
})
#[1] "G, J" "J, E, H, C, A" "I" "G"
#[5] "D" "A, B, I" "C" "H, J, G"
#[9] "F, I" "H, A"
# First two rows of the data, for comparing the ordering against the values.
d[seq_len(2), ]
# A B C D E F G H I J
#1 0.0000000 0 0.0000000 0 0.0000000 0 0.03736996 0.0000000 0 0.6297454
#2 0.9833502 0 0.7856878 0 0.2405307 0 0.00000000 0.5693827 0 0.1462821
这应该比逐行操作快得多:
library(data.table)
dt <- as.data.table(d) # or setDT to convert in place
# Locate every nonzero cell as a (row, col) index pair, then collapse the
# matching column names into one comma-separated string per row.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
res <- as.data.table(which(dt != 0, arr.ind = TRUE))[
  , paste(names(dt)[col], collapse = ","),
  by = row
]
# Assign by reference; rows without any nonzero cell do not appear in `res`
# and therefore end up with NA in `newcol`.
dt[res$row, newcol := res$V1]
dt
# A B C D E F G H I J newcol
# 1: 0.0000000 0.20367933 0.0000000 0.2839109 0.0000000 0.0000000 0.0000000 0.00000000 0 0 B,D
# 2: 0.6784289 0.05622799 0.8293551 0.0000000 0.0000000 0.0000000 0.0000000 0.00000000 0 0 A,B,C
# 3: 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000 0.1702222 0.0000000 0.00000000 0 0 F
# 4: 0.9259111 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.9930633 0.00000000 0 0 A,G
# 5: 0.4656185 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.7986647 0.30571672 0 0 A,G,H
# 6: 0.0000000 0.00000000 0.2893704 0.0000000 0.1617378 0.0000000 0.9861549 0.06057971 0 0 C,E,G,H
# 7: 0.0000000 0.00000000 0.1914990 0.0000000 0.5911986 0.0000000 0.0000000 0.00000000 0 0 C,E
# 8: 0.6359290 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.42127388 0 0 A,H
# 9: 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.4898069 0.00000000 0 0 G
#10: 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.00000000 0 0 NA
如果您愿意,将这个思路转换为 dplyr 写法应该很简单,这里留给读者作为练习。
假设我有一个稀疏数据集,其中大多数值为零,但有些更大。我如何使用 dplyr
创建一个列来告诉我哪些列名不为零?
举个例子,
# Build a 10 x 10 sparse example data frame: each column is 10 draws (with
# replacement) from a pool of 80 zeros and 20 uniform random numbers.
# as.data.frame() is called directly instead of through %>% because no
# package providing the pipe has been attached yet at this point.
d <- as.data.frame(
  replicate(
    n = 10,
    expr = sample(x = c(rep(0, 80), runif(20)), size = 10, replace = TRUE)
  )
)
# Label the columns A..J.
names(d) <- LETTERS[1:10]
如果某一行中只有 A、B 和 J 列具有非零值,则新列在该行的值应为 "A,B,J"。
你可以试试
library(dplyr)
# Row by row, append `newcol`: the names of that row's nonzero columns
# joined with ", ". The result is converted back to a plain data.frame.
d %>%
  rowwise() %>%
  do(
    data.frame(
      .,
      newcol = paste(names(d)[which(. != 0)], collapse = ", ")
    )
  ) %>%
  as.data.frame()
在 base R 中,可以使用 apply 来完成:
# For each row of the logical matrix `d != 0`, join the names of the TRUE
# columns with ", " and store the resulting strings in `newCol`.
d$newCol <- apply(
  d != 0,
  MARGIN = 1,
  FUN = function(is_nonzero) paste(names(d)[is_nonzero], collapse = ", ")
)
更新
如果您需要根据值对字符串中的名称重新排序
# For each row, list the names of the nonzero columns ordered by increasing
# cell value, joined with ", ", and keep the result in `res`.
res <- d %>%
  rowwise() %>%
  do(
    data.frame(
      .,
      newcol = paste(
        names(d)[which(. != 0)[order(unlist(.[. != 0]))]],
        collapse = ", "
      )
    )
  ) %>%
  as.data.frame()
# Show the generated strings.
res$newcol
#[1] "G, J" "J, E, H, C, A" "I" "G"
#[5] "D" "A, B, I" "C" "H, J, G"
#[9] "F, I" "H, A"
与以下 base R 写法的结果相同:
apply(d, 1, function(x) toString(names(which(x!=0)[order(x[x!=0])])))
#[1] "G, J" "J, E, H, C, A" "I" "G"
#[5] "D" "A, B, I" "C" "H, J, G"
#[9] "F, I" "H, A"
# First two rows of the data, for comparing the ordering against the values.
d[seq_len(2), ]
# A B C D E F G H I J
#1 0.0000000 0 0.0000000 0 0.0000000 0 0.03736996 0.0000000 0 0.6297454
#2 0.9833502 0 0.7856878 0 0.2405307 0 0.00000000 0.5693827 0 0.1462821
这应该比逐行操作快得多:
library(data.table)
dt <- as.data.table(d) # or setDT to convert in place
# Locate every nonzero cell as a (row, col) index pair, then collapse the
# matching column names into one comma-separated string per row.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
res <- as.data.table(which(dt != 0, arr.ind = TRUE))[
  , paste(names(dt)[col], collapse = ","),
  by = row
]
# Assign by reference; rows without any nonzero cell do not appear in `res`
# and therefore end up with NA in `newcol`.
dt[res$row, newcol := res$V1]
dt
# A B C D E F G H I J newcol
# 1: 0.0000000 0.20367933 0.0000000 0.2839109 0.0000000 0.0000000 0.0000000 0.00000000 0 0 B,D
# 2: 0.6784289 0.05622799 0.8293551 0.0000000 0.0000000 0.0000000 0.0000000 0.00000000 0 0 A,B,C
# 3: 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000 0.1702222 0.0000000 0.00000000 0 0 F
# 4: 0.9259111 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.9930633 0.00000000 0 0 A,G
# 5: 0.4656185 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.7986647 0.30571672 0 0 A,G,H
# 6: 0.0000000 0.00000000 0.2893704 0.0000000 0.1617378 0.0000000 0.9861549 0.06057971 0 0 C,E,G,H
# 7: 0.0000000 0.00000000 0.1914990 0.0000000 0.5911986 0.0000000 0.0000000 0.00000000 0 0 C,E
# 8: 0.6359290 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.42127388 0 0 A,H
# 9: 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.4898069 0.00000000 0 0 G
#10: 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.00000000 0 0 NA
如果您愿意,将这个思路转换为 dplyr 写法应该很简单,这里留给读者作为练习。