将数据框中的多列与外部向量进行比较

Compare multiple columns from a dataframe with an outer vector

假设我们有这个向量:

# Product catalogue to compare against. The values are strings, so they must
# be quoted; bare names would be looked up as variables and fail.
products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")

和下面这样的数据框:

seller_a seller_b seller_c
a        b        d
d        d        e
g        g        g
h        l        h
t        n        t
z        y        w

我想在数据框中包含一个额外的行,它会指示每个 seller 列与 products 向量相匹配的范围。

换句话说,我的目标是让原始数据框看起来像这样:

seller_a seller_b seller_c
6        3        4
a        b        d
d        d        e
g        g        g
h        l        h
t        n        t
z        y        w

数据:

# Example data: one character column per seller. `tibble()` is namespace-
# qualified so this snippet runs even before any library() call is made.
df <- tibble::tibble(
  a = c("a", "d", "g", "h", "t", "z"),
  b = c("b", "d", "g", "l", "n", "y"),
  c = c("d", "e", "g", "h", "t", "w")
)

# Reference vector every seller column is compared against.
products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")

代码:

library(tidyverse)

# For each seller column, count how many products occur in it, then stack
# that row of counts on top of the original rows.
rbind(
  map_int(df, function(seller) sum(products %in% seller)),
  df
)

  a     b     c    
  <chr> <chr> <chr>
1 6     3     4    
2 a     b     d    
3 d     d     e    
4 g     g     g    
5 h     l     h    
6 t     n     t    
7 z     y     w 

请注意,第一行的数字将被转换为字符。此外,如果列是因子(factor),这段代码将不起作用(这就是我使用 tibble 的原因)。

使用基础 R,您可以执行以下操作

# ---- your data ----
products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")
seller_a <- c("a", "d", "g", "h", "t", "z")
seller_b <- c("b", "d", "g", "l", "n", "y")
seller_c <- c("d", "e", "g", "h", "t", "w")
# Build the data frame directly instead of via cbind() (which goes through a
# character matrix). stringsAsFactors = FALSE keeps the columns as character
# on R < 4.0 as well, so the count row below can be prepended without
# producing NA factor levels.
d <- data.frame(seller_a, seller_b, seller_c, stringsAsFactors = FALSE)
# -------------------

# Number of entries of each seller column that are found in `products`;
# vapply() handles any number of columns and always returns an integer vector.
a <- vapply(d, function(seller) sum(seller %in% products), integer(1))

# Prepend the counts as the first row (they are coerced to character).
d <- rbind(a, d)

data.table 的解决方案:

library(data.table)

products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")

DT <- data.table(
  seller_a = c("a", "d", "g", "h", "t", "z"),
  seller_b = c("b", "d", "g", "l", "n", "y"),
  seller_c = c("d", "e", "g", "h", "t", "w")
)

# One-row table holding, for every column of DT, the number of products that
# occur in that column. lapply(.SD, ...) covers all columns without naming
# them, and sum() on the logical vector replaces length(which(... == TRUE)).
DT1 <- DT[, lapply(.SD, function(seller) sum(products %in% seller))]

# -------------------

> DT1
   seller_a seller_b seller_c
1:        6        3        4
> rbind(DT1, DT)
   seller_a seller_b seller_c
1:        6        3        4
2:        a        b        d
3:        d        d        e
4:        g        g        g
5:        h        l        h
6:        t        n        t
7:        z        y        w

基础 R 选项 -

# Count, per column, the entries found in `products` and put the counts on
# top of the data. vapply() guarantees an integer vector whatever the input
# (sapply() would return a list for a zero-column data frame).
rbind(vapply(df, function(col) sum(col %in% products), integer(1)), df)

#  a b c
#1 6 3 4
#2 a b d
#3 d d e
#4 g g g
#5 h l h
#6 t n t
#7 z y w

您还可以使用 tibble 中的 add_row 函数向您的数据集添加额外的行:

library(dplyr)

# Insert the per-seller match counts (as strings, to fit the character
# columns) as a new first row.
add_row(
  df,
  seller_a = as.character(sum(df[["seller_a"]] %in% products)),
  seller_b = as.character(sum(df[["seller_b"]] %in% products)),
  seller_c = as.character(sum(df[["seller_c"]] %in% products)),
  .before = 1
)

# A tibble: 7 x 3
  seller_a seller_b seller_c
  <chr>    <chr>    <chr>   
1 6        3        4       
2 a        b        d       
3 d        d        e       
4 g        g        g       
5 h        l        h       
6 t        n        t       
7 z        y        w 

使用文末"备注"中以可重现形式给出的输入:

# Prepend to every column the number of its entries found in `products`,
# then reassemble the columns into a data frame.
as.data.frame(
  lapply(DF, function(col) {
    n_matched <- sum(col %in% products)
    c(n_matched, col)
  })
)
##   seller_a seller_b seller_c
## 1        6        3        4
## 2        a        b        d
## 3        d        d        e
## ...snip...

数值向量

但是,一列的所有元素必须属于同一类型,这样数字将被强制转换为字符。您可能更愿意只创建一个单独的数值向量。

# Named integer vector: for each column of DF, how many entries are in
# `products`. vapply() keeps the result type fixed regardless of the input.
vapply(DF, function(col) sum(col %in% products), integer(1))
## seller_a seller_b seller_c 
##        6        3        4 

S3

这可能有点小题大做,但可以创建一个新的 S3 类,把各列的匹配数量存储为数值型属性而不是一行,只在打印时将其显示为一行。

# Generic: convert an object to a "data.frame1", i.e. a data frame that
# carries per-column match counts in its "product" attribute.
as.data.frame1 <- function(x, ...) UseMethod("as.data.frame1")

# Method for data frames.
#   x       data frame whose columns are compared against `product`
#   product vector of reference values
# Returns `x` with class "data.frame1" prepended and a "product" attribute:
# a named integer vector giving, per column, the number of entries of that
# column that occur in `product`.
as.data.frame1.data.frame <- function(x, product, ...) {
  out <- structure(x, class = c("data.frame1", class(x)))
  # Use the arguments that were passed in, not objects that happen to exist
  # in the calling environment.
  attr(out, "product") <- vapply(
    x,
    function(col) sum(col %in% product),
    integer(1)
  )
  out
}

# Format method: shows the stored "product" counts as an extra first row
# above the data, and returns the formatted plain data frame.
format.data.frame1 <- function(x, ...) {
  counts <- attr(x, "product")
  with_counts <- as.data.frame(rbind(counts, x))
  format(with_counts)
}

# Print method: prints the formatted representation of `x`, forwarding `...`
# to print(). Following the convention for print methods, the original
# object is returned invisibly rather than the formatted copy.
print.data.frame1 <- function(x, ...) {
  print(format(x), ...)
  invisible(x)
}

DF1 <- as.data.frame1(DF, products)

DF1
##   seller_a seller_b seller_c
## 1        6        3        4
## 2        a        b        d
## 3        d        d        e
## ...snip...

attr(DF1, "product")  # numeric vector
## seller_a seller_b seller_c 
##        6        3        4 

as.data.frame(DF1)
##   seller_a seller_b seller_c
## 1        a        b        d
## 2        d        d        e
## 3        g        g        g
## ...snip...

备注

# Reproducible input.
# `products`: character vector parsed from a comma-separated string.
products <- scan(text = "a, b, d, f, g, h, i, j, m, o, t, z", 
  what = "", sep = ",", strip.white = TRUE)
Lines <- "seller_a seller_b seller_c
a        b        d
d        d        e
g        g        g
h        l        h
t        n        t
z        y        w"
# stringsAsFactors = FALSE keeps the seller columns as character vectors on
# R < 4.0 too, where the default would turn them into factors.
DF <- read.table(text = Lines, header = TRUE, stringsAsFactors = FALSE)

在 dplyr 中使用 summarise 和 across:

library(dplyr)

# Collapse every column to its match count (as a string, so it can share a
# column with the character data), then append the original rows below it.
DF %>%
  summarise(
    across(everything(), function(col) as.character(sum(col %in% products)))
  ) %>%
  bind_rows(DF)
#>   seller_a seller_b seller_c
#> 1        6        3        4
#> 2        a        b        d
#> 3        d        d        e
#> 4        g        g        g
#> 5        h        l        h
#> 6        t        n        t
#> 7        z        y        w

由 reprex 包 (v2.0.0) 创建于 2021-06-07

我们可以使用

library(dplyr)

# Each column becomes c(<match count>, <original values>), so the result has
# more than one row. reframe() (dplyr >= 1.1.0) is the verb meant for that;
# returning several rows from summarise() is deprecated.
df %>%
  reframe(across(everything(), ~ c(sum(products %in% .x), .x)))

-输出

# A tibble: 7 x 3
  a     b     c    
  <chr> <chr> <chr>
1 6     3     4    
2 a     b     d    
3 d     d     e    
4 g     g     g    
5 h     l     h    
6 t     n     t    
7 z     y     w