将数据框中的多列与外部向量进行比较
Compare multiple columns from a dataframe with an outer vector
假设我们有这个向量:
products <- c(a, b, d, f, g, h, i, j, m, o, t, z)
和下面这样的数据框:
seller_a seller_b seller_c
a b d
d d e
g g g
h l h
t n t
z y w
我想在数据框中添加一行,用来表示每个 seller 列中有多少个元素与 products 向量相匹配。
换句话说,我的目标是让原始数据框看起来像这样:
seller_a seller_b seller_c
6 3 4
a b d
d d e
g g g
h l h
t n t
z y w
数据:
df <- tibble(
a = c("a", "d", "g", "h", "t", "z"),
b = c("b", "d", "g", "l", "n", "y"),
c = c("d", "e", "g", "h", "t", "w")
)
products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")
代码:
library(tidyverse)
df %>% rbind(map_int(., ~sum(products %in% .x)), .)
a b c
<chr> <chr> <chr>
1 6 3 4
2 a b d
3 d d e
4 g g g
5 h l h
6 t n t
7 z y w
请注意,第一行的数字会被强制转换为字符型。此外,如果列是因子(factor),这段代码将无法工作(这就是我使用 tibble 的原因)。
使用基础 R,您可以执行以下操作
# ---- Example data ----
products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")
seller_a <- c("a", "d", "g", "h", "t", "z")
seller_b <- c("b", "d", "g", "l", "n", "y")
seller_c <- c("d", "e", "g", "h", "t", "w")
d <- data.frame(seller_a, seller_b, seller_c)

# ---- Count matches per seller and prepend them as the first row ----
# For every column, count how many of its entries occur in `products`.
a <- unname(vapply(d, function(col) sum(col %in% products), integer(1)))
# The counts end up as strings because each column holds character data.
d <- rbind(a, d)
data.table
的解决方案:
library(data.table)

# Reference vector and example sellers table.
products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")
DT <- data.table(
  seller_a = c("a", "d", "g", "h", "t", "z"),
  seller_b = c("b", "d", "g", "l", "n", "y"),
  seller_c = c("d", "e", "g", "h", "t", "w")
)

# One-row table: for each seller column, how many products occur in it.
DT1 <- DT[, lapply(.SD, function(seller) sum(products %in% seller))]
# -------------------
> DT1
seller_a seller_b seller_c
1: 6 3 4
> rbind(DT1, DT)
seller_a seller_b seller_c
1: 6 3 4
2: a b d
3: d d e
4: g g g
5: h l h
6: t n t
7: z y w
基础 R 选项 -
rbind(sapply(df, function(x) sum(x %in% products)), df)
# a b c
#1 6 3 4
#2 a b d
#3 d d e
#4 g g g
#5 h l h
#6 t n t
#7 z y w
您还可以使用 tibble
中的 add_row
函数向您的数据集添加额外的行:
library(dplyr)
df %>%
add_row(seller_a = as.character(sum(df$seller_a %in% products)),
seller_b = as.character(sum(df$seller_b %in% products)),
seller_c = as.character(sum(df$seller_c %in% products)),
.before = 1)
# A tibble: 7 x 3
seller_a seller_b seller_c
<chr> <chr> <chr>
1 6 3 4
2 a b d
3 d d e
4 g g g
5 h l h
6 t n t
7 z y w
使用文末“备注”部分中以可重现形式给出的输入:
as.data.frame(lapply(DF, function(x) c(sum(x %in% products), x)))
## seller_a seller_b seller_c
## 1 6 3 4
## 2 a b d
## 3 d d e
## ...snip...
数值向量
但是,一列的所有元素必须属于同一类型,这样数字将被强制转换为字符。您可能更愿意只创建一个单独的数值向量。
sapply(DF, function(x) sum(x %in% products))
## seller_a seller_b seller_c
## 6 3 4
S3
这可能有点小题大做,但可以创建一个新的 S3 类,将匹配的产品数量存储为数值属性而不是数据框中的一行,只在打印时才将其显示为一行。
# Generic constructor for the "data.frame1" class: a data frame that carries
# per-column match counts in its "product" attribute.
as.data.frame1 <- function(x, ...) UseMethod("as.data.frame1")

# Method for data frames.
#
# x        data frame whose columns are compared against `product`.
# product  vector of reference values.
# ...      unused; accepted for compatibility with the generic.
#
# Returns `x` with class "data.frame1" prepended and a "product" attribute:
# a named integer vector giving, for each column, the number of its entries
# found in `product`.
as.data.frame1.data.frame <- function(x, product, ...) {
  # unique() avoids duplicating the class when `x` is already a data.frame1.
  out <- structure(x, class = unique(c("data.frame1", class(x))))
  # Use the arguments `x` and `product`, not objects from the calling
  # environment, so the result reflects the data actually passed in.
  attr(out, "product") <- vapply(
    x,
    function(col) sum(col %in% product),
    integer(1)
  )
  out
}
# format() method for "data.frame1": puts the stored "product" counts on top
# of the data as an extra first row and formats the result as a plain
# data frame.
format.data.frame1 <- function(x, ...) {
  stacked <- rbind(attr(x, "product"), x)
  # as.data.frame() drops the "data.frame1" class before formatting.
  format(as.data.frame(stacked))
}
# print() method for "data.frame1": prints the formatted representation of
# `x`; any extra arguments are forwarded to print().
print.data.frame1 <- function(x, ...) {
  formatted <- format(x)
  print(formatted, ...)
}
DF1 <- as.data.frame1(DF, products)
DF1
## seller_a seller_b seller_c
## 1 6 3 4
## 2 a b d
## 3 d d e
## ...snip...
attr(DF1, "product") # numeric vector
## seller_a seller_b seller_c
## 6 3 4
as.data.frame(DF1)
## seller_a seller_b seller_c
## 1 a b d
## 2 d d e
## 3 g g g
## ...snip...
备注
products <- scan(text = "a, b, d, f, g, h, i, j, m, o, t, z",
what = "", sep = ",", strip.white = TRUE)
Lines <- "seller_a seller_b seller_c
a b d
d d e
g g g
h l h
t n t
z y w"
DF <- read.table(text = Lines, header = TRUE)
在 dplyr
中使用 summarise
和 across
library(dplyr)
DF %>% summarise(across(everything(), ~as.character(sum(. %in% products)))) %>%
bind_rows(., DF)
#> seller_a seller_b seller_c
#> 1 6 3 4
#> 2 a b d
#> 3 d d e
#> 4 g g g
#> 5 h l h
#> 6 t n t
#> 7 z y w
由 reprex package (v2.0.0)
创建于 2021-06-07
我们可以使用
library(dplyr)
df %>%
summarise(across(everything(), ~ c(sum(products %in% .), .)))
-输出
# A tibble: 7 x 3
a b c
<chr> <chr> <chr>
1 6 3 4
2 a b d
3 d d e
4 g g g
5 h l h
6 t n t
7 z y w
假设我们有这个向量:
products <- c(a, b, d, f, g, h, i, j, m, o, t, z)
和下面这样的数据框:
seller_a seller_b seller_c
a b d
d d e
g g g
h l h
t n t
z y w
我想在数据框中添加一行,用来表示每个 seller 列中有多少个元素与 products 向量相匹配。
换句话说,我的目标是让原始数据框看起来像这样:
seller_a seller_b seller_c
6 3 4
a b d
d d e
g g g
h l h
t n t
z y w
数据:
df <- tibble(
a = c("a", "d", "g", "h", "t", "z"),
b = c("b", "d", "g", "l", "n", "y"),
c = c("d", "e", "g", "h", "t", "w")
)
products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")
代码:
library(tidyverse)
df %>% rbind(map_int(., ~sum(products %in% .x)), .)
a b c
<chr> <chr> <chr>
1 6 3 4
2 a b d
3 d d e
4 g g g
5 h l h
6 t n t
7 z y w
请注意,第一行的数字会被强制转换为字符型。此外,如果列是因子(factor),这段代码将无法工作(这就是我使用 tibble 的原因)。
使用基础 R,您可以执行以下操作
# ---- Example data ----
products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")
seller_a <- c("a", "d", "g", "h", "t", "z")
seller_b <- c("b", "d", "g", "l", "n", "y")
seller_c <- c("d", "e", "g", "h", "t", "w")
d <- data.frame(seller_a, seller_b, seller_c)

# ---- Count matches per seller and prepend them as the first row ----
# For every column, count how many of its entries occur in `products`.
a <- unname(vapply(d, function(col) sum(col %in% products), integer(1)))
# The counts end up as strings because each column holds character data.
d <- rbind(a, d)
data.table
的解决方案:
library(data.table)

# Reference vector and example sellers table.
products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")
DT <- data.table(
  seller_a = c("a", "d", "g", "h", "t", "z"),
  seller_b = c("b", "d", "g", "l", "n", "y"),
  seller_c = c("d", "e", "g", "h", "t", "w")
)

# One-row table: for each seller column, how many products occur in it.
DT1 <- DT[, lapply(.SD, function(seller) sum(products %in% seller))]
# -------------------
> DT1
seller_a seller_b seller_c
1: 6 3 4
> rbind(DT1, DT)
seller_a seller_b seller_c
1: 6 3 4
2: a b d
3: d d e
4: g g g
5: h l h
6: t n t
7: z y w
基础 R 选项 -
rbind(sapply(df, function(x) sum(x %in% products)), df)
# a b c
#1 6 3 4
#2 a b d
#3 d d e
#4 g g g
#5 h l h
#6 t n t
#7 z y w
您还可以使用 tibble
中的 add_row
函数向您的数据集添加额外的行:
library(dplyr)
df %>%
add_row(seller_a = as.character(sum(df$seller_a %in% products)),
seller_b = as.character(sum(df$seller_b %in% products)),
seller_c = as.character(sum(df$seller_c %in% products)),
.before = 1)
# A tibble: 7 x 3
seller_a seller_b seller_c
<chr> <chr> <chr>
1 6 3 4
2 a b d
3 d d e
4 g g g
5 h l h
6 t n t
7 z y w
使用文末“备注”部分中以可重现形式给出的输入:
as.data.frame(lapply(DF, function(x) c(sum(x %in% products), x)))
## seller_a seller_b seller_c
## 1 6 3 4
## 2 a b d
## 3 d d e
## ...snip...
数值向量
但是,一列的所有元素必须属于同一类型,这样数字将被强制转换为字符。您可能更愿意只创建一个单独的数值向量。
sapply(DF, function(x) sum(x %in% products))
## seller_a seller_b seller_c
## 6 3 4
S3
这可能有点小题大做,但可以创建一个新的 S3 类,将匹配的产品数量存储为数值属性而不是数据框中的一行,只在打印时才将其显示为一行。
# Generic constructor for the "data.frame1" class: a data frame that carries
# per-column match counts in its "product" attribute.
as.data.frame1 <- function(x, ...) UseMethod("as.data.frame1")

# Method for data frames.
#
# x        data frame whose columns are compared against `product`.
# product  vector of reference values.
# ...      unused; accepted for compatibility with the generic.
#
# Returns `x` with class "data.frame1" prepended and a "product" attribute:
# a named integer vector giving, for each column, the number of its entries
# found in `product`.
as.data.frame1.data.frame <- function(x, product, ...) {
  # unique() avoids duplicating the class when `x` is already a data.frame1.
  out <- structure(x, class = unique(c("data.frame1", class(x))))
  # Use the arguments `x` and `product`, not objects from the calling
  # environment, so the result reflects the data actually passed in.
  attr(out, "product") <- vapply(
    x,
    function(col) sum(col %in% product),
    integer(1)
  )
  out
}
# format() method for "data.frame1": puts the stored "product" counts on top
# of the data as an extra first row and formats the result as a plain
# data frame.
format.data.frame1 <- function(x, ...) {
  stacked <- rbind(attr(x, "product"), x)
  # as.data.frame() drops the "data.frame1" class before formatting.
  format(as.data.frame(stacked))
}
# print() method for "data.frame1": prints the formatted representation of
# `x`; any extra arguments are forwarded to print().
print.data.frame1 <- function(x, ...) {
  formatted <- format(x)
  print(formatted, ...)
}
DF1 <- as.data.frame1(DF, products)
DF1
## seller_a seller_b seller_c
## 1 6 3 4
## 2 a b d
## 3 d d e
## ...snip...
attr(DF1, "product") # numeric vector
## seller_a seller_b seller_c
## 6 3 4
as.data.frame(DF1)
## seller_a seller_b seller_c
## 1 a b d
## 2 d d e
## 3 g g g
## ...snip...
备注
products <- scan(text = "a, b, d, f, g, h, i, j, m, o, t, z",
what = "", sep = ",", strip.white = TRUE)
Lines <- "seller_a seller_b seller_c
a b d
d d e
g g g
h l h
t n t
z y w"
DF <- read.table(text = Lines, header = TRUE)
在 dplyr 中使用 summarise 和 across
library(dplyr)
DF %>% summarise(across(everything(), ~as.character(sum(. %in% products)))) %>%
bind_rows(., DF)
#> seller_a seller_b seller_c
#> 1 6 3 4
#> 2 a b d
#> 3 d d e
#> 4 g g g
#> 5 h l h
#> 6 t n t
#> 7 z y w
由 reprex package (v2.0.0)
创建于 2021-06-07
我们可以使用
library(dplyr)
df %>%
summarise(across(everything(), ~ c(sum(products %in% .), .)))
-输出
# A tibble: 7 x 3
a b c
<chr> <chr> <chr>
1 6 3 4
2 a b d
3 d d e
4 g g g
5 h l h
6 t n t
7 z y w