跨行乘法和加法
Multiplying and Adding Values across Rows
我有这个数据框:
color <- c("AKZ", "ZZA", "KAK")
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id = 1:100
sample_data = data.frame(id, color_1)
id color_1
1 1 KAK
2 2 AKZ
3 3 KAK
4 4 KAK
5 5 AKZ
6 6 ZZA
假设有一个传说:
- K=3
- 一个=4
- Z=6
我想在上面的数据框中添加两列:
- sample_data$add_score :例如KAK = K + A + K = 3 + 4 + 3 = 10
- sample_data$multiply_score :例如KAK = K * A * K = 3 * 4 * 3 = 36
我想到了这样解决问题:
sample_data$first = substr(color_1,1,1)
sample_data$second = substr(color_1,2,2)
sample_data$third = substr(color_1,3,3)
sample_data$first_score = ifelse(sample_data$first == "K", 3, ifelse(sample_data$first == "A", 4, 6))
sample_data$second_score = ifelse(sample_data$second == "K", 3, ifelse(sample_data$second == "A", 4, 6))
sample_data$third_score = ifelse(sample_data$third == "K", 3, ifelse(sample_data$third == "A", 4, 6))
sample_data$add_score = sample_data$first_score + sample_data$second_score + sample_data$third_score
sample_data$multiply_score = sample_data$first_score * sample_data$second_score * sample_data$third_score
但我觉得如果“color_1”的长度再长一点,这种方式会耗费很长时间。给定一个得分传奇,有没有更快的方法来做到这一点?
谢谢!
我们可以用stri_replace_all_regex
把你的color_1
和算术运算符一起换成整数
这里我将您的值存储到向量中 color_1_convert
。我们可以将其用作 stri_replace_all_regex
中的输入,以便更好地管理值。
library(dplyr)
library(stringi)
color_1_convert <- c("K" = "3", "A" = "4", "Z" = "6")
sample_data %>%
group_by(id) %>%
mutate(add_score = eval(parse(text = gsub("\+$", "", stri_replace_all_regex(color_1, names(color_1_convert), paste0(color_1_convert, "+"), vectorize_all = F)))),
multiply_score = eval(parse(text = gsub("\*$", "", stri_replace_all_regex(color_1, names(color_1_convert), paste0(color_1_convert, "*"), vectorize_all = F)))))
# A tibble: 100 × 4
# Groups: id [100]
id color_1 add_score multiply_score
<int> <chr> <dbl> <dbl>
1 1 KAK 10 36
2 2 ZZA 16 144
3 3 AKZ 13 72
4 4 ZZA 16 144
5 5 AKZ 13 72
6 6 AKZ 13 72
7 7 AKZ 13 72
8 8 KAK 10 36
9 9 ZZA 16 144
10 10 AKZ 13 72
# … with 90 more rows
这是一个方法。
主要技巧是将 strsplit
转换为单个字符并将这些向量与图例匹配。然后将匹配的数字相加或相乘。
set.seed(2022)
color <- c("AKZ", "ZZA", "KAK")
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id <- 1:100
sample_data = data.frame(id, color_1)
legend <- setNames(c(3, 4, 6), c("K", "A", "Z"))
add_mul <- function(x, l){
add <- function(y, l){
i <- match(y, names(l))
sum(l[i])
}
mul <- function(y, l){
i <- match(y, names(l))
prod(l[i])
}
s <- strsplit(x, "")
add_score <- sapply(s, add, l = l)
mul_score <- sapply(s, mul, l = l)
data.frame(add_score, mul_score)
}
sample_data <- cbind(sample_data, add_mul(sample_data$color_1, legend))
head(sample_data)
#> id color_1 add_score mul_score
#> 1 1 ZZA 16 144
#> 2 2 KAK 10 36
#> 3 3 AKZ 13 72
#> 4 4 KAK 10 36
#> 5 5 AKZ 13 72
#> 6 6 KAK 10 36
由 reprex package (v2.0.1)
于 2022-03-10 创建
这是一种基于 tidyr::separate_rows()
后跟分组 dplyr::summarize()
的方法:
library(tidyverse)
set.seed(1)
legend <- c(K = 3, A = 4, Z = 6)
sample_data %>%
mutate(decoded = color_1) %>%
separate_rows(decoded, sep = "(?!^)") %>%
mutate(decoded = legend[decoded]) %>%
group_by(id, color_1) %>%
summarize(
add_score = sum(decoded),
multiply_score = prod(decoded),
.groups = "drop"
)
输出:
# A tibble: 100 x 4
id color_1 add_score multiply_score
<int> <chr> <dbl> <dbl>
1 1 AKZ 13 72
2 2 AKZ 13 72
3 3 KAK 10 36
4 4 ZZA 16 144
5 5 AKZ 13 72
6 6 ZZA 16 144
7 7 ZZA 16 144
8 8 KAK 10 36
9 9 KAK 10 36
10 10 AKZ 13 72
# ... with 90 more rows
这是另一个使用 tidyverse
的选项。我使用 dplyr
中的 recode
根据 legend
.
将字母更改为数字
library(tidyverse)
legend <- c(K = 3, A = 4, Z = 6)
sample_data %>%
rowwise %>%
mutate(code = list(recode(str_split(color_1, "", simplify = T), !!!legend)),
add_score = sum(code),
multiply_score = prod(code)) %>%
select(-code)
输出
id color_1 add_score multiply_score
<int> <chr> <dbl> <dbl>
1 1 AKZ 13 72
2 2 AKZ 13 72
3 3 KAK 10 36
4 4 KAK 10 36
5 5 AKZ 13 72
6 6 AKZ 13 72
7 7 KAK 10 36
8 8 AKZ 13 72
9 9 AKZ 13 72
10 10 AKZ 13 72
# … with 90 more rows
数据
set.seed(103)
color <- c("AKZ", "ZZA", "KAK")
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id = 1:100
sample_data = data.frame(id, color_1)
基准
看起来 Rui Barradas 的解决方案是迄今为止最快的解决方案。
还有另一种选择,使用一些针对速度优化的库,stringi
用于字符串操作,Rfast
用于矩阵操作。请注意,当您的数据中存在任何 NA
值时,matrixStats
使用起来比 Rfast 更安全。
set.seed(2022)
color <- c("AKZ", "ZZA", "KAK")
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id = 1:100
sample_data = data.frame(id, color_1)
m <- strsplit(sample_data[["color_1"]], "") |>
unlist(use.names = F) |>
stringi::stri_replace_all_regex(
c("K", "A", "Z"),
c("3", "4", "6"), vectorize_all = F) |>
as.integer() |>
matrix(ncol = 3, byrow = T)
sample_data$add_score <- Rfast::rowsums(m)
sample_data$mul_score <- Rfast::rowprods(m)
head(sample_data)
id color_1 add_score mul_score
1 1 ZZA 16 144
2 2 KAK 10 36
3 3 AKZ 13 72
4 4 KAK 10 36
5 5 AKZ 13 72
6 6 KAK 10 36
我有这个数据框:
color <- c("AKZ", "ZZA", "KAK")
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id = 1:100
sample_data = data.frame(id, color_1)
id color_1
1 1 KAK
2 2 AKZ
3 3 KAK
4 4 KAK
5 5 AKZ
6 6 ZZA
假设有一个传说:
- K=3
- 一个=4
- Z=6
我想在上面的数据框中添加两列:
- sample_data$add_score :例如KAK = K + A + K = 3 + 4 + 3 = 10
- sample_data$multiply_score :例如KAK = K * A * K = 3 * 4 * 3 = 36
我想到了这样解决问题:
sample_data$first = substr(color_1,1,1)
sample_data$second = substr(color_1,2,2)
sample_data$third = substr(color_1,3,3)
sample_data$first_score = ifelse(sample_data$first == "K", 3, ifelse(sample_data$first == "A", 4, 6))
sample_data$second_score = ifelse(sample_data$second == "K", 3, ifelse(sample_data$second == "A", 4, 6))
sample_data$third_score = ifelse(sample_data$third == "K", 3, ifelse(sample_data$third == "A", 4, 6))
sample_data$add_score = sample_data$first_score + sample_data$second_score + sample_data$third_score
sample_data$multiply_score = sample_data$first_score * sample_data$second_score * sample_data$third_score
但我觉得如果“color_1”的长度再长一点,这种方式会耗费很长时间。给定一个得分传奇,有没有更快的方法来做到这一点?
谢谢!
我们可以用stri_replace_all_regex
把你的color_1
和算术运算符一起换成整数
这里我将您的值存储到向量中 color_1_convert
。我们可以将其用作 stri_replace_all_regex
中的输入,以便更好地管理值。
library(dplyr)
library(stringi)
color_1_convert <- c("K" = "3", "A" = "4", "Z" = "6")
sample_data %>%
group_by(id) %>%
mutate(add_score = eval(parse(text = gsub("\+$", "", stri_replace_all_regex(color_1, names(color_1_convert), paste0(color_1_convert, "+"), vectorize_all = F)))),
multiply_score = eval(parse(text = gsub("\*$", "", stri_replace_all_regex(color_1, names(color_1_convert), paste0(color_1_convert, "*"), vectorize_all = F)))))
# A tibble: 100 × 4
# Groups: id [100]
id color_1 add_score multiply_score
<int> <chr> <dbl> <dbl>
1 1 KAK 10 36
2 2 ZZA 16 144
3 3 AKZ 13 72
4 4 ZZA 16 144
5 5 AKZ 13 72
6 6 AKZ 13 72
7 7 AKZ 13 72
8 8 KAK 10 36
9 9 ZZA 16 144
10 10 AKZ 13 72
# … with 90 more rows
这是一个方法。
主要技巧是将 strsplit
转换为单个字符并将这些向量与图例匹配。然后将匹配的数字相加或相乘。
set.seed(2022)
color <- c("AKZ", "ZZA", "KAK")
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id <- 1:100
sample_data = data.frame(id, color_1)
legend <- setNames(c(3, 4, 6), c("K", "A", "Z"))
add_mul <- function(x, l){
add <- function(y, l){
i <- match(y, names(l))
sum(l[i])
}
mul <- function(y, l){
i <- match(y, names(l))
prod(l[i])
}
s <- strsplit(x, "")
add_score <- sapply(s, add, l = l)
mul_score <- sapply(s, mul, l = l)
data.frame(add_score, mul_score)
}
sample_data <- cbind(sample_data, add_mul(sample_data$color_1, legend))
head(sample_data)
#> id color_1 add_score mul_score
#> 1 1 ZZA 16 144
#> 2 2 KAK 10 36
#> 3 3 AKZ 13 72
#> 4 4 KAK 10 36
#> 5 5 AKZ 13 72
#> 6 6 KAK 10 36
由 reprex package (v2.0.1)
于 2022-03-10 创建这是一种基于 tidyr::separate_rows()
后跟分组 dplyr::summarize()
的方法:
library(tidyverse)
set.seed(1)
legend <- c(K = 3, A = 4, Z = 6)
sample_data %>%
mutate(decoded = color_1) %>%
separate_rows(decoded, sep = "(?!^)") %>%
mutate(decoded = legend[decoded]) %>%
group_by(id, color_1) %>%
summarize(
add_score = sum(decoded),
multiply_score = prod(decoded),
.groups = "drop"
)
输出:
# A tibble: 100 x 4
id color_1 add_score multiply_score
<int> <chr> <dbl> <dbl>
1 1 AKZ 13 72
2 2 AKZ 13 72
3 3 KAK 10 36
4 4 ZZA 16 144
5 5 AKZ 13 72
6 6 ZZA 16 144
7 7 ZZA 16 144
8 8 KAK 10 36
9 9 KAK 10 36
10 10 AKZ 13 72
# ... with 90 more rows
这是另一个使用 tidyverse
的选项。我使用 dplyr
中的 recode
根据 legend
.
library(tidyverse)
legend <- c(K = 3, A = 4, Z = 6)
sample_data %>%
rowwise %>%
mutate(code = list(recode(str_split(color_1, "", simplify = T), !!!legend)),
add_score = sum(code),
multiply_score = prod(code)) %>%
select(-code)
输出
id color_1 add_score multiply_score
<int> <chr> <dbl> <dbl>
1 1 AKZ 13 72
2 2 AKZ 13 72
3 3 KAK 10 36
4 4 KAK 10 36
5 5 AKZ 13 72
6 6 AKZ 13 72
7 7 KAK 10 36
8 8 AKZ 13 72
9 9 AKZ 13 72
10 10 AKZ 13 72
# … with 90 more rows
数据
set.seed(103)
color <- c("AKZ", "ZZA", "KAK")
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id = 1:100
sample_data = data.frame(id, color_1)
基准
看起来 Rui Barradas 的解决方案是迄今为止最快的解决方案。
还有另一种选择,使用一些针对速度优化的库,stringi
用于字符串操作,Rfast
用于矩阵操作。请注意,当您的数据中存在任何 NA
值时,matrixStats
使用起来比 Rfast 更安全。
set.seed(2022)
color <- c("AKZ", "ZZA", "KAK")
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id = 1:100
sample_data = data.frame(id, color_1)
m <- strsplit(sample_data[["color_1"]], "") |>
unlist(use.names = F) |>
stringi::stri_replace_all_regex(
c("K", "A", "Z"),
c("3", "4", "6"), vectorize_all = F) |>
as.integer() |>
matrix(ncol = 3, byrow = T)
sample_data$add_score <- Rfast::rowsums(m)
sample_data$mul_score <- Rfast::rowprods(m)
head(sample_data)
id color_1 add_score mul_score
1 1 ZZA 16 144
2 2 KAK 10 36
3 3 AKZ 13 72
4 4 KAK 10 36
5 5 AKZ 13 72
6 6 KAK 10 36