跨行乘法和加法

Multiplying and Adding Values across Rows

我有这个数据框:

color <- c("AKZ", "ZZA", "KAK")    
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id = 1:100

sample_data = data.frame(id, color_1)


 id color_1
1  1     KAK
2  2     AKZ
3  3     KAK
4  4     KAK
5  5     AKZ
6  6     ZZA

假设有一个传说:

我想在上面的数据框中添加两列:

我想到了这样解决问题:

sample_data$first = substr(color_1,1,1)
sample_data$second = substr(color_1,2,2)
sample_data$third = substr(color_1,3,3)

sample_data$first_score = ifelse(sample_data$first == "K", 3, ifelse(sample_data$first == "A", 4, 6))
 
sample_data$second_score = ifelse(sample_data$second == "K", 3, ifelse(sample_data$second == "A", 4, 6))

sample_data$third_score = ifelse(sample_data$third == "K", 3, ifelse(sample_data$third == "A", 4, 6))

sample_data$add_score = sample_data$first_score + sample_data$second_score + sample_data$third_score

sample_data$multiply_score = sample_data$first_score * sample_data$second_score * sample_data$third_score

但我觉得如果“color_1”的长度再长一点,这种方式会耗费很长时间。给定一个得分传奇,有没有更快的方法来做到这一点?

谢谢!

我们可以用stri_replace_all_regex把你的color_1和算术运算符一起换成整数

这里我将您的值存储到向量中 color_1_convert。我们可以将其用作 stri_replace_all_regex 中的输入,以便更好地管理值。

library(dplyr)
library(stringi)

color_1_convert <- c("K" = "3", "A" = "4", "Z" = "6")

sample_data %>%
  group_by(id) %>%
  mutate(add_score = eval(parse(text = gsub("\+$", "", stri_replace_all_regex(color_1, names(color_1_convert), paste0(color_1_convert, "+"), vectorize_all = F)))),
         multiply_score = eval(parse(text = gsub("\*$", "", stri_replace_all_regex(color_1, names(color_1_convert), paste0(color_1_convert, "*"), vectorize_all = F)))))

# A tibble: 100 × 4
# Groups:   id [100]
      id color_1 add_score multiply_score
   <int> <chr>       <dbl>          <dbl>
 1     1 KAK            10             36
 2     2 ZZA            16            144
 3     3 AKZ            13             72
 4     4 ZZA            16            144
 5     5 AKZ            13             72
 6     6 AKZ            13             72
 7     7 AKZ            13             72
 8     8 KAK            10             36
 9     9 ZZA            16            144
10    10 AKZ            13             72
# … with 90 more rows

这是一个方法。
主要技巧是将 strsplit 转换为单个字符并将这些向量与图例匹配。然后将匹配的数字相加或相乘。

set.seed(2022)
color <- c("AKZ", "ZZA", "KAK")    
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id <- 1:100

sample_data = data.frame(id, color_1)

legend <- setNames(c(3, 4, 6), c("K", "A", "Z"))

add_mul <- function(x, l){
  add <- function(y, l){
    i <- match(y, names(l))
    sum(l[i])
  }
  mul <- function(y, l){
    i <- match(y, names(l))
    prod(l[i])
  }
  
  s <- strsplit(x, "")
  add_score <- sapply(s, add, l = l)
  mul_score <- sapply(s, mul, l = l)
  data.frame(add_score, mul_score)
}

sample_data <- cbind(sample_data, add_mul(sample_data$color_1, legend))
head(sample_data)
#>   id color_1 add_score mul_score
#> 1  1     ZZA        16       144
#> 2  2     KAK        10        36
#> 3  3     AKZ        13        72
#> 4  4     KAK        10        36
#> 5  5     AKZ        13        72
#> 6  6     KAK        10        36

reprex package (v2.0.1)

于 2022-03-10 创建

这是一种基于 tidyr::separate_rows() 后跟分组 dplyr::summarize() 的方法:

library(tidyverse)
set.seed(1)

legend <- c(K = 3, A = 4, Z = 6)

sample_data %>%
  mutate(decoded = color_1) %>%
  separate_rows(decoded, sep = "(?!^)") %>%
  mutate(decoded = legend[decoded]) %>%
  group_by(id, color_1) %>%
  summarize(
    add_score = sum(decoded),
    multiply_score = prod(decoded), 
    .groups = "drop"
  )

输出:

# A tibble: 100 x 4
      id color_1 add_score multiply_score
   <int> <chr>       <dbl>          <dbl>
 1     1 AKZ            13             72
 2     2 AKZ            13             72
 3     3 KAK            10             36
 4     4 ZZA            16            144
 5     5 AKZ            13             72
 6     6 ZZA            16            144
 7     7 ZZA            16            144
 8     8 KAK            10             36
 9     9 KAK            10             36
10    10 AKZ            13             72
# ... with 90 more rows

这是另一个使用 tidyverse 的选项。我使用 dplyr 中的 recode 根据 legend.

将字母更改为数字
library(tidyverse)

legend <- c(K = 3, A = 4, Z = 6)

sample_data %>%
  rowwise %>%
  mutate(code = list(recode(str_split(color_1, "", simplify = T), !!!legend)),
         add_score = sum(code),
         multiply_score = prod(code)) %>%
  select(-code)

输出

      id color_1 add_score multiply_score
   <int> <chr>       <dbl>          <dbl>
 1     1 AKZ            13             72
 2     2 AKZ            13             72
 3     3 KAK            10             36
 4     4 KAK            10             36
 5     5 AKZ            13             72
 6     6 AKZ            13             72
 7     7 KAK            10             36
 8     8 AKZ            13             72
 9     9 AKZ            13             72
10    10 AKZ            13             72
# … with 90 more rows

数据

set.seed(103)
color <- c("AKZ", "ZZA", "KAK")    
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id = 1:100

sample_data = data.frame(id, color_1)

基准

看起来 Rui Barradas 的解决方案是迄今为止最快的解决方案。

还有另一种选择,使用一些针对速度优化的库,stringi 用于字符串操作,Rfast 用于矩阵操作。请注意,当您的数据中存在任何 NA 值时,matrixStats 使用起来比 Rfast 更安全。

set.seed(2022)
color <- c("AKZ", "ZZA", "KAK")    
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id = 1:100
sample_data = data.frame(id, color_1)

m <- strsplit(sample_data[["color_1"]], "") |>
  unlist(use.names = F) |>
  stringi::stri_replace_all_regex(
    c("K", "A", "Z"),
    c("3", "4", "6"), vectorize_all = F) |>
  as.integer() |>
  matrix(ncol = 3, byrow = T)
sample_data$add_score <- Rfast::rowsums(m)
sample_data$mul_score <- Rfast::rowprods(m)

head(sample_data)
  id color_1 add_score mul_score
1  1     ZZA        16       144
2  2     KAK        10        36
3  3     AKZ        13        72
4  4     KAK        10        36
5  5     AKZ        13        72
6  6     KAK        10        36