当它实际上是一个因素时,是否有更快的方法来重新编码字符数据?
Is there a faster way to recode character data when it's actually a factor?
我经常处理需要重新编码的字符数据。一种常见的情况是,正在记录的字符向量本质上是 的一个因素,但不一定是 class 中的一个因素。例如,考虑一个 chr
向量,例如 vec
:
set.seed(2021)
vec <- sample(rep(c("animal_dog_xyz", "animal_cat_abc", "animal_alligator_tyl"), 10))
vec
#> [1] "animal_dog_xyz" "animal_alligator_tyl" "animal_cat_abc"
#> [4] "animal_cat_abc" "animal_alligator_tyl" "animal_alligator_tyl"
#> [7] "animal_cat_abc" "animal_cat_abc" "animal_cat_abc"
#> [10] "animal_dog_xyz" "animal_dog_xyz" "animal_cat_abc"
#> [13] "animal_alligator_tyl" "animal_alligator_tyl" "animal_alligator_tyl"
#> [16] "animal_cat_abc" "animal_dog_xyz" "animal_alligator_tyl"
#> [19] "animal_alligator_tyl" "animal_cat_abc" "animal_dog_xyz"
#> [22] "animal_cat_abc" "animal_cat_abc" "animal_dog_xyz"
#> [25] "animal_dog_xyz" "animal_dog_xyz" "animal_dog_xyz"
#> [28] "animal_dog_xyz" "animal_alligator_tyl" "animal_alligator_tyl"
由 reprex package (v2.0.0)
于 2021-07-19 创建
如果我想重新编码这个向量并只提取动物名称,我会选择适合字符数据的解决方案:
library(stringr)
sapply(str_split(vec, "_", n = 3), `[`, 2)
#> [1] "dog" "alligator" "cat" "cat" "alligator" "alligator"
#> [7] "cat" "cat" "cat" "dog" "dog" "cat"
#> [13] "alligator" "alligator" "alligator" "cat" "dog" "alligator"
#> [19] "alligator" "cat" "dog" "cat" "cat" "dog"
#> [25] "dog" "dog" "dog" "dog" "alligator" "alligator"
问题
如果向量很长,这样的重新编码过程需要很长时间。 R 将遍历每个向量元素并应用该过程。鉴于向量中只有 3 个唯一值,这似乎效率低下。换句话说,我们不需要一个一个地检查元素并找出重新编码的值应该是什么。
此处,vec_long
的长度为 30000。这是在我的机器上重新编码所需的时间。
vec_long <- sample(rep(c("animal_dog_xyz", "animal_cat_abc", "animal_alligator_tyl"), 10000))
length(vec_long)
#> [1] 30000
library(microbenchmark)
microbenchmark(sapply(str_split(vec_long, "_", n = 3), `[`, 2))
#> Unit: milliseconds
#> expr min lq mean
#> sapply(str_split(vec_long, "_", n = 3), `[`, 2) 51.6972 52.66918 57.42299
#> median uq max neval
#> 54.47867 58.7653 115.754 100
有没有办法利用这个向量实际上是 factor
的事实?从而识别唯一值(“级别”),重新编码它们,并重新部署到整个矢量长度?有没有这样的程序可以加快处理时间?
谢谢!
编辑
我只想根据@GKi 的回答、@ThomasIsCoding 的回答和@user20650 的评论总结我的测试。
## The Data
set.seed(2021)
unique_vals <- c("animal_dog_xyz", "animal_cat_abc", "animal_alligator_tyl")
vec <- sample(rep(unique_vals, 10))
vec_long <- sample(rep(unique_vals, 1000))
vec_very_long <- sample(unique_vals, 100000))
## The functions
## function #1 -- as @user20650 proposed
via_fac_levels <- function(x) {
x_factor <- factor(x)
levels(x_factor) <- sapply(str_split(levels(x_factor), "_", n = 3), `[`, 2)
as.character(x_factor)
}
####################
## function #2 -- as @GKi proposed
via_fac_no_levels <- function(x) {
x_factor <- as.factor(x)
x_factor <- sapply(strsplit(levels(x_factor), "_", TRUE), `[`, 2)[x_factor]
as.character(x_factor)
}
####################
## function #3 -- the original slow method shown in the question
via_chr_only <- function(x) {
sapply(str_split(x, "_", n = 3), `[`, 2)
}
####################
## function #4 -- as @ThomasIsCoding proposed
via_read_table <- function(x) {
read.table(text = paste0(x, collapse = "\n"), sep = "_", header = FALSE)$V2
}
###################
## function #5 -- forcats::fct_relabel()
via_fct_relabel <- function(x) {
x_factor <- as.factor(x)
x_factor <- fct_relabel(x_factor, ~sapply(str_split(.x, "_", n = 3), `[`, 2))
as.character(x_factor)
}
## Performance assessment
### I ran it on Rstudio cloud
bm_short <- bench::mark(fac_levels = via_fac_levels(vec),
fac_no_levels = via_fac_no_levels(vec),
chr = via_chr_only(vec),
read_t = via_read_table(vec),
fct_relabel = via_fct_relabel(vec),
iterations = 1000)
bm_long <- bench::mark(fac_levels = via_fac_levels(vec_long),
fac_no_levels = via_fac_no_levels(vec_long),
chr = via_chr_only(vec_long),
read_t = via_read_table(vec_long),
fct_relabel = via_fct_relabel(vec_long),
iterations = 1000)
bm_very_long <- bench::mark(fac_levels = via_fac_levels(vec_very_long),
fac_no_levels = via_fac_no_levels(vec_very_long),
chr = via_chr_only(vec_very_long),
read_t = via_read_table(vec_very_long),
fct_relabel = via_fct_relabel(vec_very_long),
iterations = 1000)
## visualize
library(ggplot2)
library(tidyr)
library(ggbeeswarm)
library(beeswarm)
autoplot(bm_short) + ggtitle("data of length 30")
autoplot(bm_long) + ggtitle("data of length 3000")
autoplot(bm_very_long) + ggtitle("data of length 300000")
## verify all functions give the same output
v1 <- via_fac_levels(vec_long)
v2 <- via_fac_no_levels(vec_long)
v3 <- via_chr_only(vec_long)
v4 <- via_read_table(vec_long)
v5 <- via_fct_relabel(vec_long)
all(sapply(list(v1, v2, v3, v4), FUN = identical, v5)) #
## [1] TRUE
如果你创建一个因素,这也需要一些时间,它可能看起来像:
vec_fac <- as.factor(vec_long)
sapply(strsplit(levels(vec_fac), "_", TRUE), `[`, 2)[vec_fac]
如果数据也需要重新编码:
levels(vec_fac) <- sapply(strsplit(levels(vec_fac), "_", TRUE), `[`, 2)
vec_fac
基准:
bench::mark(
Question = sapply(strsplit(vec_long, "_", TRUE), `[`, 2)
, HaveFactor = sapply(strsplit(levels(vec_fac), "_", TRUE), `[`, 2)[vec_fac]
, CreateFactor = {vec_fac <- as.factor(vec_long)
sapply(strsplit(levels(vec_fac), "_", TRUE), `[`, 2)[vec_fac]}
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 Question 16.2ms 16.4ms 59.6 1.17MB 51.7 15 13
#2 HaveFactor 233.9µs 251µs 3757. 234.42KB 16.8 1788 8
#3 CreateFactor 882.8µs 920.2µs 1073. 959.36KB 19.2 503 9
您可以像下面那样尝试read.table
read.table(text = vec, sep = "_", header = FALSE)$V2
这给出了
> read.table(text = vec, sep = "_", header = FALSE)$V2
[1] "dog" "alligator" "cat" "cat" "alligator" "alligator"
[7] "cat" "cat" "cat" "dog" "dog" "cat"
[13] "alligator" "alligator" "alligator" "cat" "dog" "alligator"
[19] "alligator" "cat" "dog" "cat" "cat" "dog"
[25] "dog" "dog" "dog" "dog" "alligator" "alligator"
我还考虑过拥有一个允许使用正则表达式重新编码因子的函数的想法。参见 。
简而言之
remotes::install_github("jwilliman/xfactor")
as.character(
xfactor::xfactor(vec_very_long, levels = c(dog = "dog", cat = "cat", alligator = "alligator"))
)
引号中的位是正则表达式,名称是要替换它们的级别。
这比上面最快的答案花费的时间还不到两倍,但在允许对每个结果因子水平应用不同的正则表达式模式方面具有更大的灵活性。 IE。适合清理杂乱的数据。
我意识到解决这个问题的一种方法是 forcats::fct_relabel()
。
library(forcats)
library(stringr)
via_fct_relabel <- function(x) {
x_factor <- as.factor(x)
x_factor <- fct_relabel(x_factor, ~sapply(str_split(.x, "_", n = 3), `[`, 2))
as.character(x_factor)
}
vec_long <- sample(rep(c("animal_dog_xyz", "animal_cat_abc", "animal_alligator_tyl"), 10000))
result <- via_fct_relabel(vec_long)
head(result, n = 20)
#> [1] "cat" "dog" "dog" "cat" "cat" "alligator"
#> [7] "cat" "alligator" "alligator" "cat" "alligator" "dog"
#> [13] "cat" "alligator" "dog" "cat" "cat" "dog"
#> [19] "cat" "dog"
由 reprex package (v2.0.0)
于 2021-07-21 创建
我经常处理需要重新编码的字符数据。一种常见的情况是,正在记录的字符向量本质上是 的一个因素,但不一定是 class 中的一个因素。例如,考虑一个 chr
向量,例如 vec
:
set.seed(2021)
vec <- sample(rep(c("animal_dog_xyz", "animal_cat_abc", "animal_alligator_tyl"), 10))
vec
#> [1] "animal_dog_xyz" "animal_alligator_tyl" "animal_cat_abc"
#> [4] "animal_cat_abc" "animal_alligator_tyl" "animal_alligator_tyl"
#> [7] "animal_cat_abc" "animal_cat_abc" "animal_cat_abc"
#> [10] "animal_dog_xyz" "animal_dog_xyz" "animal_cat_abc"
#> [13] "animal_alligator_tyl" "animal_alligator_tyl" "animal_alligator_tyl"
#> [16] "animal_cat_abc" "animal_dog_xyz" "animal_alligator_tyl"
#> [19] "animal_alligator_tyl" "animal_cat_abc" "animal_dog_xyz"
#> [22] "animal_cat_abc" "animal_cat_abc" "animal_dog_xyz"
#> [25] "animal_dog_xyz" "animal_dog_xyz" "animal_dog_xyz"
#> [28] "animal_dog_xyz" "animal_alligator_tyl" "animal_alligator_tyl"
由 reprex package (v2.0.0)
于 2021-07-19 创建如果我想重新编码这个向量并只提取动物名称,我会选择适合字符数据的解决方案:
library(stringr)
sapply(str_split(vec, "_", n = 3), `[`, 2)
#> [1] "dog" "alligator" "cat" "cat" "alligator" "alligator"
#> [7] "cat" "cat" "cat" "dog" "dog" "cat"
#> [13] "alligator" "alligator" "alligator" "cat" "dog" "alligator"
#> [19] "alligator" "cat" "dog" "cat" "cat" "dog"
#> [25] "dog" "dog" "dog" "dog" "alligator" "alligator"
问题
如果向量很长,这样的重新编码过程需要很长时间。 R 将遍历每个向量元素并应用该过程。鉴于向量中只有 3 个唯一值,这似乎效率低下。换句话说,我们不需要一个一个地检查元素并找出重新编码的值应该是什么。
此处,vec_long
的长度为 30000。这是在我的机器上重新编码所需的时间。
vec_long <- sample(rep(c("animal_dog_xyz", "animal_cat_abc", "animal_alligator_tyl"), 10000))
length(vec_long)
#> [1] 30000
library(microbenchmark)
microbenchmark(sapply(str_split(vec_long, "_", n = 3), `[`, 2))
#> Unit: milliseconds
#> expr min lq mean
#> sapply(str_split(vec_long, "_", n = 3), `[`, 2) 51.6972 52.66918 57.42299
#> median uq max neval
#> 54.47867 58.7653 115.754 100
有没有办法利用这个向量实际上是 factor
的事实?从而识别唯一值(“级别”),重新编码它们,并重新部署到整个矢量长度?有没有这样的程序可以加快处理时间?
谢谢!
编辑
我只想根据@GKi 的回答、@ThomasIsCoding 的回答和@user20650 的评论总结我的测试。
## The Data
set.seed(2021)
unique_vals <- c("animal_dog_xyz", "animal_cat_abc", "animal_alligator_tyl")
vec <- sample(rep(unique_vals, 10))
vec_long <- sample(rep(unique_vals, 1000))
vec_very_long <- sample(unique_vals, 100000))
## The functions
## function #1 -- as @user20650 proposed
via_fac_levels <- function(x) {
x_factor <- factor(x)
levels(x_factor) <- sapply(str_split(levels(x_factor), "_", n = 3), `[`, 2)
as.character(x_factor)
}
####################
## function #2 -- as @GKi proposed
via_fac_no_levels <- function(x) {
x_factor <- as.factor(x)
x_factor <- sapply(strsplit(levels(x_factor), "_", TRUE), `[`, 2)[x_factor]
as.character(x_factor)
}
####################
## function #3 -- the original slow method shown in the question
via_chr_only <- function(x) {
sapply(str_split(x, "_", n = 3), `[`, 2)
}
####################
## function #4 -- as @ThomasIsCoding proposed
via_read_table <- function(x) {
read.table(text = paste0(x, collapse = "\n"), sep = "_", header = FALSE)$V2
}
###################
## function #5 -- forcats::fct_relabel()
via_fct_relabel <- function(x) {
x_factor <- as.factor(x)
x_factor <- fct_relabel(x_factor, ~sapply(str_split(.x, "_", n = 3), `[`, 2))
as.character(x_factor)
}
## Performance assessment
### I ran it on Rstudio cloud
bm_short <- bench::mark(fac_levels = via_fac_levels(vec),
fac_no_levels = via_fac_no_levels(vec),
chr = via_chr_only(vec),
read_t = via_read_table(vec),
fct_relabel = via_fct_relabel(vec),
iterations = 1000)
bm_long <- bench::mark(fac_levels = via_fac_levels(vec_long),
fac_no_levels = via_fac_no_levels(vec_long),
chr = via_chr_only(vec_long),
read_t = via_read_table(vec_long),
fct_relabel = via_fct_relabel(vec_long),
iterations = 1000)
bm_very_long <- bench::mark(fac_levels = via_fac_levels(vec_very_long),
fac_no_levels = via_fac_no_levels(vec_very_long),
chr = via_chr_only(vec_very_long),
read_t = via_read_table(vec_very_long),
fct_relabel = via_fct_relabel(vec_very_long),
iterations = 1000)
## visualize
library(ggplot2)
library(tidyr)
library(ggbeeswarm)
library(beeswarm)
autoplot(bm_short) + ggtitle("data of length 30")
autoplot(bm_long) + ggtitle("data of length 3000")
autoplot(bm_very_long) + ggtitle("data of length 300000")
## verify all functions give the same output
v1 <- via_fac_levels(vec_long)
v2 <- via_fac_no_levels(vec_long)
v3 <- via_chr_only(vec_long)
v4 <- via_read_table(vec_long)
v5 <- via_fct_relabel(vec_long)
all(sapply(list(v1, v2, v3, v4), FUN = identical, v5)) #
## [1] TRUE
如果你创建一个因素,这也需要一些时间,它可能看起来像:
vec_fac <- as.factor(vec_long)
sapply(strsplit(levels(vec_fac), "_", TRUE), `[`, 2)[vec_fac]
如果数据也需要重新编码:
levels(vec_fac) <- sapply(strsplit(levels(vec_fac), "_", TRUE), `[`, 2)
vec_fac
基准:
bench::mark(
Question = sapply(strsplit(vec_long, "_", TRUE), `[`, 2)
, HaveFactor = sapply(strsplit(levels(vec_fac), "_", TRUE), `[`, 2)[vec_fac]
, CreateFactor = {vec_fac <- as.factor(vec_long)
sapply(strsplit(levels(vec_fac), "_", TRUE), `[`, 2)[vec_fac]}
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 Question 16.2ms 16.4ms 59.6 1.17MB 51.7 15 13
#2 HaveFactor 233.9µs 251µs 3757. 234.42KB 16.8 1788 8
#3 CreateFactor 882.8µs 920.2µs 1073. 959.36KB 19.2 503 9
您可以像下面那样尝试read.table
read.table(text = vec, sep = "_", header = FALSE)$V2
这给出了
> read.table(text = vec, sep = "_", header = FALSE)$V2
[1] "dog" "alligator" "cat" "cat" "alligator" "alligator"
[7] "cat" "cat" "cat" "dog" "dog" "cat"
[13] "alligator" "alligator" "alligator" "cat" "dog" "alligator"
[19] "alligator" "cat" "dog" "cat" "cat" "dog"
[25] "dog" "dog" "dog" "dog" "alligator" "alligator"
我还考虑过拥有一个允许使用正则表达式重新编码因子的函数的想法。参见
简而言之
remotes::install_github("jwilliman/xfactor")
as.character(
xfactor::xfactor(vec_very_long, levels = c(dog = "dog", cat = "cat", alligator = "alligator"))
)
引号中的位是正则表达式,名称是要替换它们的级别。
这比上面最快的答案花费的时间还不到两倍,但在允许对每个结果因子水平应用不同的正则表达式模式方面具有更大的灵活性。 IE。适合清理杂乱的数据。
我意识到解决这个问题的一种方法是 forcats::fct_relabel()
。
library(forcats)
library(stringr)
via_fct_relabel <- function(x) {
x_factor <- as.factor(x)
x_factor <- fct_relabel(x_factor, ~sapply(str_split(.x, "_", n = 3), `[`, 2))
as.character(x_factor)
}
vec_long <- sample(rep(c("animal_dog_xyz", "animal_cat_abc", "animal_alligator_tyl"), 10000))
result <- via_fct_relabel(vec_long)
head(result, n = 20)
#> [1] "cat" "dog" "dog" "cat" "cat" "alligator"
#> [7] "cat" "alligator" "alligator" "cat" "alligator" "dog"
#> [13] "cat" "alligator" "dog" "cat" "cat" "dog"
#> [19] "cat" "dog"
由 reprex package (v2.0.0)
于 2021-07-21 创建