将 dplyr 翻译成 data.table
Translating dplyr to data.table
所以我正在尝试翻译一些 dplyr 代码。我试图借助一个将 dplyr 翻译成 data.table 的包(dtplyr)来完成,但它仍然不起作用。错误来自 dplyr::row_number。
我需要 dplyr 代码中的所有步骤(尽管它们在这里对 mtcars 没有意义)。
library(dplyr)
library(dtplyr) # from https://github.com/tidyverse/dtplyr
library(data.table)
# Reference dplyr pipeline: letter the rows within each am group.
mtcars %>%
# keep one row per distinct mpg value, retaining every column
distinct(mpg, .keep_all = TRUE) %>%
group_by(am) %>%
# sort by mpg inside each am group
arrange(mpg, .by_group = TRUE) %>%
# row_number() is the row's position within its group; use it to index LETTERS
mutate(row_num = LETTERS[row_number()]) %>%
ungroup()
# using dtplyr
# lazy_dt() wraps mtcars so the dplyr verbs below get translated to data.table.
dt <- lazy_dt(mtcars)
dt %>%
distinct(mpg, .keep_all = TRUE) %>%
group_by(am) %>%
arrange(mpg, .by_group = TRUE) %>%
mutate(row_num = LETTERS[row_number()]) %>%
ungroup() %>%
# print the data.table code generated for the steps above
show_query()
#> unique(`_DT1`, by = "mpg")[order(am, mpg)][, `:=`(row_num = c("A",
#> "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N",
#> "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z")[row_number()]),
#> keyby = .(am)]
# I then use the query from dtplyr
# The query below is pasted from the show_query() output. row_number() was
# left untranslated, so it is still dplyr's row_number() being called inside
# the data.table `[`, which is what raises the error shown after this snippet.
DT <- as.data.table(mtcars)
unique(DT, by = "mpg")[order(am, mpg)][, `:=`(row_num = c("A",
"B", "C", "D", "E", "F", "G",
"H", "I", "J", "K", "L", "M",
"N", "O", "P", "Q", "R", "S",
"T", "U", "V", "W", "X", "Y",
"Z")[row_number()]), keyby = .(am)]
#> row_number() should only be called in a data context
由 reprex package (v0.3.0) 于 2019-07-14 创建
我们可以使用 seq_len(.N):
# Deduplicate on mpg, sort by am then mpg, then letter the rows of each am
# group. .N is the size of the current group, so seq_len(.N) stands in for
# row_number(). The trailing [] prints the result.
unique(DT, by = "mpg")[order(am, mpg)][
  , row_num := LETTERS[seq_len(.N)], by = am
][]
我可以推荐 rowid 函数吗?它在底层("under the hood")执行分组步骤,您可能会觉得这样写看起来更干净:
# rowid(am) numbers the rows per am value, doing the grouping internally,
# so no `by` argument is needed.
# NOTE(review): with order() in `i` and `:=` in `j`, the letters are assigned
# in sorted order but the table's rows appear to keep their original order,
# and the result is neither printed nor stored -- confirm that is intended.
unique(DT, by='mpg')[order(am, mpg), row_num := LETTERS[rowid(am)]]
如果你喜欢链式写法,也可以把所有步骤都放进 [] 里:
# .SD[1L] with by = mpg keeps the first row of each mpg group; presumably this
# replaces unique(DT, by = "mpg") so the whole pipeline is one `[` chain.
DT[ , .SD[1L], by = mpg
][order(am, mpg), row_num := LETTERS[rowid(am)]]
我正在尝试对翻译进行一些调整,以便 dtplyr 自动生成更像您想要的内容:
library(dtplyr)
library(dplyr, warn.conflicts = FALSE)
# Same dplyr steps as in the question, run through lazy_dt() so that
# show_query() prints the data.table code dtplyr generates for them.
dt <- lazy_dt(mtcars)
dt %>%
distinct(mpg, .keep_all = TRUE) %>%
group_by(am) %>%
arrange(mpg, .by_group = TRUE) %>%
mutate(row_num = LETTERS[row_number()]) %>%
ungroup() %>%
show_query()
#> unique(`_DT1`, by = "mpg")[order(am, mpg)][, `:=`(row_num = ..LETTERS[seq_len(.N)]),
#> keyby = .(am)]
或者像@MichaelChirico 建议的那样避免分组:
dt %>%
distinct(mpg, .keep_all = TRUE) %>%
# no group_by() here: sort by am, then mpg, in a single arrange()
arrange(am, mpg) %>%
# NOTE(review): per the generated query printed below, row_number(am) becomes
# frank(am, ties.method = "first") over the whole table, so the numbering does
# not appear to restart within each am value -- confirm this matches the
# grouped version.
mutate(row_num = LETTERS[row_number(am)]) %>%
# NOTE(review): nothing is grouped at this point, so ungroup() looks redundant.
ungroup() %>%
show_query()
#> unique(`_DT1`, by = "mpg")[order(am, mpg)][, `:=`(row_num = ..LETTERS[frank(am,
#> ties.method = "first", na.last = "keep")])]
(在 LETTERS 前面使用 .. 是一项 data.table 功能,可以清楚地表明您指的是数据框之外的变量;在这里它可能不是必需的,但我认为安全总比后悔好。)
由于 data.table 语法受到严重批评,下面是 akrun 答案的两个版本,恕我直言,它们的语法更清晰。
当 data.table 代码多次使用 [ 进行链式调用时,特别是当其中有 := 调用(相当于 dplyr 中的 mutate)时,我发现它更难理解。
library(data.table)
# One step per statement: convert, deduplicate on mpg, sort by am then mpg.
dt <- as.data.table(mtcars)
dt <- unique(dt, by = "mpg")
dt <- dt[order(am, mpg)]
# Add row_num by reference: a letter for each row's position in its am group.
dt[, row_num := LETTERS[seq_len(.N)], by = am]
# Show the first three rows.
dt[1:3]
mpg cyl disp hp drat wt qsec vs am gear carb row_num
1: 10.4 8 472 205 2.93 5.25 17.98 0 0 3 4 A
2: 13.3 8 350 245 3.73 3.84 15.41 0 0 3 4 B
3: 14.3 8 360 245 3.21 3.57 15.84 0 0 3 4 C
另一种选择是使用 %>% 管道。
library(magrittr)
# The same three steps as a magrittr pipeline; `.` is the table piped in.
dt <- as.data.table(mtcars)
dt <- dt %>%
  unique(by = "mpg") %>%
  .[order(am, mpg)] %>%
  .[, row_num := LETTERS[seq_len(.N)], by = am]
# Show the first three rows.
dt[1:3]
# mpg cyl disp hp drat wt qsec vs am gear carb row_num
# 1: 10.4 8 472 205 2.93 5.25 17.98 0 0 3 4 A
# 2: 13.3 8 350 245 3.73 3.84 15.41 0 0 3 4 B
# 3: 14.3 8 360 245 3.21 3.57 15.84 0 0 3 4 C
所以我正在尝试翻译一些 dplyr 代码。我试图借助一个将 dplyr 翻译成 data.table 的包(dtplyr)来完成,但它仍然不起作用。错误来自 dplyr::row_number。
我需要 dplyr 代码中的所有步骤(尽管它们在这里对 mtcars 没有意义)。
library(dplyr)
library(dtplyr) # from https://github.com/tidyverse/dtplyr
library(data.table)
# Reference dplyr pipeline: letter the rows within each am group.
mtcars %>%
# keep one row per distinct mpg value, retaining every column
distinct(mpg, .keep_all = TRUE) %>%
group_by(am) %>%
# sort by mpg inside each am group
arrange(mpg, .by_group = TRUE) %>%
# row_number() is the row's position within its group; use it to index LETTERS
mutate(row_num = LETTERS[row_number()]) %>%
ungroup()
# using dtplyr
# lazy_dt() wraps mtcars so the dplyr verbs below get translated to data.table.
dt <- lazy_dt(mtcars)
dt %>%
distinct(mpg, .keep_all = TRUE) %>%
group_by(am) %>%
arrange(mpg, .by_group = TRUE) %>%
mutate(row_num = LETTERS[row_number()]) %>%
ungroup() %>%
# print the data.table code generated for the steps above
show_query()
#> unique(`_DT1`, by = "mpg")[order(am, mpg)][, `:=`(row_num = c("A",
#> "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N",
#> "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z")[row_number()]),
#> keyby = .(am)]
# I then use the query from dtplyr
# The query below is pasted from the show_query() output. row_number() was
# left untranslated, so it is still dplyr's row_number() being called inside
# the data.table `[`, which is what raises the error shown after this snippet.
DT <- as.data.table(mtcars)
unique(DT, by = "mpg")[order(am, mpg)][, `:=`(row_num = c("A",
"B", "C", "D", "E", "F", "G",
"H", "I", "J", "K", "L", "M",
"N", "O", "P", "Q", "R", "S",
"T", "U", "V", "W", "X", "Y",
"Z")[row_number()]), keyby = .(am)]
#> row_number() should only be called in a data context
由 reprex package (v0.3.0) 于 2019-07-14 创建
我们可以使用 seq_len(.N):
# Deduplicate on mpg, sort by am then mpg, then letter the rows of each am
# group. .N is the size of the current group, so seq_len(.N) stands in for
# row_number(). The trailing [] prints the result.
unique(DT, by = "mpg")[order(am, mpg)][
  , row_num := LETTERS[seq_len(.N)], by = am
][]
我可以推荐 rowid 函数吗?它在底层("under the hood")执行分组步骤,您可能会觉得这样写看起来更干净:
# rowid(am) numbers the rows per am value, doing the grouping internally,
# so no `by` argument is needed.
# NOTE(review): with order() in `i` and `:=` in `j`, the letters are assigned
# in sorted order but the table's rows appear to keep their original order,
# and the result is neither printed nor stored -- confirm that is intended.
unique(DT, by='mpg')[order(am, mpg), row_num := LETTERS[rowid(am)]]
如果你喜欢链式写法,也可以把所有步骤都放进 [] 里:
# .SD[1L] with by = mpg keeps the first row of each mpg group; presumably this
# replaces unique(DT, by = "mpg") so the whole pipeline is one `[` chain.
DT[ , .SD[1L], by = mpg
][order(am, mpg), row_num := LETTERS[rowid(am)]]
我正在尝试对翻译进行一些调整,以便 dtplyr 自动生成更像您想要的内容:
library(dtplyr)
library(dplyr, warn.conflicts = FALSE)
# Same dplyr steps as in the question, run through lazy_dt() so that
# show_query() prints the data.table code dtplyr generates for them.
dt <- lazy_dt(mtcars)
dt %>%
distinct(mpg, .keep_all = TRUE) %>%
group_by(am) %>%
arrange(mpg, .by_group = TRUE) %>%
mutate(row_num = LETTERS[row_number()]) %>%
ungroup() %>%
show_query()
#> unique(`_DT1`, by = "mpg")[order(am, mpg)][, `:=`(row_num = ..LETTERS[seq_len(.N)]),
#> keyby = .(am)]
或者像@MichaelChirico 建议的那样避免分组:
dt %>%
distinct(mpg, .keep_all = TRUE) %>%
# no group_by() here: sort by am, then mpg, in a single arrange()
arrange(am, mpg) %>%
# NOTE(review): per the generated query printed below, row_number(am) becomes
# frank(am, ties.method = "first") over the whole table, so the numbering does
# not appear to restart within each am value -- confirm this matches the
# grouped version.
mutate(row_num = LETTERS[row_number(am)]) %>%
# NOTE(review): nothing is grouped at this point, so ungroup() looks redundant.
ungroup() %>%
show_query()
#> unique(`_DT1`, by = "mpg")[order(am, mpg)][, `:=`(row_num = ..LETTERS[frank(am,
#> ties.method = "first", na.last = "keep")])]
(在 LETTERS 前面使用 .. 是一项 data.table 功能,可以清楚地表明您指的是数据框之外的变量;在这里它可能不是必需的,但我认为安全总比后悔好。)
由于 data.table 语法受到严重批评,下面是 akrun 答案的两个版本,恕我直言,它们的语法更清晰。
当 data.table 代码多次使用 [ 进行链式调用时,特别是当其中有 := 调用(相当于 dplyr 中的 mutate)时,我发现它更难理解。
library(data.table)
# One step per statement: convert, deduplicate on mpg, sort by am then mpg.
dt <- as.data.table(mtcars)
dt <- unique(dt, by = "mpg")
dt <- dt[order(am, mpg)]
# Add row_num by reference: a letter for each row's position in its am group.
dt[, row_num := LETTERS[seq_len(.N)], by = am]
# Show the first three rows.
dt[1:3]
mpg cyl disp hp drat wt qsec vs am gear carb row_num
1: 10.4 8 472 205 2.93 5.25 17.98 0 0 3 4 A
2: 13.3 8 350 245 3.73 3.84 15.41 0 0 3 4 B
3: 14.3 8 360 245 3.21 3.57 15.84 0 0 3 4 C
另一种选择是使用 %>% 管道。
library(magrittr)
# The same three steps as a magrittr pipeline; `.` is the table piped in.
dt <- as.data.table(mtcars)
dt <- dt %>%
  unique(by = "mpg") %>%
  .[order(am, mpg)] %>%
  .[, row_num := LETTERS[seq_len(.N)], by = am]
# Show the first three rows.
dt[1:3]
# mpg cyl disp hp drat wt qsec vs am gear carb row_num
# 1: 10.4 8 472 205 2.93 5.25 17.98 0 0 3 4 A
# 2: 13.3 8 350 245 3.73 3.84 15.41 0 0 3 4 B
# 3: 14.3 8 360 245 3.21 3.57 15.84 0 0 3 4 C