按年份简化 Dataframe 并计算百分比变化
Simplify Dataframe by Year and Calculate Percent Change
我有两个问题:
您推荐阅读哪些资源来提高数据处理能力?我一直在处理更大的数据集,并且一直在努力适应——我觉得我正在碰壁,不知道去哪里看(很多在线资源在没有建立基础的情况下变得太复杂了)。
例如,我正在尝试解决这个问题。我有一个包含数百万行的 df,我正在尝试简化它并分析趋势。我有一个输入示例。我试图隔离每个 ID 并获取给定年份的最小值。 (有些 ID 的年份不适用于其他 ID)。简化该数据后,我试图添加一个百分比变化列。鉴于这是一个 20 多年的时间序列,此时我可以忽略月份,因为一年的最小值与另一年的最小值相比应该产生合理的百分比变化。
谢谢!
输入:
# Sample input in dput() form: a 26-row data.frame with the columns
# ID (factor, levels "a" and "b"), Date (factor of m/d/Y strings),
# Year (integer) and Value (numeric).
# The expression is not assigned to a name here.
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("a", "b"), class = "factor"), Date = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 10L, 12L, 14L, 7L, 8L, 9L, 11L, 13L, 5L,
6L, 10L, 12L, 14L, 7L, 8L, 9L, 11L, 13L, 15L, 16L), .Label = c("2/21/2009",
"2/22/2009", "2/23/2009", "2/24/2009", "2/25/2009", "2/26/2009",
"3/2/2011", "3/3/2011", "3/4/2011", "3/5/2010", "3/5/2011", "3/6/2010",
"3/6/2011", "3/7/2010", "3/7/2011", "3/8/2011"), class = "factor"),
Year = c(2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2010L,
2010L, 2010L, 2011L, 2011L, 2011L, 2011L, 2011L, 2009L, 2009L,
2010L, 2010L, 2010L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L,
2011L), Value = c(10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 5, 6, 7, 8, 8, 9, 10, 11, 12, 15, 23, 25, 27)), .Names = c("ID",
"Date", "Year", "Value"), class = "data.frame", row.names = c(NA,
-26L))
预期输出:
# Expected result in dput() form: a 6-row data.frame with one row per
# ID/Year combination, holding the row with the smallest Value for that
# pair plus a Percent.Increase column (NA for the first year of each ID).
structure(list(ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("a",
"b"), class = "factor"), Date = structure(c(1L, 4L, 5L, 2L, 4L,
3L), .Label = c("2/21/2009", "2/25/2009", "3/2/2011", "3/5/2010",
"3/6/2011"), class = "factor"), Year = c(2009L, 2010L, 2011L,
2009L, 2010L, 2011L), Value = c(10, 16, 5, 6, 8, 10), Percent.Increase = c(NA,
0.6, -0.6875, NA, 0.333333333, 0.25)), .Names = c("ID", "Date",
"Year", "Value", "Percent.Increase"), class = "data.frame", row.names = c(NA,
-6L))
按 'ID' 和 'Year' 分组后,我们用 slice 取出每个组中 'Value' 最小的那一行;然后再按 'ID' 分组,用 'Value' 减去 'Value' 的 lag,再除以 'Value' 的 lag,从而创建 'Percent.Increase' 列。
# For every (ID, Year) pair keep the single row with the smallest Value
# (which.min() returns the first minimum, so ties resolve to the earliest
# row), then compute the relative change against the previous row of the
# same ID.
# NOTE(review): assumes the sliced rows are ordered by Year within each ID.
res <- df1 %>%
  group_by(ID, Year) %>%
  slice(which.min(Value)) %>%
  group_by(ID) %>%
  # lag() is NA on the first row of each ID, so that row's change is NA.
  mutate(Percent.Increase = (Value - lag(Value)) / lag(Value)) %>%
  # Drop the ID grouping so later verbs on `res` do not silently run per group.
  ungroup()
在 data.table 实现 HAVING 子句之前,下面的写法似乎是相当高效的做法:
# Inner call: row numbers (.I) of the minimum-Value row in every (ID, Year)
# group; keyby also orders the groups by ID and Year, so shift() below
# compares each year with the one before it.
# Outer call: subset to those rows, then add Percent_Increase by reference,
# computed separately for each ID.
dt[dt[, .I[which.min(Value)], keyby = .(ID, Year)]$V1][
  , Percent_Increase := {
    previous <- shift(Value) # Value of the preceding row (NA for the first)
    (Value - previous) / previous
  }, by = .(ID)]
在 5e7 行数据上比较两种方法的耗时。
library(dplyr)
library(data.table)
# Benchmark setup: a synthetic table with N rows, built once as a data.frame
# and once as a data.table so both approaches run on the same values.
N <- 5e7
set.seed(1) # fixed seed so the sampled data is reproducible
df <- data.frame(
  ID = sample(2L, N, replace = TRUE),
  Date = sample(16L, N, replace = TRUE),
  Year = sample(2009:2011, N, replace = TRUE),
  Value = sample(N / 10, N, replace = TRUE)
)
dt <- as.data.table(df)

# dplyr: minimum-Value row per (ID, Year), then change versus the previous
# row within each ID.
system.time({
  res <- df %>%
    group_by(ID, Year) %>%
    slice(which.min(Value)) %>%
    group_by(ID) %>%
    mutate(Percent_Increase = (Value - lag(Value)) / lag(Value))
})
# Timing recorded alongside this snippet:
#    user  system elapsed
#   1.676   2.176   3.847

# data.table: .I yields the row numbers of each group's minimum; keyby orders
# the groups by ID and Year before shift() is applied per ID.
system.time({
  r <- dt[dt[, .I[which.min(Value)], keyby = .(ID, Year)]$V1][
    , Percent_Increase := {
      previous <- shift(Value)
      (Value - previous) / previous
    }, by = .(ID)]
})
# Timing recorded alongside this snippet:
#    user  system elapsed
#   0.940   0.460   1.334

# Compare both results while ignoring column order, row order and attributes.
all.equal(
  r,
  as.data.table(res),
  ignore.col.order = TRUE,
  ignore.row.order = TRUE,
  check.attributes = FALSE
)
# [1] TRUE
我有两个问题: 您推荐阅读哪些资源来提高数据处理能力?我一直在处理更大的数据集,并且一直在努力适应——我觉得我正在碰壁,不知道去哪里看(很多在线资源在没有建立基础的情况下变得太复杂了)。
例如,我正在尝试解决这个问题。我有一个包含数百万行的 df,我正在尝试简化它并分析趋势。我有一个输入示例。我试图隔离每个 ID 并获取给定年份的最小值。 (有些 ID 的年份不适用于其他 ID)。简化该数据后,我试图添加一个百分比变化列。鉴于这是一个 20 多年的时间序列,此时我可以忽略月份,因为一年的最小值与另一年的最小值相比应该产生合理的百分比变化。
谢谢!
输入:
# Sample input in dput() form: a 26-row data.frame with the columns
# ID (factor, levels "a" and "b"), Date (factor of m/d/Y strings),
# Year (integer) and Value (numeric).
# The expression is not assigned to a name here.
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("a", "b"), class = "factor"), Date = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 10L, 12L, 14L, 7L, 8L, 9L, 11L, 13L, 5L,
6L, 10L, 12L, 14L, 7L, 8L, 9L, 11L, 13L, 15L, 16L), .Label = c("2/21/2009",
"2/22/2009", "2/23/2009", "2/24/2009", "2/25/2009", "2/26/2009",
"3/2/2011", "3/3/2011", "3/4/2011", "3/5/2010", "3/5/2011", "3/6/2010",
"3/6/2011", "3/7/2010", "3/7/2011", "3/8/2011"), class = "factor"),
Year = c(2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2010L,
2010L, 2010L, 2011L, 2011L, 2011L, 2011L, 2011L, 2009L, 2009L,
2010L, 2010L, 2010L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L,
2011L), Value = c(10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 5, 6, 7, 8, 8, 9, 10, 11, 12, 15, 23, 25, 27)), .Names = c("ID",
"Date", "Year", "Value"), class = "data.frame", row.names = c(NA,
-26L))
预期输出:
# Expected result in dput() form: a 6-row data.frame with one row per
# ID/Year combination, holding the row with the smallest Value for that
# pair plus a Percent.Increase column (NA for the first year of each ID).
structure(list(ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("a",
"b"), class = "factor"), Date = structure(c(1L, 4L, 5L, 2L, 4L,
3L), .Label = c("2/21/2009", "2/25/2009", "3/2/2011", "3/5/2010",
"3/6/2011"), class = "factor"), Year = c(2009L, 2010L, 2011L,
2009L, 2010L, 2011L), Value = c(10, 16, 5, 6, 8, 10), Percent.Increase = c(NA,
0.6, -0.6875, NA, 0.333333333, 0.25)), .Names = c("ID", "Date",
"Year", "Value", "Percent.Increase"), class = "data.frame", row.names = c(NA,
-6L))
按 'ID' 和 'Year' 分组后,我们用 slice 取出每个组中 'Value' 最小的那一行;然后再按 'ID' 分组,用 'Value' 减去 'Value' 的 lag,再除以 'Value' 的 lag,从而创建 'Percent.Increase' 列。
# For every (ID, Year) pair keep the single row with the smallest Value
# (which.min() returns the first minimum, so ties resolve to the earliest
# row), then compute the relative change against the previous row of the
# same ID.
# NOTE(review): assumes the sliced rows are ordered by Year within each ID.
res <- df1 %>%
  group_by(ID, Year) %>%
  slice(which.min(Value)) %>%
  group_by(ID) %>%
  # lag() is NA on the first row of each ID, so that row's change is NA.
  mutate(Percent.Increase = (Value - lag(Value)) / lag(Value)) %>%
  # Drop the ID grouping so later verbs on `res` do not silently run per group.
  ungroup()
在 data.table 实现 HAVING 子句之前,下面的写法似乎是相当高效的做法:
# Inner call: row numbers (.I) of the minimum-Value row in every (ID, Year)
# group; keyby also orders the groups by ID and Year, so shift() below
# compares each year with the one before it.
# Outer call: subset to those rows, then add Percent_Increase by reference,
# computed separately for each ID.
dt[dt[, .I[which.min(Value)], keyby = .(ID, Year)]$V1][
  , Percent_Increase := {
    previous <- shift(Value) # Value of the preceding row (NA for the first)
    (Value - previous) / previous
  }, by = .(ID)]
在 5e7 行数据上比较两种方法的耗时。
library(dplyr)
library(data.table)
# Benchmark setup: a synthetic table with N rows, built once as a data.frame
# and once as a data.table so both approaches run on the same values.
N <- 5e7
set.seed(1) # fixed seed so the sampled data is reproducible
df <- data.frame(
  ID = sample(2L, N, replace = TRUE),
  Date = sample(16L, N, replace = TRUE),
  Year = sample(2009:2011, N, replace = TRUE),
  Value = sample(N / 10, N, replace = TRUE)
)
dt <- as.data.table(df)

# dplyr: minimum-Value row per (ID, Year), then change versus the previous
# row within each ID.
system.time({
  res <- df %>%
    group_by(ID, Year) %>%
    slice(which.min(Value)) %>%
    group_by(ID) %>%
    mutate(Percent_Increase = (Value - lag(Value)) / lag(Value))
})
# Timing recorded alongside this snippet:
#    user  system elapsed
#   1.676   2.176   3.847

# data.table: .I yields the row numbers of each group's minimum; keyby orders
# the groups by ID and Year before shift() is applied per ID.
system.time({
  r <- dt[dt[, .I[which.min(Value)], keyby = .(ID, Year)]$V1][
    , Percent_Increase := {
      previous <- shift(Value)
      (Value - previous) / previous
    }, by = .(ID)]
})
# Timing recorded alongside this snippet:
#    user  system elapsed
#   0.940   0.460   1.334

# Compare both results while ignoring column order, row order and attributes.
all.equal(
  r,
  as.data.table(res),
  ignore.col.order = TRUE,
  ignore.row.order = TRUE,
  check.attributes = FALSE
)
# [1] TRUE