用来自另一个数据框的数据改变数据框
Mutate data frame with data from another data frame
可重现数据:
# Reproducible example data.
# df1: one row per group, each with an area and a date window (date1..date2).
df1 <- tibble(
  id    = c("GR1", "GR2"),
  area  = c("A1", "A2"),
  date1 = as.Date(c("2022-01-01", "2022-01-02")),
  date2 = as.Date(c("2022-01-06", "2022-01-08"))
)

# df2: nine consecutive daily temperature readings, alternating between the
# two areas. The seed is fixed so the random temperatures are reproducible.
set.seed(543)
df2 <- tibble(
  date3       = seq(as.Date("2022-01-01"), as.Date("2022-01-09"), by = "days"),
  temperature = runif(9, min = 28, max = 33),
  area        = rep(c("A1", "A2"), length.out = 9)
)
您好,
我想在 df1 中新建一列,其值是按条件从 df2 中筛选出的记录的平均温度。(在真实的数据框中,df1 有 1036 行,df2 有 26192 行。)
我试过这种方法,但它并没有像我想的那样起作用
# Attempt: per-area mean of df2$temperature inside each df1 date window.
# NOTE(review): inside filter(), `df1$date1`, `df1$date2` and `df1$area` are the
# full df1 columns (length 2), not the current row/group, so they get recycled
# against df2's 9-row columns -- hence the length-mismatch warning.
df3 <- df1 %>%
group_by(area) %>%
mutate(average_temp = mean(filter(.data = df2, date3 >= df1$date1 & date3 <= df1$date2 & area == df1$area)$temperature))
我收到了这个警告
警告消息:
1: 计算 average_temp = mean(...) 时出现问题。
ℹ 较长的对象长度不是较短对象长度的倍数
预期结果是
id | area | date1 | date2 | average_temp |
---|---|---|---|---|
GR1 | A1 | 2022-01-01 | 2022-01-06 | 31.58708 |
GR2 | A2 | 2022-01-02 | 2022-01-08 | 30.50867 |
下面这段代码单独运行时(针对 df1 的第 2 行)能给出预期的结果。所以问题一定出在我没有弄清楚 mutate 和 dplyr 语法是如何逐行处理数据的。
# Average temperature for the second row of df1 only. The bounds are scalars
# here (`[2]`), so each comparison against df2's columns is well defined.
df2 %>%
  filter(
    area == df1$area[2],
    date3 >= df1$date1[2],
    date3 <= df1$date2[2]
  ) %>%
  pull(temperature) %>%
  mean()
这是一个 non-equi(非等值)或 range-based(基于范围)的连接。不幸的是,dplyr(1.1.0 之前的版本)本身不能做到这一点,所以我们需要另一个包的帮助。以下是几种选项:
fuzzyjoin
# Range join with fuzzyjoin: keep every df1 row and attach each df2 reading
# from the same area whose date3 lies between date1 and date2.
fuzzyjoin::fuzzy_left_join(
  df1, df2,
  # One comparison per key pair, in order:
  #   area == area, date1 <= date3, date2 >= date3
  by = c("area", date1 = "date3", date2 = "date3"),
  match_fun = list(`==`, `<=`, `>=`)
) %>%
  group_by(id, date1, date2) %>%
  summarize(
    # `area` exists in both inputs, so the df1 copy is suffixed as area.x.
    area = area.x[1],
    avg = mean(temperature),
    # Drop the grouping here: no leftover groups to ungroup() afterwards and
    # no "summarise() has grouped output by ..." message.
    .groups = "drop"
  )
# # A tibble: 2 x 5
# id date1 date2 area avg
# <chr> <date> <date> <chr> <dbl>
# 1 GR1 2022-01-01 2022-01-06 A1 31.6
# 2 GR2 2022-01-02 2022-01-08 A2 30.5
data.table
library(data.table)

# Convert both inputs to data.tables.
DT1 <- as.data.table(df1)
DT2 <- as.data.table(df2)

# Non-equi update join: match each DT2 reading to the DT1 rows of the same area
# whose [date1, date2] window contains date3, then add `avg` to DT1 by
# reference. ave() returns the per-id mean repeated for every matched pair, so
# each DT1 row ends up with the mean temperature of its own matches.
DT1[
  DT2,
  on = .(area, date1 <= date3, date2 >= date3),
  avg := ave(i.temperature, id, FUN = mean)
]
# id area date1 date2 avg
# <char> <char> <Date> <Date> <num>
# 1: GR1 A1 2022-01-01 2022-01-06 31.58708
# 2: GR2 A2 2022-01-02 2022-01-08 30.50867
(我知道有不使用 ave 的更规范的做法,但我没时间了……)
sqldf
# Range join expressed in SQL: the query refers to the data frames df1 and df2
# by name. The left join keeps every df1 row; avg() is taken over the df2 rows
# with the same area whose date3 lies between date1 and date2, and the
# group by yields one result row per df1 row.
# library(sqldf) # not required to load, per se
sqldf::sqldf(
"select df1.id, df1.area, df1.date1, df1.date2,
avg(df2.temperature) as avg
from df1
left join df2 on df1.area=df2.area
and df2.date3 between df1.date1 and df1.date2
group by df1.id, df1.area, df1.date1, df1.date2")
# id area date1 date2 avg
# 1 GR1 A1 2022-01-01 2022-01-06 31.58708
# 2 GR2 A2 2022-01-02 2022-01-08 30.50867
可重现数据:
# Reproducible example data.
# df1: one row per group, each with an area and a date window (date1..date2).
df1 <- tibble(
  id    = c("GR1", "GR2"),
  area  = c("A1", "A2"),
  date1 = as.Date(c("2022-01-01", "2022-01-02")),
  date2 = as.Date(c("2022-01-06", "2022-01-08"))
)

# df2: nine consecutive daily temperature readings, alternating between the
# two areas. The seed is fixed so the random temperatures are reproducible.
set.seed(543)
df2 <- tibble(
  date3       = seq(as.Date("2022-01-01"), as.Date("2022-01-09"), by = "days"),
  temperature = runif(9, min = 28, max = 33),
  area        = rep(c("A1", "A2"), length.out = 9)
)
您好,我想在 df1 中新建一列,其值是按条件从 df2 中筛选出的记录的平均温度。(在真实的数据框中,df1 有 1036 行,df2 有 26192 行。)
我试过这种方法,但它并没有像我想的那样起作用
# Attempt: per-area mean of df2$temperature inside each df1 date window.
# NOTE(review): inside filter(), `df1$date1`, `df1$date2` and `df1$area` are the
# full df1 columns (length 2), not the current row/group, so they get recycled
# against df2's 9-row columns -- hence the length-mismatch warning.
df3 <- df1 %>%
group_by(area) %>%
mutate(average_temp = mean(filter(.data = df2, date3 >= df1$date1 & date3 <= df1$date2 & area == df1$area)$temperature))
我收到了这个警告
警告消息:
1: 计算 average_temp = mean(...) 时出现问题。
ℹ 较长的对象长度不是较短对象长度的倍数
预期结果是
id | area | date1 | date2 | average_temp |
---|---|---|---|---|
GR1 | A1 | 2022-01-01 | 2022-01-06 | 31.58708 |
GR2 | A2 | 2022-01-02 | 2022-01-08 | 30.50867 |
下面这段代码单独运行时(针对 df1 的第 2 行)能给出预期的结果。所以问题一定出在我没有弄清楚 mutate 和 dplyr 语法是如何逐行处理数据的。
# Average temperature for the second row of df1 only. The bounds are scalars
# here (`[2]`), so each comparison against df2's columns is well defined.
df2 %>%
  filter(
    area == df1$area[2],
    date3 >= df1$date1[2],
    date3 <= df1$date2[2]
  ) %>%
  pull(temperature) %>%
  mean()
这是一个 non-equi(非等值)或 range-based(基于范围)的连接。不幸的是,dplyr(1.1.0 之前的版本)本身不能做到这一点,所以我们需要另一个包的帮助。以下是几种选项:
fuzzyjoin
# Range join with fuzzyjoin: keep every df1 row and attach each df2 reading
# from the same area whose date3 lies between date1 and date2.
fuzzyjoin::fuzzy_left_join(
  df1, df2,
  # One comparison per key pair, in order:
  #   area == area, date1 <= date3, date2 >= date3
  by = c("area", date1 = "date3", date2 = "date3"),
  match_fun = list(`==`, `<=`, `>=`)
) %>%
  group_by(id, date1, date2) %>%
  summarize(
    # `area` exists in both inputs, so the df1 copy is suffixed as area.x.
    area = area.x[1],
    avg = mean(temperature),
    # Drop the grouping here: no leftover groups to ungroup() afterwards and
    # no "summarise() has grouped output by ..." message.
    .groups = "drop"
  )
# # A tibble: 2 x 5
# id date1 date2 area avg
# <chr> <date> <date> <chr> <dbl>
# 1 GR1 2022-01-01 2022-01-06 A1 31.6
# 2 GR2 2022-01-02 2022-01-08 A2 30.5
data.table
library(data.table)

# Convert both inputs to data.tables.
DT1 <- as.data.table(df1)
DT2 <- as.data.table(df2)

# Non-equi update join: match each DT2 reading to the DT1 rows of the same area
# whose [date1, date2] window contains date3, then add `avg` to DT1 by
# reference. ave() returns the per-id mean repeated for every matched pair, so
# each DT1 row ends up with the mean temperature of its own matches.
DT1[
  DT2,
  on = .(area, date1 <= date3, date2 >= date3),
  avg := ave(i.temperature, id, FUN = mean)
]
# id area date1 date2 avg
# <char> <char> <Date> <Date> <num>
# 1: GR1 A1 2022-01-01 2022-01-06 31.58708
# 2: GR2 A2 2022-01-02 2022-01-08 30.50867
(我知道有不使用 ave 的更规范的做法,但我没时间了……)
sqldf
# Range join expressed in SQL: the query refers to the data frames df1 and df2
# by name. The left join keeps every df1 row; avg() is taken over the df2 rows
# with the same area whose date3 lies between date1 and date2, and the
# group by yields one result row per df1 row.
# library(sqldf) # not required to load, per se
sqldf::sqldf(
"select df1.id, df1.area, df1.date1, df1.date2,
avg(df2.temperature) as avg
from df1
left join df2 on df1.area=df2.area
and df2.date3 between df1.date1 and df1.date2
group by df1.id, df1.area, df1.date1, df1.date2")
# id area date1 date2 avg
# 1 GR1 A1 2022-01-01 2022-01-06 31.58708
# 2 GR2 A2 2022-01-02 2022-01-08 30.50867