如何根据 R 中的日期合并两个数据集 - 代码
How to merge two datasets according to DATE in R - with code
我正在尝试执行以下操作。我有一个从 2015-01-31 到 2021-06-30 的数据集 1:
dataset1_dates=c("2015-01-31","2015-02-28","2015-03-31","2015-04-30","2015-05-31","2015-06-30","2015-07-31","2015-08-31","2015-09-30","2015-10-31","2015-11-30","2015-12-31","2016-01-31","2016-02-29","2016-03-31","2016-04-30","2016-05-31","2016-06-30","2016-07-31","2016-08-31","2016-09-30","2016-10-31","2016-11-30","2016-12-31","2017-01-31","2017-02-28","2017-03-31","2017-04-30","2017-05-31","2017-06-30","2017-07-31","2017-08-31","2017-09-30","2017-10-31","2017-11-30","2017-12-31","2018-01-31","2018-02-28","2018-03-31","2018-04-30","2018-05-31","2018-06-30","2018-07-31","2018-08-31","2018-09-30","2018-10-31","2018-11-30","2018-12-31","2019-01-31","2019-02-28","2019-03-31","2019-04-30","2019-05-31","2019-06-30","2019-07-31","2019-08-31","2019-09-30","2019-10-31","2019-11-30","2019-12-31","2020-01-31","2020-02-29","2020-03-31","2020-04-30","2020-05-31","2020-06-30","2020-07-31","2020-08-31","2020-09-30","2020-10-31","2020-11-30","2020-12-31","2021-01-31","2021-02-28","2021-03-31","2021-04-30","2021-05-31","2021-06-30")
# add dates
dataset1 <- expand.grid(Organisation = c("A123","B234","C456"),
Date = dataset1_dates)
## sort
dataset1 <- dataset1[order(dataset1$Organisation, dataset1$Date),]
## reset id
rownames(dataset1) <- NULL
dataset1$Organisation <- as.character(dataset1$Organisation)
dataset1$Date <- as.Date(dataset1$Date, format="%Y-%m-%d")
然后我有一个数据集2,它告诉我在特定时间点每个组织在检查时的表现:
dataset2 <- read.table(
text = "
Organisation Date_inspection Performance
A123 2015-01-31 Good
A123 2016-01-14 OK
B234 2017-06-14 Inadequate
C456 2015-06-30 OK
C456 2016-02-10 Inspected but not rated
C456 2018-05-18 Good
C456 2020-03-21 OK",
header = TRUE)
dataset2$Organisation <- as.character(dataset2$Organisation)
dataset2$Date_inspection <- as.Date(dataset2$Date_inspection, format="%Y-%m-%d")
dataset2$Performance <- as.character(dataset2$Performance)
我想分配给每个月after检查,包括检查的月份,组织的绩效类别。
我还想将第一次检查前的月份视为与第一次检查时的性能类别相同。
预期结果:
Date | Organisation | Performance |
2015-01-31 | A123 | Good |
2015-02-28 | A123 | Good |
2015-03-31 | A123 | Good |
...
2016-01-31 | A123 | OK |
...
2021-06-30 | A123 | OK |
2015-01-31 | B234 | Inadequate |
2015-02-28 | B234 | Inadequate |
2015-03-31 | B234 | Inadequate |
...
2021-06-30 | B234 | Inadequate |
2015-01-31 | C456 | OK |
2015-02-28 | C456 | OK |
2015-03-31 | C456 | OK |
...
2016-02-29 | C456 | OK |
...
2018-05-31 | C456 | Good |
2018-06-30 | C456 | Good |
...
2020-03-31 | C456 | OK |
...
2021-06-30 | C456 | OK |
关于如何在 R 中执行此操作的任何想法?
您的方法是正确的,但是您的 dataset1
和 dataset2
都不包含列 Location
。
如果您的主要数据中也缺少此列,那么这可能是调查的第一步。
如果需要,我可以编辑我的答案。
这是一种使用 dplyr 的方法。请注意,这需要 Organisation
在两个数据集中都是 character
(即不使用 as.factor
进行转换)。
lookup <- function(x, y) {
dataset2 %>%
filter(Organisation == x, Date_inspection <= y) %>%
pull(Performance) %>%
last(
default = dataset2 %>%
filter(Organisation == x) %>%
slice_min(Date_inspection) %>%
pull(Performance)
)
}
# add `Performance` by applying `lookup` over `organisation` and `Date`
dataset1 %>%
mutate(Performance = map2_chr(Organisation, Date, lookup))
想法是使用函数 lookup
,pull
是组织上次检查的 Performance
值。如果该值不存在(因为没有 Date_inspection <= y
),我们将使用该组织的第一个检查日期。
关于如何补救损坏的 read.table()
表情的注意事项:
当前,Performance 列值中的空格导致在将其解析为 table 时出错。一个简单的补救措施是在导入之前重新编码您的值,如下所示(请注意,字符串“Inspected but not rated”中的空格已替换为“_”,结果值为“Inspected_but_not_rated”)。
dataset2 <- read.table(
text = "
Organisation Date_inspection Performance
A123 2015-01-31 Good
A123 2016-01-14 OK
B234 2017-06-14 Inadequate
C456 2015-06-30 OK
C456 2016-02-10 Inspected_but_not_rated
C456 2018-05-18 Good
C456 2020-03-21 OK",
header = TRUE)
我们现在可以将空格插入回字符串中,如下所示:
dataset2$Performance <- with(
dataset2,
gsub("_", " ", Performance)
)
生成您在下面看到的 dataset2
对象。
Base R(修正)解决方案(根据评论中的其他要求):
首先,您的 dataset2
data.frame
对象现在已损坏,因此我们将开始使用:
dataset2 <- structure(list(Organisation = c("A123", "A123", "B234", "C456",
"C456", "C456", "C456"), Date_inspection = structure(c(16466,
16814, 17331, 16616, 16841, 17669, 18342), class = "Date"), Performance = c("Good",
"OK", "Inadequate", "OK", "Inspected but not rated", "Good",
"OK")), row.names = c(NA, -7L), class = "data.frame")
其次,在这个修改后的案例中,我们所要做的就是将“已检查但未评级”重新编码为 NA_character_
。请参阅下面修改后的解决方案:
# Recode Inspected but not rated to an NA of type
# character: clean_df2 => data.frame
clean_df2 <- transform(
dataset2,
Performance = gsub(
"Inspected but not rated",
NA_character_,
Performance
)
)
# Expand the "dataset2" to months which the ratings
# are considered applicable over:
# inspectionsApplicable => data.frame
inspectionsApplicable <- unique(
data.frame(
do.call(
rbind,
lapply(
with(
clean_df2,
split(
clean_df2,
Organisation
)
),
function(x){
x$Month_inspected <- as.Date(
strftime(
x$Date_inspection,
"%Y-%m-01"
)
)
x$MinMonthInData <- as.Date(
strftime(
min(
dataset1$Date[
match(
x$Organisation,
dataset1$Organisation
)
]
),
"%Y-%m-01"
)
)
data.frame(
Organisation = c(
x$Organisation[1],
x$Organisation
),
Months = c(
as.Date(unique(x$MinMonthInData)),
as.Date(x$Month_inspected, "%Y-%m-%d")
),
Performance = c(
x$Performance[
which.max(
!(
is.na(
x$Performance
)
)
)
],
x$Performance
)
)
}
)
),
row.names = NULL
)
)
# Left join the tables, and forward fill,
# the inspection category: ir_res => data.frame
res <- within(
merge(
transform(
with(
dataset1,
dataset1[order(Organisation, Date),]
),
Months = as.Date(
strftime(
Date,
"%Y-%m-01"
)
)
),
inspectionsApplicable,
by = c(
"Organisation",
"Months"
),
all.x = TRUE
),
{
Performance <- na.omit(
Performance
)[
cumsum(
!(
is.na(
Performance
)
)
)
]
rm(Months)
}
)
基础 R(原始)解决方案:
# Expand the "dataset2" to months which the ratings
# are considered applicable over:
# inspectionsApplicable => data.frame
inspectionsApplicable <- unique(
data.frame(
do.call(
rbind,
lapply(
with(
dataset2,
split(
dataset2,
Organisation
)
),
function(x){
x$Month_inspected <- as.POSIXlt(
strftime(
x$Date_inspection,
"%Y-%m-01"
)
)
x$MinMonthInData <- as.Date(
strftime(
min(
dataset1$Date[
match(
x$Organisation,
dataset1$Organisation
)
]
),
"%Y-%m-01"
)
)
data.frame(
Organisation = c(
x$Organisation[1],
x$Organisation
),
Months = c(
as.Date(unique(x$MinMonthInData)),
as.Date(x$Month_inspected, "%Y-%m-%d")
),
Performance = c(
x$Performance[1],
x$Performance
)
)
}
)
),
row.names = NULL
)
)
# Left join the tables, and forward fill,
# the inspection category: ir_res => data.frame
res <- transform(
merge(
transform(
with(
dataset1,
dataset1[order(Date),]
),
Months = as.Date(
strftime(
Date,
"%Y-%m-01"
)
)
),
inspectionsApplicable,
by = c(
"Organisation",
"Months"
),
all.x = TRUE
),
Performance = na.omit(
Performance
)[
cumsum(
!(
is.na(
Performance
)
)
)
]
)
我正在尝试执行以下操作。我有一个从 2015-01-31 到 2021-06-30 的数据集 1:
dataset1_dates=c("2015-01-31","2015-02-28","2015-03-31","2015-04-30","2015-05-31","2015-06-30","2015-07-31","2015-08-31","2015-09-30","2015-10-31","2015-11-30","2015-12-31","2016-01-31","2016-02-29","2016-03-31","2016-04-30","2016-05-31","2016-06-30","2016-07-31","2016-08-31","2016-09-30","2016-10-31","2016-11-30","2016-12-31","2017-01-31","2017-02-28","2017-03-31","2017-04-30","2017-05-31","2017-06-30","2017-07-31","2017-08-31","2017-09-30","2017-10-31","2017-11-30","2017-12-31","2018-01-31","2018-02-28","2018-03-31","2018-04-30","2018-05-31","2018-06-30","2018-07-31","2018-08-31","2018-09-30","2018-10-31","2018-11-30","2018-12-31","2019-01-31","2019-02-28","2019-03-31","2019-04-30","2019-05-31","2019-06-30","2019-07-31","2019-08-31","2019-09-30","2019-10-31","2019-11-30","2019-12-31","2020-01-31","2020-02-29","2020-03-31","2020-04-30","2020-05-31","2020-06-30","2020-07-31","2020-08-31","2020-09-30","2020-10-31","2020-11-30","2020-12-31","2021-01-31","2021-02-28","2021-03-31","2021-04-30","2021-05-31","2021-06-30")
# add dates
dataset1 <- expand.grid(Organisation = c("A123","B234","C456"),
Date = dataset1_dates)
## sort
dataset1 <- dataset1[order(dataset1$Organisation, dataset1$Date),]
## reset id
rownames(dataset1) <- NULL
dataset1$Organisation <- as.character(dataset1$Organisation)
dataset1$Date <- as.Date(dataset1$Date, format="%Y-%m-%d")
然后我有一个数据集2,它告诉我在特定时间点每个组织在检查时的表现:
dataset2 <- read.table(
text = "
Organisation Date_inspection Performance
A123 2015-01-31 Good
A123 2016-01-14 OK
B234 2017-06-14 Inadequate
C456 2015-06-30 OK
C456 2016-02-10 Inspected but not rated
C456 2018-05-18 Good
C456 2020-03-21 OK",
header = TRUE)
dataset2$Organisation <- as.character(dataset2$Organisation)
dataset2$Date_inspection <- as.Date(dataset2$Date_inspection, format="%Y-%m-%d")
dataset2$Performance <- as.character(dataset2$Performance)
我想分配给每个月after检查,包括检查的月份,组织的绩效类别。
我还想将第一次检查前的月份视为与第一次检查时的性能类别相同。
预期结果:
Date | Organisation | Performance |
2015-01-31 | A123 | Good |
2015-02-28 | A123 | Good |
2015-03-31 | A123 | Good |
...
2016-01-31 | A123 | OK |
...
2021-06-30 | A123 | OK |
2015-01-31 | B234 | Inadequate |
2015-02-28 | B234 | Inadequate |
2015-03-31 | B234 | Inadequate |
...
2021-06-30 | B234 | Inadequate |
2015-01-31 | C456 | OK |
2015-02-28 | C456 | OK |
2015-03-31 | C456 | OK |
...
2016-02-29 | C456 | OK |
...
2018-05-31 | C456 | Good |
2018-06-30 | C456 | Good |
...
2020-03-31 | C456 | OK |
...
2021-06-30 | C456 | OK |
关于如何在 R 中执行此操作的任何想法?
您的方法是正确的,但是您的 dataset1
和 dataset2
都不包含列 Location
。
如果您的主要数据中也缺少此列,那么这可能是调查的第一步。
如果需要,我可以编辑我的答案。
这是一种使用 dplyr 的方法。请注意,这需要 Organisation
在两个数据集中都是 character
(即不使用 as.factor
进行转换)。
lookup <- function(x, y) {
dataset2 %>%
filter(Organisation == x, Date_inspection <= y) %>%
pull(Performance) %>%
last(
default = dataset2 %>%
filter(Organisation == x) %>%
slice_min(Date_inspection) %>%
pull(Performance)
)
}
# add `Performance` by applying `lookup` over `organisation` and `Date`
dataset1 %>%
mutate(Performance = map2_chr(Organisation, Date, lookup))
想法是使用函数 lookup
,pull
是组织上次检查的 Performance
值。如果该值不存在(因为没有 Date_inspection <= y
),我们将使用该组织的第一个检查日期。
关于如何补救损坏的 read.table()
表情的注意事项:
当前,Performance 列值中的空格导致在将其解析为 table 时出错。一个简单的补救措施是在导入之前重新编码您的值,如下所示(请注意,字符串“Inspected but not rated”中的空格已替换为“_”,结果值为“Inspected_but_not_rated”)。
dataset2 <- read.table(
text = "
Organisation Date_inspection Performance
A123 2015-01-31 Good
A123 2016-01-14 OK
B234 2017-06-14 Inadequate
C456 2015-06-30 OK
C456 2016-02-10 Inspected_but_not_rated
C456 2018-05-18 Good
C456 2020-03-21 OK",
header = TRUE)
我们现在可以将空格插入回字符串中,如下所示:
dataset2$Performance <- with(
dataset2,
gsub("_", " ", Performance)
)
生成您在下面看到的 dataset2
对象。
Base R(修正)解决方案(根据评论中的其他要求):
首先,您的 dataset2
data.frame
对象现在已损坏,因此我们将开始使用:
dataset2 <- structure(list(Organisation = c("A123", "A123", "B234", "C456",
"C456", "C456", "C456"), Date_inspection = structure(c(16466,
16814, 17331, 16616, 16841, 17669, 18342), class = "Date"), Performance = c("Good",
"OK", "Inadequate", "OK", "Inspected but not rated", "Good",
"OK")), row.names = c(NA, -7L), class = "data.frame")
其次,在这个修改后的案例中,我们所要做的就是将“已检查但未评级”重新编码为 NA_character_
。请参阅下面修改后的解决方案:
# Recode Inspected but not rated to an NA of type
# character: clean_df2 => data.frame
clean_df2 <- transform(
dataset2,
Performance = gsub(
"Inspected but not rated",
NA_character_,
Performance
)
)
# Expand the "dataset2" to months which the ratings
# are considered applicable over:
# inspectionsApplicable => data.frame
inspectionsApplicable <- unique(
data.frame(
do.call(
rbind,
lapply(
with(
clean_df2,
split(
clean_df2,
Organisation
)
),
function(x){
x$Month_inspected <- as.Date(
strftime(
x$Date_inspection,
"%Y-%m-01"
)
)
x$MinMonthInData <- as.Date(
strftime(
min(
dataset1$Date[
match(
x$Organisation,
dataset1$Organisation
)
]
),
"%Y-%m-01"
)
)
data.frame(
Organisation = c(
x$Organisation[1],
x$Organisation
),
Months = c(
as.Date(unique(x$MinMonthInData)),
as.Date(x$Month_inspected, "%Y-%m-%d")
),
Performance = c(
x$Performance[
which.max(
!(
is.na(
x$Performance
)
)
)
],
x$Performance
)
)
}
)
),
row.names = NULL
)
)
# Left join the tables, and forward fill,
# the inspection category: ir_res => data.frame
res <- within(
merge(
transform(
with(
dataset1,
dataset1[order(Organisation, Date),]
),
Months = as.Date(
strftime(
Date,
"%Y-%m-01"
)
)
),
inspectionsApplicable,
by = c(
"Organisation",
"Months"
),
all.x = TRUE
),
{
Performance <- na.omit(
Performance
)[
cumsum(
!(
is.na(
Performance
)
)
)
]
rm(Months)
}
)
基础 R(原始)解决方案:
# Expand the "dataset2" to months which the ratings
# are considered applicable over:
# inspectionsApplicable => data.frame
inspectionsApplicable <- unique(
data.frame(
do.call(
rbind,
lapply(
with(
dataset2,
split(
dataset2,
Organisation
)
),
function(x){
x$Month_inspected <- as.POSIXlt(
strftime(
x$Date_inspection,
"%Y-%m-01"
)
)
x$MinMonthInData <- as.Date(
strftime(
min(
dataset1$Date[
match(
x$Organisation,
dataset1$Organisation
)
]
),
"%Y-%m-01"
)
)
data.frame(
Organisation = c(
x$Organisation[1],
x$Organisation
),
Months = c(
as.Date(unique(x$MinMonthInData)),
as.Date(x$Month_inspected, "%Y-%m-%d")
),
Performance = c(
x$Performance[1],
x$Performance
)
)
}
)
),
row.names = NULL
)
)
# Left join the tables, and forward fill,
# the inspection category: ir_res => data.frame
res <- transform(
merge(
transform(
with(
dataset1,
dataset1[order(Date),]
),
Months = as.Date(
strftime(
Date,
"%Y-%m-01"
)
)
),
inspectionsApplicable,
by = c(
"Organisation",
"Months"
),
all.x = TRUE
),
Performance = na.omit(
Performance
)[
cumsum(
!(
is.na(
Performance
)
)
)
]
)