如何根据 DATE 合并 2 个数据集
How to merge 2 datasets according to DATE
我正在尝试执行以下操作。我有一个从 2015-01-31 到 2021-06-30 的数据集 1:
dataset1_dates=c("2015-01-31","2015-02-28","2015-03-31","2015-04-30","2015-05-31","2015-06-30","2015-07-31","2015-08-31","2015-09-30","2015-10-31","2015-11-30","2015-12-31","2016-01-31","2016-02-29","2016-03-31","2016-04-30","2016-05-31","2016-06-30","2016-07-31","2016-08-31","2016-09-30","2016-10-31","2016-11-30","2016-12-31","2017-01-31","2017-02-28","2017-03-31","2017-04-30","2017-05-31","2017-06-30","2017-07-31","2017-08-31","2017-09-30","2017-10-31","2017-11-30","2017-12-31","2018-01-31","2018-02-28","2018-03-31","2018-04-30","2018-05-31","2018-06-30","2018-07-31","2018-08-31","2018-09-30","2018-10-31","2018-11-30","2018-12-31","2019-01-31","2019-02-28","2019-03-31","2019-04-30","2019-05-31","2019-06-30","2019-07-31","2019-08-31","2019-09-30","2019-10-31","2019-11-30","2019-12-31","2020-01-31","2020-02-29","2020-03-31","2020-04-30","2020-05-31","2020-06-30","2020-07-31","2020-08-31","2020-09-30","2020-10-31","2020-11-30","2020-12-31","2021-01-31","2021-02-28","2021-03-31","2021-04-30","2021-05-31","2021-06-30")
# add dates
dataset1 <- expand.grid(Organisation = c("A123","B234","C456"),
Date = dataset1_dates)
## sort
dataset1 <- dataset1[order(dataset1$Organisation, dataset1$Date),]
## reset id
rownames(dataset1) <- NULL
dataset1$Organisation <- as.character(dataset1$Organisation)
dataset1$Date <- as.Date(dataset1$Date, format="%Y-%m-%d")
然后我有一个dataset2告诉我在特定时间点每个组织在检查时的表现:
dataset2 <- read.table(
text = "
Organisation Date_inspection Performance
A123 2015-01-31 Good
A123 2016-01-14 OK
B234 2017-06-14 Inadequate
C456 2015-06-30 OK
C456 2016-02-10 Inspected but not rated
C456 2018-05-18 Good
C456 2020-03-21 OK",
header = TRUE)
dataset2$Organisation <- as.character(dataset2$Organisation)
dataset2$Date_inspection <- as.Date(dataset2$Date_inspection, format="%Y-%m-%d")
dataset2$Performance <- as.character(dataset2$Performance)
我想分配给每个月before检查,包括检查的月份,组织的绩效类别。
我还想将上次检查后的几个月视为等于上次检查日期的性能类别。
当'Inspected but not rated' 时,则假设下一个类别。例如。对于 C456 然后假设 'Good'.
预期结果:
Date | Organisation | Performance |
2015-01-31 | A123 | Good |
2015-02-28 | A123 | OK |
2015-03-31 | A123 | OK |
...
2016-01-31 | A123 | OK |
...
2021-06-30 | A123 | OK |
2015-01-31 | B234 | Inadequate |
2015-02-28 | B234 | Inadequate |
2015-03-31 | B234 | Inadequate |
...
2021-06-30 | B234 | Inadequate |
2015-01-31 | C456 | OK |
2015-02-28 | C456 | OK |
2015-03-31 | C456 | OK |
...
2015-06-30 | C456 | OK |
...
2016-02-29 | C456 | Good |
...
2018-05-31 | C456 | Good |
2018-06-30 | C456 | OK |
...
2020-03-31 | C456 | OK |
...
2021-06-30 | C456 | OK |
关于如何在 R 中执行此操作的任何想法?
编辑:更正了一个错误。忘记分组数据了。
Edit2: 错过了“已检查但未评级”的处理。谢谢@hello_friend!
我想你可以用 dplyr
和 tidyr
处理这个问题:
library(dplyr)
library(tidyr)
dataset1 %>%
mutate(year_month = format(Date, "%Y-%m")) %>%
left_join(
dataset2 %>%
mutate(year_month = format(Date_inspection, "%Y-%m"),
Performance = na_if(Performance, "Inspected but not rated")),
by = c("Organisation", "year_month")
) %>%
group_by(Organisation) %>%
fill(Performance, .direction = "updown") %>%
select(-year_month, -Date_inspection) %>%
ungroup()
这个returns
# A tibble: 234 x 3
Organisation Date Performance
<chr> <date> <chr>
1 A123 2015-01-31 Good
2 A123 2015-02-28 OK
3 A123 2015-03-31 OK
4 A123 2015-04-30 OK
5 A123 2015-05-31 OK
6 A123 2015-06-30 OK
7 A123 2015-07-31 OK
8 A123 2015-08-31 OK
9 A123 2015-09-30 OK
10 A123 2015-10-31 OK
11 A123 2015-11-30 OK
12 A123 2015-12-31 OK
13 A123 2016-01-31 OK
14 A123 2016-02-29 OK
15 A123 2016-03-31 OK
16 A123 2016-04-30 OK
17 A123 2016-05-31 OK
18 A123 2016-06-30 OK
19 A123 2016-07-31 OK
20 A123 2016-08-31 OK
21 A123 2016-09-30 OK
22 A123 2016-10-31 OK
23 A123 2016-11-30 OK
24 A123 2016-12-31 OK
25 A123 2017-01-31 OK
26 A123 2017-02-28 OK
27 A123 2017-03-31 OK
28 A123 2017-04-30 OK
29 A123 2017-05-31 OK
30 A123 2017-06-30 OK
31 A123 2017-07-31 OK
32 A123 2017-08-31 OK
33 A123 2017-09-30 OK
34 A123 2017-10-31 OK
35 A123 2017-11-30 OK
36 A123 2017-12-31 OK
37 A123 2018-01-31 OK
38 A123 2018-02-28 OK
39 A123 2018-03-31 OK
40 A123 2018-04-30 OK
41 A123 2018-05-31 OK
42 A123 2018-06-30 OK
43 A123 2018-07-31 OK
44 A123 2018-08-31 OK
45 A123 2018-09-30 OK
46 A123 2018-10-31 OK
47 A123 2018-11-30 OK
48 A123 2018-12-31 OK
49 A123 2019-01-31 OK
50 A123 2019-02-28 OK
51 A123 2019-03-31 OK
52 A123 2019-04-30 OK
53 A123 2019-05-31 OK
54 A123 2019-06-30 OK
55 A123 2019-07-31 OK
56 A123 2019-08-31 OK
57 A123 2019-09-30 OK
58 A123 2019-10-31 OK
59 A123 2019-11-30 OK
60 A123 2019-12-31 OK
61 A123 2020-01-31 OK
62 A123 2020-02-29 OK
63 A123 2020-03-31 OK
64 A123 2020-04-30 OK
65 A123 2020-05-31 OK
66 A123 2020-06-30 OK
67 A123 2020-07-31 OK
68 A123 2020-08-31 OK
69 A123 2020-09-30 OK
70 A123 2020-10-31 OK
71 A123 2020-11-30 OK
72 A123 2020-12-31 OK
73 A123 2021-01-31 OK
74 A123 2021-02-28 OK
75 A123 2021-03-31 OK
76 A123 2021-04-30 OK
77 A123 2021-05-31 OK
78 A123 2021-06-30 OK
79 B234 2015-01-31 Inadequate
80 B234 2015-02-28 Inadequate
81 B234 2015-03-31 Inadequate
82 B234 2015-04-30 Inadequate
83 B234 2015-05-31 Inadequate
84 B234 2015-06-30 Inadequate
85 B234 2015-07-31 Inadequate
86 B234 2015-08-31 Inadequate
87 B234 2015-09-30 Inadequate
88 B234 2015-10-31 Inadequate
89 B234 2015-11-30 Inadequate
90 B234 2015-12-31 Inadequate
91 B234 2016-01-31 Inadequate
92 B234 2016-02-29 Inadequate
93 B234 2016-03-31 Inadequate
94 B234 2016-04-30 Inadequate
95 B234 2016-05-31 Inadequate
96 B234 2016-06-30 Inadequate
97 B234 2016-07-31 Inadequate
98 B234 2016-08-31 Inadequate
99 B234 2016-09-30 Inadequate
100 B234 2016-10-31 Inadequate
101 B234 2016-11-30 Inadequate
102 B234 2016-12-31 Inadequate
103 B234 2017-01-31 Inadequate
104 B234 2017-02-28 Inadequate
105 B234 2017-03-31 Inadequate
106 B234 2017-04-30 Inadequate
107 B234 2017-05-31 Inadequate
108 B234 2017-06-30 Inadequate
109 B234 2017-07-31 Inadequate
110 B234 2017-08-31 Inadequate
111 B234 2017-09-30 Inadequate
112 B234 2017-10-31 Inadequate
113 B234 2017-11-30 Inadequate
114 B234 2017-12-31 Inadequate
115 B234 2018-01-31 Inadequate
116 B234 2018-02-28 Inadequate
117 B234 2018-03-31 Inadequate
118 B234 2018-04-30 Inadequate
119 B234 2018-05-31 Inadequate
120 B234 2018-06-30 Inadequate
121 B234 2018-07-31 Inadequate
122 B234 2018-08-31 Inadequate
123 B234 2018-09-30 Inadequate
124 B234 2018-10-31 Inadequate
125 B234 2018-11-30 Inadequate
126 B234 2018-12-31 Inadequate
127 B234 2019-01-31 Inadequate
128 B234 2019-02-28 Inadequate
129 B234 2019-03-31 Inadequate
130 B234 2019-04-30 Inadequate
131 B234 2019-05-31 Inadequate
132 B234 2019-06-30 Inadequate
133 B234 2019-07-31 Inadequate
134 B234 2019-08-31 Inadequate
135 B234 2019-09-30 Inadequate
136 B234 2019-10-31 Inadequate
137 B234 2019-11-30 Inadequate
138 B234 2019-12-31 Inadequate
139 B234 2020-01-31 Inadequate
140 B234 2020-02-29 Inadequate
141 B234 2020-03-31 Inadequate
142 B234 2020-04-30 Inadequate
143 B234 2020-05-31 Inadequate
144 B234 2020-06-30 Inadequate
145 B234 2020-07-31 Inadequate
146 B234 2020-08-31 Inadequate
147 B234 2020-09-30 Inadequate
148 B234 2020-10-31 Inadequate
149 B234 2020-11-30 Inadequate
150 B234 2020-12-31 Inadequate
151 B234 2021-01-31 Inadequate
152 B234 2021-02-28 Inadequate
153 B234 2021-03-31 Inadequate
154 B234 2021-04-30 Inadequate
155 B234 2021-05-31 Inadequate
156 B234 2021-06-30 Inadequate
157 C456 2015-01-31 OK
158 C456 2015-02-28 OK
159 C456 2015-03-31 OK
160 C456 2015-04-30 OK
161 C456 2015-05-31 OK
162 C456 2015-06-30 OK
163 C456 2015-07-31 Good
164 C456 2015-08-31 Good
165 C456 2015-09-30 Good
166 C456 2015-10-31 Good
167 C456 2015-11-30 Good
168 C456 2015-12-31 Good
169 C456 2016-01-31 Good
170 C456 2016-02-29 Good
171 C456 2016-03-31 Good
172 C456 2016-04-30 Good
173 C456 2016-05-31 Good
174 C456 2016-06-30 Good
175 C456 2016-07-31 Good
176 C456 2016-08-31 Good
177 C456 2016-09-30 Good
178 C456 2016-10-31 Good
179 C456 2016-11-30 Good
180 C456 2016-12-31 Good
181 C456 2017-01-31 Good
182 C456 2017-02-28 Good
183 C456 2017-03-31 Good
184 C456 2017-04-30 Good
185 C456 2017-05-31 Good
186 C456 2017-06-30 Good
187 C456 2017-07-31 Good
188 C456 2017-08-31 Good
189 C456 2017-09-30 Good
190 C456 2017-10-31 Good
191 C456 2017-11-30 Good
192 C456 2017-12-31 Good
193 C456 2018-01-31 Good
194 C456 2018-02-28 Good
195 C456 2018-03-31 Good
196 C456 2018-04-30 Good
197 C456 2018-05-31 Good
198 C456 2018-06-30 OK
199 C456 2018-07-31 OK
200 C456 2018-08-31 OK
201 C456 2018-09-30 OK
202 C456 2018-10-31 OK
203 C456 2018-11-30 OK
204 C456 2018-12-31 OK
205 C456 2019-01-31 OK
206 C456 2019-02-28 OK
207 C456 2019-03-31 OK
208 C456 2019-04-30 OK
209 C456 2019-05-31 OK
210 C456 2019-06-30 OK
211 C456 2019-07-31 OK
212 C456 2019-08-31 OK
213 C456 2019-09-30 OK
214 C456 2019-10-31 OK
215 C456 2019-11-30 OK
216 C456 2019-12-31 OK
217 C456 2020-01-31 OK
218 C456 2020-02-29 OK
219 C456 2020-03-31 OK
220 C456 2020-04-30 OK
221 C456 2020-05-31 OK
222 C456 2020-06-30 OK
223 C456 2020-07-31 OK
224 C456 2020-08-31 OK
225 C456 2020-09-30 OK
226 C456 2020-10-31 OK
227 C456 2020-11-30 OK
228 C456 2020-12-31 OK
229 C456 2021-01-31 OK
230 C456 2021-02-28 OK
231 C456 2021-03-31 OK
232 C456 2021-04-30 OK
233 C456 2021-05-31 OK
234 C456 2021-06-30 OK
##数据
这是问题中显示的所有转换后的数据。
dataset1 <- structure(list(Organisation = c("A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456"), Date = structure(c(16466,
16494, 16525, 16555, 16586, 16616, 16647, 16678, 16708, 16739,
16769, 16800, 16831, 16860, 16891, 16921, 16952, 16982, 17013,
17044, 17074, 17105, 17135, 17166, 17197, 17225, 17256, 17286,
17317, 17347, 17378, 17409, 17439, 17470, 17500, 17531, 17562,
17590, 17621, 17651, 17682, 17712, 17743, 17774, 17804, 17835,
17865, 17896, 17927, 17955, 17986, 18016, 18047, 18077, 18108,
18139, 18169, 18200, 18230, 18261, 18292, 18321, 18352, 18382,
18413, 18443, 18474, 18505, 18535, 18566, 18596, 18627, 18658,
18686, 18717, 18747, 18778, 18808, 16466, 16494, 16525, 16555,
16586, 16616, 16647, 16678, 16708, 16739, 16769, 16800, 16831,
16860, 16891, 16921, 16952, 16982, 17013, 17044, 17074, 17105,
17135, 17166, 17197, 17225, 17256, 17286, 17317, 17347, 17378,
17409, 17439, 17470, 17500, 17531, 17562, 17590, 17621, 17651,
17682, 17712, 17743, 17774, 17804, 17835, 17865, 17896, 17927,
17955, 17986, 18016, 18047, 18077, 18108, 18139, 18169, 18200,
18230, 18261, 18292, 18321, 18352, 18382, 18413, 18443, 18474,
18505, 18535, 18566, 18596, 18627, 18658, 18686, 18717, 18747,
18778, 18808, 16466, 16494, 16525, 16555, 16586, 16616, 16647,
16678, 16708, 16739, 16769, 16800, 16831, 16860, 16891, 16921,
16952, 16982, 17013, 17044, 17074, 17105, 17135, 17166, 17197,
17225, 17256, 17286, 17317, 17347, 17378, 17409, 17439, 17470,
17500, 17531, 17562, 17590, 17621, 17651, 17682, 17712, 17743,
17774, 17804, 17835, 17865, 17896, 17927, 17955, 17986, 18016,
18047, 18077, 18108, 18139, 18169, 18200, 18230, 18261, 18292,
18321, 18352, 18382, 18413, 18443, 18474, 18505, 18535, 18566,
18596, 18627, 18658, 18686, 18717, 18747, 18778, 18808), class = "Date")), out.attrs = list(
dim = c(Organisation = 3L, Date = 78L), dimnames = list(Organisation = c("Organisation=A123",
"Organisation=B234", "Organisation=C456"), Date = c("Date=2015-01-31",
"Date=2015-02-28", "Date=2015-03-31", "Date=2015-04-30",
"Date=2015-05-31", "Date=2015-06-30", "Date=2015-07-31",
"Date=2015-08-31", "Date=2015-09-30", "Date=2015-10-31",
"Date=2015-11-30", "Date=2015-12-31", "Date=2016-01-31",
"Date=2016-02-29", "Date=2016-03-31", "Date=2016-04-30",
"Date=2016-05-31", "Date=2016-06-30", "Date=2016-07-31",
"Date=2016-08-31", "Date=2016-09-30", "Date=2016-10-31",
"Date=2016-11-30", "Date=2016-12-31", "Date=2017-01-31",
"Date=2017-02-28", "Date=2017-03-31", "Date=2017-04-30",
"Date=2017-05-31", "Date=2017-06-30", "Date=2017-07-31",
"Date=2017-08-31", "Date=2017-09-30", "Date=2017-10-31",
"Date=2017-11-30", "Date=2017-12-31", "Date=2018-01-31",
"Date=2018-02-28", "Date=2018-03-31", "Date=2018-04-30",
"Date=2018-05-31", "Date=2018-06-30", "Date=2018-07-31",
"Date=2018-08-31", "Date=2018-09-30", "Date=2018-10-31",
"Date=2018-11-30", "Date=2018-12-31", "Date=2019-01-31",
"Date=2019-02-28", "Date=2019-03-31", "Date=2019-04-30",
"Date=2019-05-31", "Date=2019-06-30", "Date=2019-07-31",
"Date=2019-08-31", "Date=2019-09-30", "Date=2019-10-31",
"Date=2019-11-30", "Date=2019-12-31", "Date=2020-01-31",
"Date=2020-02-29", "Date=2020-03-31", "Date=2020-04-30",
"Date=2020-05-31", "Date=2020-06-30", "Date=2020-07-31",
"Date=2020-08-31", "Date=2020-09-30", "Date=2020-10-31",
"Date=2020-11-30", "Date=2020-12-31", "Date=2021-01-31",
"Date=2021-02-28", "Date=2021-03-31", "Date=2021-04-30",
"Date=2021-05-31", "Date=2021-06-30"))), row.names = c(NA,
-234L), class = "data.frame")
dataset2 <- structure(list(Organisation = c("A123", "A123", "B234", "C456",
"C456", "C456", "C456"), Date_inspection = structure(c(16466,
16814, 17331, 16616, 16841, 17669, 18342), class = "Date"), Performance = c("Good",
"OK", "Inadequate", "OK", "Inspected but not rated", "Good",
"OK")), row.names = c(NA, -7L), class = "data.frame")
我相信这可以简化,但会按预期工作:
# Recode Inspected but not rated to an NA of type
# character: clean_df2 => data.frame
clean_df2 <- transform(
with(
dataset2,
dataset2[
rev(
order(
Date_inspection
)
),
]
),
Performance = gsub(
"Inspected but not rated",
NA_character_,
Performance
)
)
# Expand the "dataset2" to months which the ratings
# are considered applicable over:
# inspectionsApplicable => data.frame
inspectionsApplicable <- unique(
data.frame(
do.call(
rbind,
lapply(
with(
clean_df2,
split(
clean_df2,
Organisation
)
),
function(x){
x$Month_inspected <- as.Date(
strftime(
x$Date_inspection,
"%Y-%m-01"
)
)
MaxMonthInData <- as.Date(
strftime(
max(
dataset1$Date[
dataset1$Organisation ==
unique(x$Organisation)
]
),
"%Y-%m-01"
)
)
data.frame(
Organisation = c(
x$Organisation[1],
x$Organisation
),
Months = c(
as.Date(MaxMonthInData),
as.Date(x$Month_inspected, "%Y-%m-%d")
),
Performance = c(
x$Performance[
which.min(
cumsum(
!(
is.na(
x$Performance
)
)
)
)
],
x$Performance
)
)
}
)
),
row.names = NULL
)
)
# Left join the tables, dropping dupes from
# from inspection result data.frame: ir_res => data.frame
ir_res <- merge(
transform(
with(
dataset1,
dataset1[
rev(
order(
Organisation,
Date
)
),
]
),
Months = as.Date(
strftime(
Date,
"%Y-%m-01"
)
)
),
with(
inspectionsApplicable,
inspectionsApplicable[
!(
duplicated(
paste0(
Organisation,
Months
),
fromLast = TRUE
)
),
]
),
by = c(
"Organisation",
"Months"
),
all.x = TRUE
)
# Back fill by group: res_ir2 => data.frame
res_ir2 <- do.call(
rbind,
lapply(
with(
ir_res,
split(
ir_res,
Organisation
)
),
function(x){
y <- with(
x,
x[
rev(
order(
Date
)
),
]
)
transform(
y,
Performance = na.omit(
Performance
)[
cumsum(
!(
is.na(
Performance
)
)
)
]
)
}
)
)
# Order by date and organisation: res => data.frame
res <- data.frame(
with(
res_ir2,
res_ir2[
order(
Organisation,
Date
),
]
),
row.names = NULL
)
我正在尝试执行以下操作。我有一个从 2015-01-31 到 2021-06-30 的数据集 1:
dataset1_dates=c("2015-01-31","2015-02-28","2015-03-31","2015-04-30","2015-05-31","2015-06-30","2015-07-31","2015-08-31","2015-09-30","2015-10-31","2015-11-30","2015-12-31","2016-01-31","2016-02-29","2016-03-31","2016-04-30","2016-05-31","2016-06-30","2016-07-31","2016-08-31","2016-09-30","2016-10-31","2016-11-30","2016-12-31","2017-01-31","2017-02-28","2017-03-31","2017-04-30","2017-05-31","2017-06-30","2017-07-31","2017-08-31","2017-09-30","2017-10-31","2017-11-30","2017-12-31","2018-01-31","2018-02-28","2018-03-31","2018-04-30","2018-05-31","2018-06-30","2018-07-31","2018-08-31","2018-09-30","2018-10-31","2018-11-30","2018-12-31","2019-01-31","2019-02-28","2019-03-31","2019-04-30","2019-05-31","2019-06-30","2019-07-31","2019-08-31","2019-09-30","2019-10-31","2019-11-30","2019-12-31","2020-01-31","2020-02-29","2020-03-31","2020-04-30","2020-05-31","2020-06-30","2020-07-31","2020-08-31","2020-09-30","2020-10-31","2020-11-30","2020-12-31","2021-01-31","2021-02-28","2021-03-31","2021-04-30","2021-05-31","2021-06-30")
# add dates
dataset1 <- expand.grid(Organisation = c("A123","B234","C456"),
Date = dataset1_dates)
## sort
dataset1 <- dataset1[order(dataset1$Organisation, dataset1$Date),]
## reset id
rownames(dataset1) <- NULL
dataset1$Organisation <- as.character(dataset1$Organisation)
dataset1$Date <- as.Date(dataset1$Date, format="%Y-%m-%d")
然后我有一个dataset2告诉我在特定时间点每个组织在检查时的表现:
dataset2 <- read.table(
text = "
Organisation Date_inspection Performance
A123 2015-01-31 Good
A123 2016-01-14 OK
B234 2017-06-14 Inadequate
C456 2015-06-30 OK
C456 2016-02-10 Inspected but not rated
C456 2018-05-18 Good
C456 2020-03-21 OK",
header = TRUE)
dataset2$Organisation <- as.character(dataset2$Organisation)
dataset2$Date_inspection <- as.Date(dataset2$Date_inspection, format="%Y-%m-%d")
dataset2$Performance <- as.character(dataset2$Performance)
我想分配给每个月before检查,包括检查的月份,组织的绩效类别。
我还想将上次检查后的几个月视为等于上次检查日期的性能类别。
当'Inspected but not rated' 时,则假设下一个类别。例如。对于 C456 然后假设 'Good'.
预期结果:
Date | Organisation | Performance |
2015-01-31 | A123 | Good |
2015-02-28 | A123 | OK |
2015-03-31 | A123 | OK |
...
2016-01-31 | A123 | OK |
...
2021-06-30 | A123 | OK |
2015-01-31 | B234 | Inadequate |
2015-02-28 | B234 | Inadequate |
2015-03-31 | B234 | Inadequate |
...
2021-06-30 | B234 | Inadequate |
2015-01-31 | C456 | OK |
2015-02-28 | C456 | OK |
2015-03-31 | C456 | OK |
...
2015-06-30 | C456 | OK |
...
2016-02-29 | C456 | Good |
...
2018-05-31 | C456 | Good |
2018-06-30 | C456 | OK |
...
2020-03-31 | C456 | OK |
...
2021-06-30 | C456 | OK |
关于如何在 R 中执行此操作的任何想法?
编辑:更正了一个错误。忘记分组数据了。
Edit2: 错过了“已检查但未评级”的处理。谢谢@hello_friend!
我想你可以用 dplyr
和 tidyr
处理这个问题:
library(dplyr)
library(tidyr)
dataset1 %>%
mutate(year_month = format(Date, "%Y-%m")) %>%
left_join(
dataset2 %>%
mutate(year_month = format(Date_inspection, "%Y-%m"),
Performance = na_if(Performance, "Inspected but not rated")),
by = c("Organisation", "year_month")
) %>%
group_by(Organisation) %>%
fill(Performance, .direction = "updown") %>%
select(-year_month, -Date_inspection) %>%
ungroup()
这个returns
# A tibble: 234 x 3
Organisation Date Performance
<chr> <date> <chr>
1 A123 2015-01-31 Good
2 A123 2015-02-28 OK
3 A123 2015-03-31 OK
4 A123 2015-04-30 OK
5 A123 2015-05-31 OK
6 A123 2015-06-30 OK
7 A123 2015-07-31 OK
8 A123 2015-08-31 OK
9 A123 2015-09-30 OK
10 A123 2015-10-31 OK
11 A123 2015-11-30 OK
12 A123 2015-12-31 OK
13 A123 2016-01-31 OK
14 A123 2016-02-29 OK
15 A123 2016-03-31 OK
16 A123 2016-04-30 OK
17 A123 2016-05-31 OK
18 A123 2016-06-30 OK
19 A123 2016-07-31 OK
20 A123 2016-08-31 OK
21 A123 2016-09-30 OK
22 A123 2016-10-31 OK
23 A123 2016-11-30 OK
24 A123 2016-12-31 OK
25 A123 2017-01-31 OK
26 A123 2017-02-28 OK
27 A123 2017-03-31 OK
28 A123 2017-04-30 OK
29 A123 2017-05-31 OK
30 A123 2017-06-30 OK
31 A123 2017-07-31 OK
32 A123 2017-08-31 OK
33 A123 2017-09-30 OK
34 A123 2017-10-31 OK
35 A123 2017-11-30 OK
36 A123 2017-12-31 OK
37 A123 2018-01-31 OK
38 A123 2018-02-28 OK
39 A123 2018-03-31 OK
40 A123 2018-04-30 OK
41 A123 2018-05-31 OK
42 A123 2018-06-30 OK
43 A123 2018-07-31 OK
44 A123 2018-08-31 OK
45 A123 2018-09-30 OK
46 A123 2018-10-31 OK
47 A123 2018-11-30 OK
48 A123 2018-12-31 OK
49 A123 2019-01-31 OK
50 A123 2019-02-28 OK
51 A123 2019-03-31 OK
52 A123 2019-04-30 OK
53 A123 2019-05-31 OK
54 A123 2019-06-30 OK
55 A123 2019-07-31 OK
56 A123 2019-08-31 OK
57 A123 2019-09-30 OK
58 A123 2019-10-31 OK
59 A123 2019-11-30 OK
60 A123 2019-12-31 OK
61 A123 2020-01-31 OK
62 A123 2020-02-29 OK
63 A123 2020-03-31 OK
64 A123 2020-04-30 OK
65 A123 2020-05-31 OK
66 A123 2020-06-30 OK
67 A123 2020-07-31 OK
68 A123 2020-08-31 OK
69 A123 2020-09-30 OK
70 A123 2020-10-31 OK
71 A123 2020-11-30 OK
72 A123 2020-12-31 OK
73 A123 2021-01-31 OK
74 A123 2021-02-28 OK
75 A123 2021-03-31 OK
76 A123 2021-04-30 OK
77 A123 2021-05-31 OK
78 A123 2021-06-30 OK
79 B234 2015-01-31 Inadequate
80 B234 2015-02-28 Inadequate
81 B234 2015-03-31 Inadequate
82 B234 2015-04-30 Inadequate
83 B234 2015-05-31 Inadequate
84 B234 2015-06-30 Inadequate
85 B234 2015-07-31 Inadequate
86 B234 2015-08-31 Inadequate
87 B234 2015-09-30 Inadequate
88 B234 2015-10-31 Inadequate
89 B234 2015-11-30 Inadequate
90 B234 2015-12-31 Inadequate
91 B234 2016-01-31 Inadequate
92 B234 2016-02-29 Inadequate
93 B234 2016-03-31 Inadequate
94 B234 2016-04-30 Inadequate
95 B234 2016-05-31 Inadequate
96 B234 2016-06-30 Inadequate
97 B234 2016-07-31 Inadequate
98 B234 2016-08-31 Inadequate
99 B234 2016-09-30 Inadequate
100 B234 2016-10-31 Inadequate
101 B234 2016-11-30 Inadequate
102 B234 2016-12-31 Inadequate
103 B234 2017-01-31 Inadequate
104 B234 2017-02-28 Inadequate
105 B234 2017-03-31 Inadequate
106 B234 2017-04-30 Inadequate
107 B234 2017-05-31 Inadequate
108 B234 2017-06-30 Inadequate
109 B234 2017-07-31 Inadequate
110 B234 2017-08-31 Inadequate
111 B234 2017-09-30 Inadequate
112 B234 2017-10-31 Inadequate
113 B234 2017-11-30 Inadequate
114 B234 2017-12-31 Inadequate
115 B234 2018-01-31 Inadequate
116 B234 2018-02-28 Inadequate
117 B234 2018-03-31 Inadequate
118 B234 2018-04-30 Inadequate
119 B234 2018-05-31 Inadequate
120 B234 2018-06-30 Inadequate
121 B234 2018-07-31 Inadequate
122 B234 2018-08-31 Inadequate
123 B234 2018-09-30 Inadequate
124 B234 2018-10-31 Inadequate
125 B234 2018-11-30 Inadequate
126 B234 2018-12-31 Inadequate
127 B234 2019-01-31 Inadequate
128 B234 2019-02-28 Inadequate
129 B234 2019-03-31 Inadequate
130 B234 2019-04-30 Inadequate
131 B234 2019-05-31 Inadequate
132 B234 2019-06-30 Inadequate
133 B234 2019-07-31 Inadequate
134 B234 2019-08-31 Inadequate
135 B234 2019-09-30 Inadequate
136 B234 2019-10-31 Inadequate
137 B234 2019-11-30 Inadequate
138 B234 2019-12-31 Inadequate
139 B234 2020-01-31 Inadequate
140 B234 2020-02-29 Inadequate
141 B234 2020-03-31 Inadequate
142 B234 2020-04-30 Inadequate
143 B234 2020-05-31 Inadequate
144 B234 2020-06-30 Inadequate
145 B234 2020-07-31 Inadequate
146 B234 2020-08-31 Inadequate
147 B234 2020-09-30 Inadequate
148 B234 2020-10-31 Inadequate
149 B234 2020-11-30 Inadequate
150 B234 2020-12-31 Inadequate
151 B234 2021-01-31 Inadequate
152 B234 2021-02-28 Inadequate
153 B234 2021-03-31 Inadequate
154 B234 2021-04-30 Inadequate
155 B234 2021-05-31 Inadequate
156 B234 2021-06-30 Inadequate
157 C456 2015-01-31 OK
158 C456 2015-02-28 OK
159 C456 2015-03-31 OK
160 C456 2015-04-30 OK
161 C456 2015-05-31 OK
162 C456 2015-06-30 OK
163 C456 2015-07-31 Good
164 C456 2015-08-31 Good
165 C456 2015-09-30 Good
166 C456 2015-10-31 Good
167 C456 2015-11-30 Good
168 C456 2015-12-31 Good
169 C456 2016-01-31 Good
170 C456 2016-02-29 Good
171 C456 2016-03-31 Good
172 C456 2016-04-30 Good
173 C456 2016-05-31 Good
174 C456 2016-06-30 Good
175 C456 2016-07-31 Good
176 C456 2016-08-31 Good
177 C456 2016-09-30 Good
178 C456 2016-10-31 Good
179 C456 2016-11-30 Good
180 C456 2016-12-31 Good
181 C456 2017-01-31 Good
182 C456 2017-02-28 Good
183 C456 2017-03-31 Good
184 C456 2017-04-30 Good
185 C456 2017-05-31 Good
186 C456 2017-06-30 Good
187 C456 2017-07-31 Good
188 C456 2017-08-31 Good
189 C456 2017-09-30 Good
190 C456 2017-10-31 Good
191 C456 2017-11-30 Good
192 C456 2017-12-31 Good
193 C456 2018-01-31 Good
194 C456 2018-02-28 Good
195 C456 2018-03-31 Good
196 C456 2018-04-30 Good
197 C456 2018-05-31 Good
198 C456 2018-06-30 OK
199 C456 2018-07-31 OK
200 C456 2018-08-31 OK
201 C456 2018-09-30 OK
202 C456 2018-10-31 OK
203 C456 2018-11-30 OK
204 C456 2018-12-31 OK
205 C456 2019-01-31 OK
206 C456 2019-02-28 OK
207 C456 2019-03-31 OK
208 C456 2019-04-30 OK
209 C456 2019-05-31 OK
210 C456 2019-06-30 OK
211 C456 2019-07-31 OK
212 C456 2019-08-31 OK
213 C456 2019-09-30 OK
214 C456 2019-10-31 OK
215 C456 2019-11-30 OK
216 C456 2019-12-31 OK
217 C456 2020-01-31 OK
218 C456 2020-02-29 OK
219 C456 2020-03-31 OK
220 C456 2020-04-30 OK
221 C456 2020-05-31 OK
222 C456 2020-06-30 OK
223 C456 2020-07-31 OK
224 C456 2020-08-31 OK
225 C456 2020-09-30 OK
226 C456 2020-10-31 OK
227 C456 2020-11-30 OK
228 C456 2020-12-31 OK
229 C456 2021-01-31 OK
230 C456 2021-02-28 OK
231 C456 2021-03-31 OK
232 C456 2021-04-30 OK
233 C456 2021-05-31 OK
234 C456 2021-06-30 OK
##数据 这是问题中显示的所有转换后的数据。
dataset1 <- structure(list(Organisation = c("A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "A123", "A123", "A123", "A123", "A123", "A123",
"A123", "A123", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"B234", "B234", "B234", "B234", "B234", "B234", "B234", "B234",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456", "C456", "C456",
"C456", "C456", "C456", "C456", "C456", "C456"), Date = structure(c(16466,
16494, 16525, 16555, 16586, 16616, 16647, 16678, 16708, 16739,
16769, 16800, 16831, 16860, 16891, 16921, 16952, 16982, 17013,
17044, 17074, 17105, 17135, 17166, 17197, 17225, 17256, 17286,
17317, 17347, 17378, 17409, 17439, 17470, 17500, 17531, 17562,
17590, 17621, 17651, 17682, 17712, 17743, 17774, 17804, 17835,
17865, 17896, 17927, 17955, 17986, 18016, 18047, 18077, 18108,
18139, 18169, 18200, 18230, 18261, 18292, 18321, 18352, 18382,
18413, 18443, 18474, 18505, 18535, 18566, 18596, 18627, 18658,
18686, 18717, 18747, 18778, 18808, 16466, 16494, 16525, 16555,
16586, 16616, 16647, 16678, 16708, 16739, 16769, 16800, 16831,
16860, 16891, 16921, 16952, 16982, 17013, 17044, 17074, 17105,
17135, 17166, 17197, 17225, 17256, 17286, 17317, 17347, 17378,
17409, 17439, 17470, 17500, 17531, 17562, 17590, 17621, 17651,
17682, 17712, 17743, 17774, 17804, 17835, 17865, 17896, 17927,
17955, 17986, 18016, 18047, 18077, 18108, 18139, 18169, 18200,
18230, 18261, 18292, 18321, 18352, 18382, 18413, 18443, 18474,
18505, 18535, 18566, 18596, 18627, 18658, 18686, 18717, 18747,
18778, 18808, 16466, 16494, 16525, 16555, 16586, 16616, 16647,
16678, 16708, 16739, 16769, 16800, 16831, 16860, 16891, 16921,
16952, 16982, 17013, 17044, 17074, 17105, 17135, 17166, 17197,
17225, 17256, 17286, 17317, 17347, 17378, 17409, 17439, 17470,
17500, 17531, 17562, 17590, 17621, 17651, 17682, 17712, 17743,
17774, 17804, 17835, 17865, 17896, 17927, 17955, 17986, 18016,
18047, 18077, 18108, 18139, 18169, 18200, 18230, 18261, 18292,
18321, 18352, 18382, 18413, 18443, 18474, 18505, 18535, 18566,
18596, 18627, 18658, 18686, 18717, 18747, 18778, 18808), class = "Date")), out.attrs = list(
dim = c(Organisation = 3L, Date = 78L), dimnames = list(Organisation = c("Organisation=A123",
"Organisation=B234", "Organisation=C456"), Date = c("Date=2015-01-31",
"Date=2015-02-28", "Date=2015-03-31", "Date=2015-04-30",
"Date=2015-05-31", "Date=2015-06-30", "Date=2015-07-31",
"Date=2015-08-31", "Date=2015-09-30", "Date=2015-10-31",
"Date=2015-11-30", "Date=2015-12-31", "Date=2016-01-31",
"Date=2016-02-29", "Date=2016-03-31", "Date=2016-04-30",
"Date=2016-05-31", "Date=2016-06-30", "Date=2016-07-31",
"Date=2016-08-31", "Date=2016-09-30", "Date=2016-10-31",
"Date=2016-11-30", "Date=2016-12-31", "Date=2017-01-31",
"Date=2017-02-28", "Date=2017-03-31", "Date=2017-04-30",
"Date=2017-05-31", "Date=2017-06-30", "Date=2017-07-31",
"Date=2017-08-31", "Date=2017-09-30", "Date=2017-10-31",
"Date=2017-11-30", "Date=2017-12-31", "Date=2018-01-31",
"Date=2018-02-28", "Date=2018-03-31", "Date=2018-04-30",
"Date=2018-05-31", "Date=2018-06-30", "Date=2018-07-31",
"Date=2018-08-31", "Date=2018-09-30", "Date=2018-10-31",
"Date=2018-11-30", "Date=2018-12-31", "Date=2019-01-31",
"Date=2019-02-28", "Date=2019-03-31", "Date=2019-04-30",
"Date=2019-05-31", "Date=2019-06-30", "Date=2019-07-31",
"Date=2019-08-31", "Date=2019-09-30", "Date=2019-10-31",
"Date=2019-11-30", "Date=2019-12-31", "Date=2020-01-31",
"Date=2020-02-29", "Date=2020-03-31", "Date=2020-04-30",
"Date=2020-05-31", "Date=2020-06-30", "Date=2020-07-31",
"Date=2020-08-31", "Date=2020-09-30", "Date=2020-10-31",
"Date=2020-11-30", "Date=2020-12-31", "Date=2021-01-31",
"Date=2021-02-28", "Date=2021-03-31", "Date=2021-04-30",
"Date=2021-05-31", "Date=2021-06-30"))), row.names = c(NA,
-234L), class = "data.frame")
dataset2 <- structure(list(Organisation = c("A123", "A123", "B234", "C456",
"C456", "C456", "C456"), Date_inspection = structure(c(16466,
16814, 17331, 16616, 16841, 17669, 18342), class = "Date"), Performance = c("Good",
"OK", "Inadequate", "OK", "Inspected but not rated", "Good",
"OK")), row.names = c(NA, -7L), class = "data.frame")
我相信这可以简化,但会按预期工作:
# Recode Inspected but not rated to an NA of type
# character: clean_df2 => data.frame
clean_df2 <- transform(
with(
dataset2,
dataset2[
rev(
order(
Date_inspection
)
),
]
),
Performance = gsub(
"Inspected but not rated",
NA_character_,
Performance
)
)
# Expand the "dataset2" to months which the ratings
# are considered applicable over:
# inspectionsApplicable => data.frame
inspectionsApplicable <- unique(
data.frame(
do.call(
rbind,
lapply(
with(
clean_df2,
split(
clean_df2,
Organisation
)
),
function(x){
x$Month_inspected <- as.Date(
strftime(
x$Date_inspection,
"%Y-%m-01"
)
)
MaxMonthInData <- as.Date(
strftime(
max(
dataset1$Date[
dataset1$Organisation ==
unique(x$Organisation)
]
),
"%Y-%m-01"
)
)
data.frame(
Organisation = c(
x$Organisation[1],
x$Organisation
),
Months = c(
as.Date(MaxMonthInData),
as.Date(x$Month_inspected, "%Y-%m-%d")
),
Performance = c(
x$Performance[
which.min(
cumsum(
!(
is.na(
x$Performance
)
)
)
)
],
x$Performance
)
)
}
)
),
row.names = NULL
)
)
# Left join the tables, dropping dupes from
# from inspection result data.frame: ir_res => data.frame
ir_res <- merge(
transform(
with(
dataset1,
dataset1[
rev(
order(
Organisation,
Date
)
),
]
),
Months = as.Date(
strftime(
Date,
"%Y-%m-01"
)
)
),
with(
inspectionsApplicable,
inspectionsApplicable[
!(
duplicated(
paste0(
Organisation,
Months
),
fromLast = TRUE
)
),
]
),
by = c(
"Organisation",
"Months"
),
all.x = TRUE
)
# Back fill by group: res_ir2 => data.frame
res_ir2 <- do.call(
rbind,
lapply(
with(
ir_res,
split(
ir_res,
Organisation
)
),
function(x){
y <- with(
x,
x[
rev(
order(
Date
)
),
]
)
transform(
y,
Performance = na.omit(
Performance
)[
cumsum(
!(
is.na(
Performance
)
)
)
]
)
}
)
)
# Order by date and organisation: res => data.frame
res <- data.frame(
with(
res_ir2,
res_ir2[
order(
Organisation,
Date
),
]
),
row.names = NULL
)