在 R 中,在数据框已有的各个日期之间补全缺失的日期,并在这些新增行的其他列中填入空值(NA)
Add date points between separate dates in a dataframe and create blanks (NA) in the other columns where those new rows were created, in R
我的数据是这样的:
> dput(head(h01_NDVI_specveg_data_spectra,6))
structure(list(ID = c("h01", "h01", "h01", "h01", "h01", "h01"
), collection_date = structure(c(15076, 15092, 15125, 15139,
15159, 15170), class = "Date"), NDVI = c(0.581769436997319, 0.539445628997868,
0.338541666666667, 0.302713987473904, 0.305882352941176, 0.269439421338155
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
我的日期是零散的、不连续的,如您在示例中所见(例如:2011-04-12;2011-04-28;2011-05-31...)。我想要的是在我已有的日期之间插入缺少的日期。除此之外,我还想为其他列创建相应的新行,对于 NDVI,这些行的值将是 NA。
检查所需输出的示例:
ID
collection_date
NDVI
h01
2011-04-12
0.5817694
h01
2011-04-13
NA
h01
2011-04-14
NA
h01
2011-04-15
NA
h01
2011-04-16
NA
h01
2011-04-17
NA
h01
2011-04-18
NA
h01
2011-04-19
NA
h01
2011-04-20
NA
h01
2011-04-21
NA
h01
2011-04-22
NA
h01
2011-04-23
NA
h01
2011-04-24
NA
h01
2011-04-25
NA
h01
2011-04-26
NA
h01
2011-04-27
NA
h01
2011-04-28
0.5394456
h01
2011-04-29
NA
h01
2011-04-30
NA
...
..........
..
任何帮助将不胜感激。
# Example data: six NDVI observations for ID "h01" on irregularly spaced
# collection dates, stored as a plain data.frame with default row names.
df1 <- data.frame(
  ID = rep("h01", 6),
  collection_date = as.Date(c(
    "2011-04-12", "2011-04-28", "2011-05-31",
    "2011-06-14", "2011-07-04", "2011-07-15"
  )),
  NDVI = c(
    0.581769436997319, 0.539445628997868, 0.338541666666667,
    0.302713987473904, 0.305882352941176, 0.269439421338155
  ),
  stringsAsFactors = FALSE
)
我们创建一个包含所有日期的 data.frame,然后用 dplyr::left_join 将它与现有的(不完整的)数据连接起来。NA 会自动生成。
library(dplyr)
library(tidyr)
# Build one row per calendar day between the first and last observation in
# df1, then left-join the observed rows onto that scaffold. Days without an
# observation get NA in every column that comes from df1 (ID and NDVI).
data.frame(
  collection_date = seq.Date(
    min(df1$collection_date),
    max(df1$collection_date),
    by = "days"
  )
) %>%
  # Name the join key explicitly: this silences the "Joining with `by = ...`"
  # message and prevents an accidental join on any other shared column.
  left_join(df1, by = "collection_date") %>%
  arrange(collection_date) %>%
  # Restore the original column order (ID first).
  select(ID, collection_date, everything())
Returns:
ID collection_date NDVI
1 h01 2011-04-12 0.5817694
2 <NA> 2011-04-13 NA
3 <NA> 2011-04-14 NA
4 <NA> 2011-04-15 NA
5 <NA> 2011-04-16 NA
6 <NA> 2011-04-17 NA
7 <NA> 2011-04-18 NA
8 <NA> 2011-04-19 NA
9 <NA> 2011-04-20 NA
10 <NA> 2011-04-21 NA
11 <NA> 2011-04-22 NA
12 <NA> 2011-04-23 NA
13 <NA> 2011-04-24 NA
14 <NA> 2011-04-25 NA
15 <NA> 2011-04-26 NA
16 <NA> 2011-04-27 NA
17 h01 2011-04-28 0.5394456
18 <NA> 2011-04-29 NA
19 <NA> 2011-04-30 NA
20 <NA> 2011-05-01 NA
21 <NA> 2011-05-02 NA
22 <NA> 2011-05-03 NA
23 <NA> 2011-05-04 NA
24 <NA> 2011-05-05 NA
25 <NA> 2011-05-06 NA
26 <NA> 2011-05-07 NA
27 <NA> 2011-05-08 NA
28 <NA> 2011-05-09 NA
29 <NA> 2011-05-10 NA
30 <NA> 2011-05-11 NA
31 <NA> 2011-05-12 NA
32 <NA> 2011-05-13 NA
33 <NA> 2011-05-14 NA
34 <NA> 2011-05-15 NA
35 <NA> 2011-05-16 NA
36 <NA> 2011-05-17 NA
37 <NA> 2011-05-18 NA
38 <NA> 2011-05-19 NA
39 <NA> 2011-05-20 NA
40 <NA> 2011-05-21 NA
41 <NA> 2011-05-22 NA
42 <NA> 2011-05-23 NA
43 <NA> 2011-05-24 NA
44 <NA> 2011-05-25 NA
45 <NA> 2011-05-26 NA
46 <NA> 2011-05-27 NA
47 <NA> 2011-05-28 NA
48 <NA> 2011-05-29 NA
49 <NA> 2011-05-30 NA
50 h01 2011-05-31 0.3385417
51 <NA> 2011-06-01 NA
52 <NA> 2011-06-02 NA
53 <NA> 2011-06-03 NA
54 <NA> 2011-06-04 NA
55 <NA> 2011-06-05 NA
56 <NA> 2011-06-06 NA
57 <NA> 2011-06-07 NA
58 <NA> 2011-06-08 NA
59 <NA> 2011-06-09 NA
60 <NA> 2011-06-10 NA
61 <NA> 2011-06-11 NA
62 <NA> 2011-06-12 NA
63 <NA> 2011-06-13 NA
64 h01 2011-06-14 0.3027140
65 <NA> 2011-06-15 NA
66 <NA> 2011-06-16 NA
67 <NA> 2011-06-17 NA
68 <NA> 2011-06-18 NA
69 <NA> 2011-06-19 NA
70 <NA> 2011-06-20 NA
71 <NA> 2011-06-21 NA
72 <NA> 2011-06-22 NA
73 <NA> 2011-06-23 NA
74 <NA> 2011-06-24 NA
75 <NA> 2011-06-25 NA
76 <NA> 2011-06-26 NA
77 <NA> 2011-06-27 NA
78 <NA> 2011-06-28 NA
79 <NA> 2011-06-29 NA
80 <NA> 2011-06-30 NA
81 <NA> 2011-07-01 NA
82 <NA> 2011-07-02 NA
83 <NA> 2011-07-03 NA
84 h01 2011-07-04 0.3058824
85 <NA> 2011-07-05 NA
86 <NA> 2011-07-06 NA
87 <NA> 2011-07-07 NA
88 <NA> 2011-07-08 NA
89 <NA> 2011-07-09 NA
90 <NA> 2011-07-10 NA
91 <NA> 2011-07-11 NA
92 <NA> 2011-07-12 NA
93 <NA> 2011-07-13 NA
94 <NA> 2011-07-14 NA
95 h01 2011-07-15 0.2694394
编辑:
为了让每一行都带有 ID = "h01",我们只需将其添加到构造的 data.frame 中即可。即:
library(dplyr)
library(tidyr)
# Same daily scaffold as before, but carrying ID = "h01" on every row so the
# inserted days keep their ID; only NDVI is NA on days without an observation.
data.frame(
  collection_date = seq.Date(
    min(df1$collection_date),
    max(df1$collection_date),
    by = "days"
  ),
  ID = "h01",
  # Keep ID as character on R < 4.0 too, so it matches df1$ID in the join
  # instead of triggering a factor/character coercion warning.
  stringsAsFactors = FALSE
) %>%
  # Join on both shared columns explicitly rather than relying on the
  # implicit natural join (which also prints a "Joining with" message).
  left_join(df1, by = c("collection_date", "ID")) %>%
  arrange(collection_date) %>%
  # Restore the original column order (ID first).
  select(ID, collection_date, everything())
library(tidyverse)
library(lubridate)
# Example data as a tibble: six NDVI observations for ID "h01".
df <- structure(
  list(
    ID = rep("h01", 6),
    collection_date = structure(
      c(15076, 15092, 15125, 15139, 15159, 15170),
      class = "Date"
    ),
    NDVI = c(
      0.581769436997319, 0.539445628997868, 0.338541666666667,
      0.302713987473904, 0.305882352941176, 0.269439421338155
    )
  ),
  row.names = c(NA, -6L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Daily scaffold over a fixed window (2011-04-10 to 2011-07-16, one row per
# day) with the observed NDVI values joined on; unmatched days get NA.
df2 <- tibble(
  ID = "h01",
  collection_date = seq(ymd("2011-04-10"), ymd("2011-07-16"), by = 1)
) %>%
  left_join(df, by = c("ID", "collection_date"))

head(df2, 10)
输出
# A tibble: 98 x 3
ID collection_date NDVI
<chr> <date> <dbl>
1 h01 2011-04-10 NA
2 h01 2011-04-11 NA
3 h01 2011-04-12 0.582
4 h01 2011-04-13 NA
5 h01 2011-04-14 NA
6 h01 2011-04-15 NA
7 h01 2011-04-16 NA
8 h01 2011-04-17 NA
9 h01 2011-04-18 NA
10 h01 2011-04-19 NA
# ... with 88 more rows
输出df2 %>% tail(10)
# A tibble: 10 x 3
ID collection_date NDVI
<chr> <date> <dbl>
1 h01 2011-07-07 NA
2 h01 2011-07-08 NA
3 h01 2011-07-09 NA
4 h01 2011-07-10 NA
5 h01 2011-07-11 NA
6 h01 2011-07-12 NA
7 h01 2011-07-13 NA
8 h01 2011-07-14 NA
9 h01 2011-07-15 0.269
10 h01 2011-07-16 NA
您可以使用 tidyr::complete:
library(dplyr)
library(tidyr)
# Within each ID, expand collection_date to a gap-free daily sequence running
# from that ID's earliest to its latest date; columns not named in complete()
# (here NDVI) are NA on the inserted rows.
df %>%
  group_by(ID) %>%
  complete(
    collection_date = seq(
      min(collection_date),
      max(collection_date),
      by = 'days'
    )
  ) %>%
  ungroup()
# ID collection_date NDVI
# <chr> <date> <dbl>
# 1 h01 2011-04-12 0.582
# 2 h01 2011-04-13 NA
# 3 h01 2011-04-14 NA
# 4 h01 2011-04-15 NA
# 5 h01 2011-04-16 NA
# 6 h01 2011-04-17 NA
# 7 h01 2011-04-18 NA
# 8 h01 2011-04-19 NA
# 9 h01 2011-04-20 NA
#10 h01 2011-04-21 NA
#11 h01 2011-04-22 NA
#12 h01 2011-04-23 NA
#13 h01 2011-04-24 NA
#14 h01 2011-04-25 NA
#15 h01 2011-04-26 NA
#16 h01 2011-04-27 NA
#17 h01 2011-04-28 0.539
#18 h01 2011-04-29 NA
#19 h01 2011-04-30 NA
#20 h01 2011-05-01 NA
#...
#...
这种方法的好处是它会根据每个 ID 各自的 min 和 max 为其创建缺失日期。
我的数据是这样的:
> dput(head(h01_NDVI_specveg_data_spectra,6))
structure(list(ID = c("h01", "h01", "h01", "h01", "h01", "h01"
), collection_date = structure(c(15076, 15092, 15125, 15139,
15159, 15170), class = "Date"), NDVI = c(0.581769436997319, 0.539445628997868,
0.338541666666667, 0.302713987473904, 0.305882352941176, 0.269439421338155
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
我的日期是零散的、不连续的,如您在示例中所见(例如:2011-04-12;2011-04-28;2011-05-31...)。我想要的是在我已有的日期之间插入缺少的日期。除此之外,我还想为其他列创建相应的新行,对于 NDVI,这些行的值将是 NA。
检查所需输出的示例:
ID | collection_date | NDVI |
---|---|---|
h01 | 2011-04-12 | 0.5817694 |
h01 | 2011-04-13 | NA |
h01 | 2011-04-14 | NA |
h01 | 2011-04-15 | NA |
h01 | 2011-04-16 | NA |
h01 | 2011-04-17 | NA |
h01 | 2011-04-18 | NA |
h01 | 2011-04-19 | NA |
h01 | 2011-04-20 | NA |
h01 | 2011-04-21 | NA |
h01 | 2011-04-22 | NA |
h01 | 2011-04-23 | NA |
h01 | 2011-04-24 | NA |
h01 | 2011-04-25 | NA |
h01 | 2011-04-26 | NA |
h01 | 2011-04-27 | NA |
h01 | 2011-04-28 | 0.5394456 |
h01 | 2011-04-29 | NA |
h01 | 2011-04-30 | NA |
... | .......... | .. |
任何帮助将不胜感激。
# Example data: six NDVI observations for ID "h01" on irregularly spaced
# collection dates, stored as a plain data.frame with default row names.
df1 <- data.frame(
  ID = rep("h01", 6),
  collection_date = as.Date(c(
    "2011-04-12", "2011-04-28", "2011-05-31",
    "2011-06-14", "2011-07-04", "2011-07-15"
  )),
  NDVI = c(
    0.581769436997319, 0.539445628997868, 0.338541666666667,
    0.302713987473904, 0.305882352941176, 0.269439421338155
  ),
  stringsAsFactors = FALSE
)
我们创建一个包含所有日期的 data.frame,然后用 dplyr::left_join 将它与现有的(不完整的)数据连接起来。NA 会自动生成。
library(dplyr)
library(tidyr)
# Build one row per calendar day between the first and last observation in
# df1, then left-join the observed rows onto that scaffold. Days without an
# observation get NA in every column that comes from df1 (ID and NDVI).
data.frame(
  collection_date = seq.Date(
    min(df1$collection_date),
    max(df1$collection_date),
    by = "days"
  )
) %>%
  # Name the join key explicitly: this silences the "Joining with `by = ...`"
  # message and prevents an accidental join on any other shared column.
  left_join(df1, by = "collection_date") %>%
  arrange(collection_date) %>%
  # Restore the original column order (ID first).
  select(ID, collection_date, everything())
Returns:
ID collection_date NDVI 1 h01 2011-04-12 0.5817694 2 <NA> 2011-04-13 NA 3 <NA> 2011-04-14 NA 4 <NA> 2011-04-15 NA 5 <NA> 2011-04-16 NA 6 <NA> 2011-04-17 NA 7 <NA> 2011-04-18 NA 8 <NA> 2011-04-19 NA 9 <NA> 2011-04-20 NA 10 <NA> 2011-04-21 NA 11 <NA> 2011-04-22 NA 12 <NA> 2011-04-23 NA 13 <NA> 2011-04-24 NA 14 <NA> 2011-04-25 NA 15 <NA> 2011-04-26 NA 16 <NA> 2011-04-27 NA 17 h01 2011-04-28 0.5394456 18 <NA> 2011-04-29 NA 19 <NA> 2011-04-30 NA 20 <NA> 2011-05-01 NA 21 <NA> 2011-05-02 NA 22 <NA> 2011-05-03 NA 23 <NA> 2011-05-04 NA 24 <NA> 2011-05-05 NA 25 <NA> 2011-05-06 NA 26 <NA> 2011-05-07 NA 27 <NA> 2011-05-08 NA 28 <NA> 2011-05-09 NA 29 <NA> 2011-05-10 NA 30 <NA> 2011-05-11 NA 31 <NA> 2011-05-12 NA 32 <NA> 2011-05-13 NA 33 <NA> 2011-05-14 NA 34 <NA> 2011-05-15 NA 35 <NA> 2011-05-16 NA 36 <NA> 2011-05-17 NA 37 <NA> 2011-05-18 NA 38 <NA> 2011-05-19 NA 39 <NA> 2011-05-20 NA 40 <NA> 2011-05-21 NA 41 <NA> 2011-05-22 NA 42 <NA> 2011-05-23 NA 43 <NA> 2011-05-24 NA 44 <NA> 2011-05-25 NA 45 <NA> 2011-05-26 NA 46 <NA> 2011-05-27 NA 47 <NA> 2011-05-28 NA 48 <NA> 2011-05-29 NA 49 <NA> 2011-05-30 NA 50 h01 2011-05-31 0.3385417 51 <NA> 2011-06-01 NA 52 <NA> 2011-06-02 NA 53 <NA> 2011-06-03 NA 54 <NA> 2011-06-04 NA 55 <NA> 2011-06-05 NA 56 <NA> 2011-06-06 NA 57 <NA> 2011-06-07 NA 58 <NA> 2011-06-08 NA 59 <NA> 2011-06-09 NA 60 <NA> 2011-06-10 NA 61 <NA> 2011-06-11 NA 62 <NA> 2011-06-12 NA 63 <NA> 2011-06-13 NA 64 h01 2011-06-14 0.3027140 65 <NA> 2011-06-15 NA 66 <NA> 2011-06-16 NA 67 <NA> 2011-06-17 NA 68 <NA> 2011-06-18 NA 69 <NA> 2011-06-19 NA 70 <NA> 2011-06-20 NA 71 <NA> 2011-06-21 NA 72 <NA> 2011-06-22 NA 73 <NA> 2011-06-23 NA 74 <NA> 2011-06-24 NA 75 <NA> 2011-06-25 NA 76 <NA> 2011-06-26 NA 77 <NA> 2011-06-27 NA 78 <NA> 2011-06-28 NA 79 <NA> 2011-06-29 NA 80 <NA> 2011-06-30 NA 81 <NA> 2011-07-01 NA 82 <NA> 2011-07-02 NA 83 <NA> 2011-07-03 NA 84 h01 2011-07-04 0.3058824 85 <NA> 2011-07-05 NA 86 <NA> 2011-07-06 NA 87 <NA> 2011-07-07 NA 88 <NA> 2011-07-08 NA 89 <NA> 2011-07-09 
NA 90 <NA> 2011-07-10 NA 91 <NA> 2011-07-11 NA 92 <NA> 2011-07-12 NA 93 <NA> 2011-07-13 NA 94 <NA> 2011-07-14 NA 95 h01 2011-07-15 0.2694394
编辑:
为了让每一行都带有 ID = "h01",我们只需将其添加到构造的 data.frame 中即可。即:
library(dplyr)
library(tidyr)
# Same daily scaffold as before, but carrying ID = "h01" on every row so the
# inserted days keep their ID; only NDVI is NA on days without an observation.
data.frame(
  collection_date = seq.Date(
    min(df1$collection_date),
    max(df1$collection_date),
    by = "days"
  ),
  ID = "h01",
  # Keep ID as character on R < 4.0 too, so it matches df1$ID in the join
  # instead of triggering a factor/character coercion warning.
  stringsAsFactors = FALSE
) %>%
  # Join on both shared columns explicitly rather than relying on the
  # implicit natural join (which also prints a "Joining with" message).
  left_join(df1, by = c("collection_date", "ID")) %>%
  arrange(collection_date) %>%
  # Restore the original column order (ID first).
  select(ID, collection_date, everything())
library(tidyverse)
library(lubridate)
# Example data as a tibble: six NDVI observations for ID "h01".
df <- structure(
  list(
    ID = rep("h01", 6),
    collection_date = structure(
      c(15076, 15092, 15125, 15139, 15159, 15170),
      class = "Date"
    ),
    NDVI = c(
      0.581769436997319, 0.539445628997868, 0.338541666666667,
      0.302713987473904, 0.305882352941176, 0.269439421338155
    )
  ),
  row.names = c(NA, -6L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Daily scaffold over a fixed window (2011-04-10 to 2011-07-16, one row per
# day) with the observed NDVI values joined on; unmatched days get NA.
df2 <- tibble(
  ID = "h01",
  collection_date = seq(ymd("2011-04-10"), ymd("2011-07-16"), by = 1)
) %>%
  left_join(df, by = c("ID", "collection_date"))

head(df2, 10)
输出
# A tibble: 98 x 3
ID collection_date NDVI
<chr> <date> <dbl>
1 h01 2011-04-10 NA
2 h01 2011-04-11 NA
3 h01 2011-04-12 0.582
4 h01 2011-04-13 NA
5 h01 2011-04-14 NA
6 h01 2011-04-15 NA
7 h01 2011-04-16 NA
8 h01 2011-04-17 NA
9 h01 2011-04-18 NA
10 h01 2011-04-19 NA
# ... with 88 more rows
输出df2 %>% tail(10)
# A tibble: 10 x 3
ID collection_date NDVI
<chr> <date> <dbl>
1 h01 2011-07-07 NA
2 h01 2011-07-08 NA
3 h01 2011-07-09 NA
4 h01 2011-07-10 NA
5 h01 2011-07-11 NA
6 h01 2011-07-12 NA
7 h01 2011-07-13 NA
8 h01 2011-07-14 NA
9 h01 2011-07-15 0.269
10 h01 2011-07-16 NA
您可以使用 tidyr::complete:
library(dplyr)
library(tidyr)
# Within each ID, expand collection_date to a gap-free daily sequence running
# from that ID's earliest to its latest date; columns not named in complete()
# (here NDVI) are NA on the inserted rows.
df %>%
  group_by(ID) %>%
  complete(
    collection_date = seq(
      min(collection_date),
      max(collection_date),
      by = 'days'
    )
  ) %>%
  ungroup()
# ID collection_date NDVI
# <chr> <date> <dbl>
# 1 h01 2011-04-12 0.582
# 2 h01 2011-04-13 NA
# 3 h01 2011-04-14 NA
# 4 h01 2011-04-15 NA
# 5 h01 2011-04-16 NA
# 6 h01 2011-04-17 NA
# 7 h01 2011-04-18 NA
# 8 h01 2011-04-19 NA
# 9 h01 2011-04-20 NA
#10 h01 2011-04-21 NA
#11 h01 2011-04-22 NA
#12 h01 2011-04-23 NA
#13 h01 2011-04-24 NA
#14 h01 2011-04-25 NA
#15 h01 2011-04-26 NA
#16 h01 2011-04-27 NA
#17 h01 2011-04-28 0.539
#18 h01 2011-04-29 NA
#19 h01 2011-04-30 NA
#20 h01 2011-05-01 NA
#...
#...
这种方法的好处是它会根据每个 ID 各自的 min 和 max 为其创建缺失日期。