用 NAs 按时间和变量 ID 合并两个数据帧
Merge two dataframes by time and variable ID with NAs
我目前正在使用 R 中的一些货币并希望合并或覆盖两个数据集以创建一个。
对于所有货币,我在 DF1
中有 1980 年到 2017 年的数据。对于其中的 16 种货币,我还拥有起始时间各不相同(介于 1970 年到 1975 年之间)、截至 2017 年的数据 DF2
。我想要做的是将 DF2
中的 1970-1980 部分放在 DF1
之上。我想如果我设法将 DF1
合并到 DF2
中,它们都有值(因此 DF1
覆盖 DF2
),我想我会得到相同的结果。但是,并非所有货币的开始日期都完全相同,所以我不能硬编码。
下面是一个向您展示其外观的示例:
Date
为时间变量(月度数据)。 DF1
对应 1980-2017 年的数据,DF2
对应 1970-2017 年的数据。我的目标是根据列 ID 和行 ID 将 DF1
中的值覆盖到 DF2
中。 DF3
是所需的输出:其中不应包含来自 DF1 的 NA,DF1 缺失之处应保留 DF2 的值。
# Reproducible toy inputs. The first column of each frame is the Date key;
# the remaining columns are filled with seeded random normal draws.
# DF1 covers dates 4-9 (six columns), DF2 covers dates 1-12 (four columns).
set.seed(1234)
DF1 <- data.frame(matrix(c(4:9, rnorm(30)), nrow = 6, ncol = 6))
set.seed(4321)
DF2 <- data.frame(matrix(c(1:12, rnorm(36)), nrow = 12, ncol = 4))
names(DF1) <- c("Date", paste0("Currency", c(1, 3, 6, 7, 8)))
names(DF2) <- c("Date", paste0("Currency", 1:3))
# Punch holes into DF1 so that the merge has missing values to deal with.
DF1$Currency3[1:2] <- NA
DF1$Currency1[4:5] <- NA
> DF1
Date Currency1 Currency3 Currency6 Currency7 Currency8
1 4 -1.2070657 NA -0.77625389 -0.8371717 -0.6937202
2 5 0.2774292 NA 0.06445882 2.4158352 -1.4482049
3 6 1.0844412 -0.5644520 0.95949406 0.1340882 0.5747557
4 7 NA -0.8900378 -0.11028549 -0.4906859 -1.0236557
5 8 NA -0.4771927 -0.51100951 -0.4405479 -0.0151383
6 9 0.5060559 -0.9983864 -0.91119542 0.4595894 -0.9359486
> DF2
Date Currency1 Currency2 Currency3
1 1 -0.42675738 -1.260985237 -0.09920208
2 2 -0.22361182 1.139464085 -0.23803425
3 3 0.71760679 -1.221781923 0.04778266
4 4 0.84144567 1.573315888 0.29651274
5 5 -0.12835727 0.073477874 -0.83380992
6 6 1.60934721 -1.175115087 -1.37397000
7 7 -0.29716745 -1.588261899 0.14027895
8 8 0.19600465 -0.747380729 0.66212596
9 9 1.24074620 0.483521864 1.13103967
10 10 -0.71869815 -0.003025539 -0.47511202
11 11 -0.06723632 -0.008930402 0.85241411
12 12 0.34436710 0.593357619 -0.75151885
我从一个用户那里得到了这个答案,但我遇到了一个新问题,DF1
中的一些数据中有 NA
s,这段代码将其覆盖到 DF2
.
library(data.table)
# Work on a copy so that DF2 itself is left untouched by setDT() and `:=`.
DF3 <- copy(DF2)
# Every DF1 column except the first one (the Date key).
nm1 <- names(DF1)[-1]
# Update join on Date: in each DF3 row that matches a DF1 row, every column in
# nm1 is assigned the DF1 value (the "i."-prefixed columns). The assignment is
# unconditional, so an NA in DF1 is written over the value that DF3 held.
setDT(DF3)[DF1, (nm1) := mget(paste0("i.", nm1)), on = .(Date)]
> DF3
Date Currency1 Currency2 Currency3 Currency6 Currency7 Currency8
1: 1 1.1022975 -0.8553646 -0.162309524 NA NA NA
2: 2 -0.4755931 -0.2806230 0.563055819 NA NA NA
3: 3 -0.7094400 -0.9943401 1.647817473 NA NA NA
4: 4 -1.2070657 -0.9685143 NA -0.77625389 -0.8371717 -0.6937202
5: 5 0.2774292 -1.1073182 NA 0.06445882 2.4158352 -1.4482049
6: 6 1.0844412 -1.2519859 -0.564451999 0.95949406 0.1340882 0.5747557
7: 7 NA -0.5238281 -0.890037829 -0.11028549 -0.4906859 -1.0236557
8: 8 NA -0.4968500 -0.477192700 -0.51100951 -0.4405479 -0.0151383
9: 9 0.5060559 -1.8060313 -0.998386445 -0.91119542 0.4595894 -0.9359486
10: 10 -0.4658975 -0.5820759 -0.669633580 NA NA NA
11: 11 1.4494963 -1.1088896 -0.007604756 NA NA NA
12: 12 -1.0686427 -1.0149620 1.777084448 NA NA NA
要仅替换 DF3
中在 DF1
中具有非缺失对应值的值,您可以使用 ifelse
.
使用:
# Work on a copy so that DF2 itself is left untouched by setDT() and `:=`.
DF3 <- copy(DF2)
# nm1: every DF1 column except the first one (the Date key).
# nm2: all DF2 column names, used to tell shared columns from DF1-only ones.
nm1 <- names(DF1)[-1]
nm2 <- names(DF2)
# Update join on Date. For each column in nm1:
#   - where the DF1 value ("i." prefix) is NA and the column also exists in
#     DF2, keep the value already in DF3 ("x." prefix);
#   - otherwise take the DF1 value.
# For a column that exists only in DF1 the test is FALSE for every row, so the
# "x." lookup is never evaluated and the DF1 values are taken as they are.
setDT(DF3)[DF1, (nm1) := {n <- seq_along(nm1);
lapply(n, function(i) ifelse(is.na(get(paste0("i.", nm1[i]))) & nm1[i] %in% nm2,
get(paste0("x.", nm1[i])),
get(paste0("i.", nm1[i]))))},
on = .(Date)]
你得到:
> DF3
Date Currency1 Currency2 Currency3 Currency6 Currency7 Currency8
1: 1 -0.42675738 -1.260985237 -0.09920208 NA NA NA
2: 2 -0.22361182 1.139464085 -0.23803425 NA NA NA
3: 3 0.71760679 -1.221781923 0.04778266 NA NA NA
4: 4 -1.20706575 1.573315888 0.29651274 -0.77625389 -0.8371717 -0.6937202
5: 5 0.27742924 0.073477874 -0.83380992 0.06445882 2.4158352 -1.4482049
6: 6 1.08444118 -1.175115087 -0.56445200 0.95949406 0.1340882 0.5747557
7: 7 -0.29716745 -1.588261899 -0.89003783 -0.11028549 -0.4906859 -1.0236557
8: 8 0.19600465 -0.747380729 -0.47719270 -0.51100951 -0.4405479 -0.0151383
9: 9 0.50605589 0.483521864 -0.99838644 -0.91119542 NA -0.9359486
10: 10 -0.71869815 -0.003025539 -0.47511202 NA NA NA
11: 11 -0.06723632 -0.008930402 0.85241411 NA NA NA
12: 12 0.34436710 0.593357619 -0.75151885 NA NA NA
已用数据:
# Reproducible toy inputs. The first column of each frame is the Date key;
# the remaining columns are filled with seeded random normal draws.
# DF1 covers dates 4-9 (six columns), DF2 covers dates 1-12 (four columns).
set.seed(1234)
DF1 <- data.frame(matrix(c(4:9, rnorm(30)), nrow = 6, ncol = 6))
set.seed(4321)
DF2 <- data.frame(matrix(c(1:12, rnorm(36)), nrow = 12, ncol = 4))
names(DF1) <- c("Date", paste0("Currency", c(1, 3, 6, 7, 8)))
names(DF2) <- c("Date", paste0("Currency", 1:3))
# Punch holes into DF1, including one in a column that DF2 does not have.
DF1$Currency3[1:2] <- NA
DF1$Currency1[4:5] <- NA
DF1$Currency7[nrow(DF1)] <- NA
1) sqldf 使用 coalesce
执行 DF1
和 DF2
的左连接,以挑选出第一个非缺失的参数:
library(sqldf)
# The left join keeps every DF2 row. coalesce() returns its first non-NULL
# argument, so the DF1 value is used when present and the DF2 value otherwise.
# Currency2 is selected from DF2 only.
sqldf("select Date,
coalesce(DF1.Currency1, DF2.Currency1) Currency1,
DF2.Currency2,
coalesce(DF1.Currency3, DF2.Currency3) Currency3
from DF2 left join DF1 using (Date)")
给予:
Date Currency1 Currency2 Currency3
1 1 1.12493092 -0.15579551 1.1000254
2 2 -0.04493361 -1.47075238 0.7631757
3 3 -0.01619026 -0.47815006 -0.1645236
4 4 -0.83562860 0.41794156 -0.2533617
5 5 1.59528080 1.35867955 0.6969634
6 6 0.32950780 -0.10278773 1.5117812
7 7 -0.82046840 0.38767161 0.3898432
8 8 0.48742910 -0.05380504 -0.6212406
9 9 0.73832470 -1.37705956 -2.2146999
10 10 -1.98935170 -0.41499456 0.7685329
11 11 0.61982575 -0.39428995 -0.1123462
12 12 -0.05612874 -0.05931340 0.8811077
名称可以这样参数化:
# Hold the column names in R variables, taken from DF2 by position.
Date <- names(DF2)[1]
Currency1 <- names(DF2)[2]
Currency2 <- names(DF2)[3]
Currency3 <- names(DF2)[4]
# The fn$ prefix substitutes each $name in the SQL string with the value of the
# R variable of that name before the query is run. The query is otherwise the
# same left join with coalesce() as in the fixed-name version.
fn$sqldf("select [$Date],
coalesce(DF1.[$Currency1], DF2.[$Currency1]) [$Currency1],
DF2.[$Currency2],
coalesce(DF1.[$Currency3], DF2.[$Currency3]) [$Currency3]
from DF2 left join DF1 using ([$Date])")
只有在任意名称包含特殊字符的情况下才需要方括号。如果已知情况并非如此,则可以省略方括号。
另一种参数化方法是将数据框的列重命名为标准名称,使用标准名称进行计算,然后恢复原始名称。这里 DF1
和 DF2
具有任意列名称,而 DF1x
和 DF2x
具有上面使用的标准名称。这种方法可以更普遍地工作(例如,它也可以与 (2) 中所示的 dplyr 代码一起工作)。
# Rename both frames to the standard column names used in the query, run the
# query on the renamed copies, then restore the original names on the result.
# With the inputs used for this answer, DF1 has three columns (Date plus two
# currencies) and DF2 has four, so each setNames() call must supply exactly as
# many names as its frame has columns.
DF1x <- setNames(DF1, c("Date", "Currency1", "Currency3"))
DF2x <- setNames(DF2, c("Date", "Currency1", "Currency2", "Currency3"))
# The left join keeps every DF2x row; coalesce() prefers the DF1x value and
# falls back to the DF2x value when the DF1x one is missing.
s <- sqldf("select Date,
coalesce(DF1x.Currency1, DF2x.Currency1) Currency1,
DF2x.Currency2,
coalesce(DF1x.Currency3, DF2x.Currency3) Currency3
from DF2x left join DF1x using (Date)")
# The result has the same column layout as DF2, so DF2's original names apply.
names(s) <- names(DF2)
2) dplyr dplyr 可以用基本相同的方式来做,甚至有一个 coalesce
函数:
library(dplyr)
# Left-join DF1 onto DF2 by Date, then build the output columns. Columns that
# exist in both inputs come back from the join with a ".x" (DF2) and a ".y"
# (DF1) suffix; coalesce() takes the DF1 value first and falls back to the DF2
# value where DF1 is NA.
transmute(
  left_join(DF2, DF1, by = "Date"),
  Date,
  Currency1 = coalesce(Currency1.y, Currency1.x),
  Currency2,
  Currency3 = coalesce(Currency3.y, Currency3.x)
)
给予:
Date Currency1 Currency2 Currency3
1 1 1.12493092 -0.15579551 1.1000254
2 2 -0.04493361 -1.47075238 0.7631757
3 3 -0.01619026 -0.47815006 -0.1645236
4 4 -0.83562860 0.41794156 -0.2533617
5 5 1.59528080 1.35867955 0.6969634
6 6 0.32950780 -0.10278773 1.5117812
7 7 -0.82046840 0.38767161 0.3898432
8 8 0.48742910 -0.05380504 -0.6212406
9 9 0.73832470 -1.37705956 -2.2146999
10 10 -1.98935170 -0.41499456 0.7685329
11 11 0.61982575 -0.39428995 -0.1123462
12 12 -0.05612874 -0.05931340 0.8811077
注意: 定义问题输入的原始代码没有使用 set.seed
使其不可重现,因此我们使用了下面这份与问题中最初显示的输入相对应的数据。(此后问题已修正,添加了 set.seed
。)
# Fixed inputs matching the values originally shown in the question.
# DF1: dates 4-9, two currencies, with the first two Currency3 values missing.
DF1 <- data.frame(
  Date = 4:9,
  Currency1 = c(-0.8356286, 1.5952808, 0.3295078, -0.8204684, 0.4874291, 0.7383247),
  Currency3 = c(NA, NA, 1.5117812, 0.3898432, -0.6212406, -2.2146999),
  row.names = as.character(1:6)
)
# DF2: dates 1-12, three currencies, no missing values.
DF2 <- data.frame(
  Date = 1:12,
  Currency1 = c(
    1.12493092, -0.04493361, -0.01619026, 0.94383621, 0.8212212, 0.59390132,
    0.91897737, 0.7821363, 0.07456498, -1.9893517, 0.61982575, -0.05612874
  ),
  Currency2 = c(
    -0.15579551, -1.47075238, -0.47815006, 0.41794156, 1.35867955, -0.10278773,
    0.38767161, -0.05380504, -1.37705956, -0.41499456, -0.39428995, -0.0593134
  ),
  Currency3 = c(
    1.1000254, 0.7631757, -0.1645236, -0.2533617, 0.6969634, 0.5566632,
    -0.6887557, -0.7074952, 0.364582, 0.7685329, -0.1123462, 0.8811077
  ),
  row.names = as.character(1:12)
)
我目前正在使用 R 中的一些货币并希望合并或覆盖两个数据集以创建一个。
对于所有货币,我在 DF1
中有 1980 年到 2017 年的数据。对于其中的 16 种货币,我还拥有起始时间各不相同(介于 1970 年到 1975 年之间)、截至 2017 年的数据 DF2
。我想要做的是将 DF2
中的 1970-1980 部分放在 DF1
之上。我想如果我设法将 DF1
合并到 DF2
中,它们都有值(因此 DF1
覆盖 DF2
),我想我会得到相同的结果。但是,并非所有货币的开始日期都完全相同,所以我不能硬编码。
下面是一个向您展示其外观的示例:
Date
为时间变量(月度数据)。 DF1
对应 1980-2017 年的数据,DF2
对应 1970-2017 年的数据。我的目标是根据列 ID 和行 ID 将 DF1
中的值覆盖到 DF2
中。 DF3
是所需的输出:其中不应包含来自 DF1 的 NA,DF1 缺失之处应保留 DF2 的值。
# Reproducible toy inputs. The first column of each frame is the Date key;
# the remaining columns are filled with seeded random normal draws.
# DF1 covers dates 4-9 (six columns), DF2 covers dates 1-12 (four columns).
set.seed(1234)
DF1 <- data.frame(matrix(c(4:9, rnorm(30)), nrow = 6, ncol = 6))
set.seed(4321)
DF2 <- data.frame(matrix(c(1:12, rnorm(36)), nrow = 12, ncol = 4))
names(DF1) <- c("Date", paste0("Currency", c(1, 3, 6, 7, 8)))
names(DF2) <- c("Date", paste0("Currency", 1:3))
# Punch holes into DF1 so that the merge has missing values to deal with.
DF1$Currency3[1:2] <- NA
DF1$Currency1[4:5] <- NA
> DF1
Date Currency1 Currency3 Currency6 Currency7 Currency8
1 4 -1.2070657 NA -0.77625389 -0.8371717 -0.6937202
2 5 0.2774292 NA 0.06445882 2.4158352 -1.4482049
3 6 1.0844412 -0.5644520 0.95949406 0.1340882 0.5747557
4 7 NA -0.8900378 -0.11028549 -0.4906859 -1.0236557
5 8 NA -0.4771927 -0.51100951 -0.4405479 -0.0151383
6 9 0.5060559 -0.9983864 -0.91119542 0.4595894 -0.9359486
> DF2
Date Currency1 Currency2 Currency3
1 1 -0.42675738 -1.260985237 -0.09920208
2 2 -0.22361182 1.139464085 -0.23803425
3 3 0.71760679 -1.221781923 0.04778266
4 4 0.84144567 1.573315888 0.29651274
5 5 -0.12835727 0.073477874 -0.83380992
6 6 1.60934721 -1.175115087 -1.37397000
7 7 -0.29716745 -1.588261899 0.14027895
8 8 0.19600465 -0.747380729 0.66212596
9 9 1.24074620 0.483521864 1.13103967
10 10 -0.71869815 -0.003025539 -0.47511202
11 11 -0.06723632 -0.008930402 0.85241411
12 12 0.34436710 0.593357619 -0.75151885
我从一个用户那里得到了这个答案,但我遇到了一个新问题,DF1
中的一些数据中有 NA
s,这段代码将其覆盖到 DF2
.
library(data.table)
# Work on a copy so that DF2 itself is left untouched by setDT() and `:=`.
DF3 <- copy(DF2)
# Every DF1 column except the first one (the Date key).
nm1 <- names(DF1)[-1]
# Update join on Date: in each DF3 row that matches a DF1 row, every column in
# nm1 is assigned the DF1 value (the "i."-prefixed columns). The assignment is
# unconditional, so an NA in DF1 is written over the value that DF3 held.
setDT(DF3)[DF1, (nm1) := mget(paste0("i.", nm1)), on = .(Date)]
> DF3
Date Currency1 Currency2 Currency3 Currency6 Currency7 Currency8
1: 1 1.1022975 -0.8553646 -0.162309524 NA NA NA
2: 2 -0.4755931 -0.2806230 0.563055819 NA NA NA
3: 3 -0.7094400 -0.9943401 1.647817473 NA NA NA
4: 4 -1.2070657 -0.9685143 NA -0.77625389 -0.8371717 -0.6937202
5: 5 0.2774292 -1.1073182 NA 0.06445882 2.4158352 -1.4482049
6: 6 1.0844412 -1.2519859 -0.564451999 0.95949406 0.1340882 0.5747557
7: 7 NA -0.5238281 -0.890037829 -0.11028549 -0.4906859 -1.0236557
8: 8 NA -0.4968500 -0.477192700 -0.51100951 -0.4405479 -0.0151383
9: 9 0.5060559 -1.8060313 -0.998386445 -0.91119542 0.4595894 -0.9359486
10: 10 -0.4658975 -0.5820759 -0.669633580 NA NA NA
11: 11 1.4494963 -1.1088896 -0.007604756 NA NA NA
12: 12 -1.0686427 -1.0149620 1.777084448 NA NA NA
要仅替换 DF3
中在 DF1
中具有非缺失对应值的值,您可以使用 ifelse
.
使用:
# Work on a copy so that DF2 itself is left untouched by setDT() and `:=`.
DF3 <- copy(DF2)
# nm1: every DF1 column except the first one (the Date key).
# nm2: all DF2 column names, used to tell shared columns from DF1-only ones.
nm1 <- names(DF1)[-1]
nm2 <- names(DF2)
# Update join on Date. For each column in nm1:
#   - where the DF1 value ("i." prefix) is NA and the column also exists in
#     DF2, keep the value already in DF3 ("x." prefix);
#   - otherwise take the DF1 value.
# For a column that exists only in DF1 the test is FALSE for every row, so the
# "x." lookup is never evaluated and the DF1 values are taken as they are.
setDT(DF3)[DF1, (nm1) := {n <- seq_along(nm1);
lapply(n, function(i) ifelse(is.na(get(paste0("i.", nm1[i]))) & nm1[i] %in% nm2,
get(paste0("x.", nm1[i])),
get(paste0("i.", nm1[i]))))},
on = .(Date)]
你得到:
> DF3
Date Currency1 Currency2 Currency3 Currency6 Currency7 Currency8
1: 1 -0.42675738 -1.260985237 -0.09920208 NA NA NA
2: 2 -0.22361182 1.139464085 -0.23803425 NA NA NA
3: 3 0.71760679 -1.221781923 0.04778266 NA NA NA
4: 4 -1.20706575 1.573315888 0.29651274 -0.77625389 -0.8371717 -0.6937202
5: 5 0.27742924 0.073477874 -0.83380992 0.06445882 2.4158352 -1.4482049
6: 6 1.08444118 -1.175115087 -0.56445200 0.95949406 0.1340882 0.5747557
7: 7 -0.29716745 -1.588261899 -0.89003783 -0.11028549 -0.4906859 -1.0236557
8: 8 0.19600465 -0.747380729 -0.47719270 -0.51100951 -0.4405479 -0.0151383
9: 9 0.50605589 0.483521864 -0.99838644 -0.91119542 NA -0.9359486
10: 10 -0.71869815 -0.003025539 -0.47511202 NA NA NA
11: 11 -0.06723632 -0.008930402 0.85241411 NA NA NA
12: 12 0.34436710 0.593357619 -0.75151885 NA NA NA
已用数据:
# Reproducible toy inputs. The first column of each frame is the Date key;
# the remaining columns are filled with seeded random normal draws.
# DF1 covers dates 4-9 (six columns), DF2 covers dates 1-12 (four columns).
set.seed(1234)
DF1 <- data.frame(matrix(c(4:9, rnorm(30)), nrow = 6, ncol = 6))
set.seed(4321)
DF2 <- data.frame(matrix(c(1:12, rnorm(36)), nrow = 12, ncol = 4))
names(DF1) <- c("Date", paste0("Currency", c(1, 3, 6, 7, 8)))
names(DF2) <- c("Date", paste0("Currency", 1:3))
# Punch holes into DF1, including one in a column that DF2 does not have.
DF1$Currency3[1:2] <- NA
DF1$Currency1[4:5] <- NA
DF1$Currency7[nrow(DF1)] <- NA
1) sqldf 使用 coalesce
执行 DF1
和 DF2
的左连接,以挑选出第一个非缺失的参数:
library(sqldf)
# The left join keeps every DF2 row. coalesce() returns its first non-NULL
# argument, so the DF1 value is used when present and the DF2 value otherwise.
# Currency2 is selected from DF2 only.
sqldf("select Date,
coalesce(DF1.Currency1, DF2.Currency1) Currency1,
DF2.Currency2,
coalesce(DF1.Currency3, DF2.Currency3) Currency3
from DF2 left join DF1 using (Date)")
给予:
Date Currency1 Currency2 Currency3
1 1 1.12493092 -0.15579551 1.1000254
2 2 -0.04493361 -1.47075238 0.7631757
3 3 -0.01619026 -0.47815006 -0.1645236
4 4 -0.83562860 0.41794156 -0.2533617
5 5 1.59528080 1.35867955 0.6969634
6 6 0.32950780 -0.10278773 1.5117812
7 7 -0.82046840 0.38767161 0.3898432
8 8 0.48742910 -0.05380504 -0.6212406
9 9 0.73832470 -1.37705956 -2.2146999
10 10 -1.98935170 -0.41499456 0.7685329
11 11 0.61982575 -0.39428995 -0.1123462
12 12 -0.05612874 -0.05931340 0.8811077
名称可以这样参数化:
# Hold the column names in R variables, taken from DF2 by position.
Date <- names(DF2)[1]
Currency1 <- names(DF2)[2]
Currency2 <- names(DF2)[3]
Currency3 <- names(DF2)[4]
# The fn$ prefix substitutes each $name in the SQL string with the value of the
# R variable of that name before the query is run. The query is otherwise the
# same left join with coalesce() as in the fixed-name version.
fn$sqldf("select [$Date],
coalesce(DF1.[$Currency1], DF2.[$Currency1]) [$Currency1],
DF2.[$Currency2],
coalesce(DF1.[$Currency3], DF2.[$Currency3]) [$Currency3]
from DF2 left join DF1 using ([$Date])")
只有在任意名称包含特殊字符的情况下才需要方括号。如果已知情况并非如此,则可以省略方括号。
另一种参数化方法是将数据框的列重命名为标准名称,使用标准名称进行计算,然后恢复原始名称。这里 DF1
和 DF2
具有任意列名称,而 DF1x
和 DF2x
具有上面使用的标准名称。这种方法可以更普遍地工作(例如,它也可以与 (2) 中所示的 dplyr 代码一起工作)。
# Rename both frames to the standard column names used in the query, run the
# query on the renamed copies, then restore the original names on the result.
# With the inputs used for this answer, DF1 has three columns (Date plus two
# currencies) and DF2 has four, so each setNames() call must supply exactly as
# many names as its frame has columns.
DF1x <- setNames(DF1, c("Date", "Currency1", "Currency3"))
DF2x <- setNames(DF2, c("Date", "Currency1", "Currency2", "Currency3"))
# The left join keeps every DF2x row; coalesce() prefers the DF1x value and
# falls back to the DF2x value when the DF1x one is missing.
s <- sqldf("select Date,
coalesce(DF1x.Currency1, DF2x.Currency1) Currency1,
DF2x.Currency2,
coalesce(DF1x.Currency3, DF2x.Currency3) Currency3
from DF2x left join DF1x using (Date)")
# The result has the same column layout as DF2, so DF2's original names apply.
names(s) <- names(DF2)
2) dplyr dplyr 可以用基本相同的方式来做,甚至有一个 coalesce
函数:
library(dplyr)
# Left-join DF1 onto DF2 by Date, then build the output columns. Columns that
# exist in both inputs come back from the join with a ".x" (DF2) and a ".y"
# (DF1) suffix; coalesce() takes the DF1 value first and falls back to the DF2
# value where DF1 is NA.
transmute(
  left_join(DF2, DF1, by = "Date"),
  Date,
  Currency1 = coalesce(Currency1.y, Currency1.x),
  Currency2,
  Currency3 = coalesce(Currency3.y, Currency3.x)
)
给予:
Date Currency1 Currency2 Currency3
1 1 1.12493092 -0.15579551 1.1000254
2 2 -0.04493361 -1.47075238 0.7631757
3 3 -0.01619026 -0.47815006 -0.1645236
4 4 -0.83562860 0.41794156 -0.2533617
5 5 1.59528080 1.35867955 0.6969634
6 6 0.32950780 -0.10278773 1.5117812
7 7 -0.82046840 0.38767161 0.3898432
8 8 0.48742910 -0.05380504 -0.6212406
9 9 0.73832470 -1.37705956 -2.2146999
10 10 -1.98935170 -0.41499456 0.7685329
11 11 0.61982575 -0.39428995 -0.1123462
12 12 -0.05612874 -0.05931340 0.8811077
注意: 定义问题输入的原始代码没有使用 set.seed
使其不可重现,因此我们使用了下面这份与问题中最初显示的输入相对应的数据。(此后问题已修正,添加了 set.seed
。)
# Fixed inputs matching the values originally shown in the question.
# DF1: dates 4-9, two currencies, with the first two Currency3 values missing.
DF1 <- data.frame(
  Date = 4:9,
  Currency1 = c(-0.8356286, 1.5952808, 0.3295078, -0.8204684, 0.4874291, 0.7383247),
  Currency3 = c(NA, NA, 1.5117812, 0.3898432, -0.6212406, -2.2146999),
  row.names = as.character(1:6)
)
# DF2: dates 1-12, three currencies, no missing values.
DF2 <- data.frame(
  Date = 1:12,
  Currency1 = c(
    1.12493092, -0.04493361, -0.01619026, 0.94383621, 0.8212212, 0.59390132,
    0.91897737, 0.7821363, 0.07456498, -1.9893517, 0.61982575, -0.05612874
  ),
  Currency2 = c(
    -0.15579551, -1.47075238, -0.47815006, 0.41794156, 1.35867955, -0.10278773,
    0.38767161, -0.05380504, -1.37705956, -0.41499456, -0.39428995, -0.0593134
  ),
  Currency3 = c(
    1.1000254, 0.7631757, -0.1645236, -0.2533617, 0.6969634, 0.5566632,
    -0.6887557, -0.7074952, 0.364582, 0.7685329, -0.1123462, 0.8811077
  ),
  row.names = as.character(1:12)
)