通过索引号根据另一个数据框中的多个值向一个数据框中添加一列
Add a column to one data frame based on multiple values in another by index number
我有两个非常大的数据框。第一个数据框包含县名列表及其对应的 FIPS 代码。第二个数据框只包含 FIPS 代码。
我想在第二个数据框中添加两列及其相关的县名。
假设这是 df1
# Lookup table: one row per county, pairing its display name with its FIPS code.
df1 <- data.frame(
  countyname = c(
    "Archuleta County, CO", "Baca County, CO", "Cheyenne County, CO",
    "Kiowa County, CO", "Cimarron County, OK", "Rio Arriba County, NM",
    "Conejos County, CO"
  ),
  fipscounty = c(8007, 8009, 8017, 8061, 35039, 40025, 8021)
)
countyname fipscounty
1 Archuleta County, CO 8007
2 Baca County, CO 8009
3 Cheyenne County, CO 8017
4 Kiowa County, CO 8061
5 Cimarron County, OK 35039
6 Rio Arriba County, NM 40025
7 Conejos County, CO 8021
编辑:这是 df2
# Pair table: each row links two counties by FIPS code, with a distance value.
df2 <- data.frame(
  county1 = c(8007, 8007, 8009, 8017),
  distance = c(4, 3, 2, 1),
  county2 = c(35039, 8021, 40025, 8061)
)
county1 distance county2
1 8007 4 35039
2 8007 3 8021
3 8009 2 40025
4 8017 1 8061
编辑:我希望最终结果如下所示:
countyname fipscounty distance countyneighbor fipscounty2
1 Archuleta County, CO 8007 4 Cimarron County, OK 35039
2 Archuleta County, CO 8007 3 Conejos County, CO 8021
3 Baca County, CO 8009 2 Rio Arriba County, NM 40025
4 Cheyenne County, CO 8017 1 Kiowa County, CO 8061
我想使用 df1 和 df2 的 fips 代码将县名从 df1 转移到 df2。因为它们没有相同的列名,所以我可能必须使用索引号来执行此操作。但是,我不想传输整行,否则我会有重复的 fips 列。
我试过了,当然出错了
# Failing attempt: `by` is given df1[2] (a one-column data frame) rather than
# column names, and the key columns are named differently in df1 and df2.
df2 <- left_join(df1,df2, by= df1[2])
我该怎么做?
可以使用 `match`。
# Row positions in df1 of each FIPS code in df2; NA where a code has no match.
m <- match(df2$county1, df1$fipscounty)
m2 <- match(df2$county2, df1$fipscounty)
# Side-by-side name/code pairs for county1 and its neighbor county2.
res <- cbind(df1[m, ], df1[m2, ])
# Name the code columns explicitly: indexing names(df2) by position breaks
# when df2 has no `distance` column (position 3 is then out of range -> NA).
names(res)[c(2, 4)] <- c("county1", "county2")
res
# countyname county1 countyname county2
# 1 Archuleta County, CO 8007 Cimarron County, OK 35039
# 1.1 Archuleta County, CO 8007 Conejos County, CO 8021
# 2 Baca County, CO 8009 Rio Arriba County, NM 40025
# 3 Cheyenne County, CO 8017 Kiowa County, CO 8061
编辑
根据您的编辑,您可以使用 `merge` 和 `append` 来实现。
# Join the county names onto the pair table via the first FIPS column, then
# put the columns in display order: name, code, distance, neighbor code.
m1 <- merge(df1a, df2a, by.x = "fipscounty", by.y = "county1")
m1 <- m1[c("countyname", "fipscounty", "distance", "county2")]
# Look up each neighbor's name from its FIPS code.
neighbor <- df1a$countyname[match(m1$county2, df1a$fipscounty)]
# Insert the neighbor name after the third column; append() returns a list,
# so convert back to a data frame.
m1 |>
  append(list(countyneighbor = neighbor), after = 3) |>
  as.data.frame()
# countyname fipscounty distance countyneighbor county2
# 1 Archuleta County, CO 8007 4 Cimarron County, OK 35039
# 2 Archuleta County, CO 8007 3 Conejos County, CO 8021
# 3 Baca County, CO 8009 2 Rio Arriba County, NM 40025
# 4 Cheyenne County, CO 8017 1 Kiowa County, CO 8061
注意:以上代码使用了原生管道 `|>`,需要 R >= 4.1。
数据:
# County lookup table (name + FIPS code) used by the `match` answer.
df1 <- data.frame(
  countyname = c(
    "Archuleta County, CO", "Baca County, CO", "Cheyenne County, CO",
    "Kiowa County, CO", "Cimarron County, OK", "Rio Arriba County, NM",
    "Conejos County, CO"
  ),
  fipscounty = c(8007, 8009, 8017, 8061, 35039, 40025, 8021),
  stringsAsFactors = FALSE
)

# Pair table without the distance column.
df2 <- data.frame(
  county1 = c(8007, 8007, 8009, 8017),
  county2 = c(35039, 8021, 40025, 8061)
)

# Same lookup table under the name used by the `merge` answer.
df1a <- data.frame(
  countyname = c(
    "Archuleta County, CO", "Baca County, CO", "Cheyenne County, CO",
    "Kiowa County, CO", "Cimarron County, OK", "Rio Arriba County, NM",
    "Conejos County, CO"
  ),
  fipscounty = c(8007, 8009, 8017, 8061, 35039, 40025, 8021),
  stringsAsFactors = FALSE
)

# Pair table including the distance column.
df2a <- data.frame(
  county1 = c(8007, 8007, 8009, 8017),
  distance = c(4, 3, 2, 1),
  county2 = c(35039, 8021, 40025, 8061)
)
我有两个非常大的数据框。第一个数据框包含县名列表及其对应的 FIPS 代码。第二个数据框只包含 FIPS 代码。
我想在第二个数据框中添加两列及其相关的县名。
假设这是 df1
# Lookup table: one row per county, pairing its display name with its FIPS code.
df1 <- data.frame(
  countyname = c(
    "Archuleta County, CO", "Baca County, CO", "Cheyenne County, CO",
    "Kiowa County, CO", "Cimarron County, OK", "Rio Arriba County, NM",
    "Conejos County, CO"
  ),
  fipscounty = c(8007, 8009, 8017, 8061, 35039, 40025, 8021)
)
countyname fipscounty
1 Archuleta County, CO 8007
2 Baca County, CO 8009
3 Cheyenne County, CO 8017
4 Kiowa County, CO 8061
5 Cimarron County, OK 35039
6 Rio Arriba County, NM 40025
7 Conejos County, CO 8021
编辑:这是 df2
# Pair table: each row links two counties by FIPS code, with a distance value.
df2 <- data.frame(
  county1 = c(8007, 8007, 8009, 8017),
  distance = c(4, 3, 2, 1),
  county2 = c(35039, 8021, 40025, 8061)
)
county1 distance county2
1 8007 4 35039
2 8007 3 8021
3 8009 2 40025
4 8017 1 8061
编辑:我希望最终结果如下所示:
countyname fipscounty distance countyneighbor fipscounty2
1 Archuleta County, CO 8007 4 Cimarron County, OK 35039
2 Archuleta County, CO 8007 3 Conejos County, CO 8021
3 Baca County, CO 8009 2 Rio Arriba County, NM 40025
4 Cheyenne County, CO 8017 1 Kiowa County, CO 8061
我想使用 df1 和 df2 的 fips 代码将县名从 df1 转移到 df2。因为它们没有相同的列名,所以我可能必须使用索引号来执行此操作。但是,我不想传输整行,否则我会有重复的 fips 列。
我试过了,当然出错了
# Failing attempt: `by` is given df1[2] (a one-column data frame) rather than
# column names, and the key columns are named differently in df1 and df2.
df2 <- left_join(df1,df2, by= df1[2])
我该怎么做?
可以使用 `match`。
# Row positions in df1 of each FIPS code in df2; NA where a code has no match.
m <- match(df2$county1, df1$fipscounty)
m2 <- match(df2$county2, df1$fipscounty)
# Side-by-side name/code pairs for county1 and its neighbor county2.
res <- cbind(df1[m, ], df1[m2, ])
# Name the code columns explicitly: indexing names(df2) by position breaks
# when df2 has no `distance` column (position 3 is then out of range -> NA).
names(res)[c(2, 4)] <- c("county1", "county2")
res
# countyname county1 countyname county2
# 1 Archuleta County, CO 8007 Cimarron County, OK 35039
# 1.1 Archuleta County, CO 8007 Conejos County, CO 8021
# 2 Baca County, CO 8009 Rio Arriba County, NM 40025
# 3 Cheyenne County, CO 8017 Kiowa County, CO 8061
编辑
根据您的编辑,您可以使用 `merge` 和 `append` 来实现。
# Join the county names onto the pair table via the first FIPS column, then
# put the columns in display order: name, code, distance, neighbor code.
m1 <- merge(df1a, df2a, by.x = "fipscounty", by.y = "county1")
m1 <- m1[c("countyname", "fipscounty", "distance", "county2")]
# Look up each neighbor's name from its FIPS code.
neighbor <- df1a$countyname[match(m1$county2, df1a$fipscounty)]
# Insert the neighbor name after the third column; append() returns a list,
# so convert back to a data frame.
m1 |>
  append(list(countyneighbor = neighbor), after = 3) |>
  as.data.frame()
# countyname fipscounty distance countyneighbor county2
# 1 Archuleta County, CO 8007 4 Cimarron County, OK 35039
# 2 Archuleta County, CO 8007 3 Conejos County, CO 8021
# 3 Baca County, CO 8009 2 Rio Arriba County, NM 40025
# 4 Cheyenne County, CO 8017 1 Kiowa County, CO 8061
注意:以上代码使用了原生管道 `|>`,需要 R >= 4.1。
数据:
# County lookup table (name + FIPS code) used by the `match` answer.
df1 <- data.frame(
  countyname = c(
    "Archuleta County, CO", "Baca County, CO", "Cheyenne County, CO",
    "Kiowa County, CO", "Cimarron County, OK", "Rio Arriba County, NM",
    "Conejos County, CO"
  ),
  fipscounty = c(8007, 8009, 8017, 8061, 35039, 40025, 8021),
  stringsAsFactors = FALSE
)

# Pair table without the distance column.
df2 <- data.frame(
  county1 = c(8007, 8007, 8009, 8017),
  county2 = c(35039, 8021, 40025, 8061)
)

# Same lookup table under the name used by the `merge` answer.
df1a <- data.frame(
  countyname = c(
    "Archuleta County, CO", "Baca County, CO", "Cheyenne County, CO",
    "Kiowa County, CO", "Cimarron County, OK", "Rio Arriba County, NM",
    "Conejos County, CO"
  ),
  fipscounty = c(8007, 8009, 8017, 8061, 35039, 40025, 8021),
  stringsAsFactors = FALSE
)

# Pair table including the distance column.
df2a <- data.frame(
  county1 = c(8007, 8007, 8009, 8017),
  distance = c(4, 3, 2, 1),
  county2 = c(35039, 8021, 40025, 8061)
)