如果信息在另一行中可用,则填充列
Filling column if information is available in another row
我有如下数据:
dat <- structure(list(ZIP_source1 = c(1026, 1026, 1026, 1026, 1026,
1026, 1026, 1026, 1026, 1026, 1017, 1012, 1012), ZIP_source2 = c(1026,
1026, 1026, 1026, 1026, 1026, NA, NA, NA, NA, NA, 1012, 1012),
Category_source2 = c(4, 4, 4, 4, 4, 4, NA, NA, NA, NA, NA, 4, 4)), class = c("data.table",
"data.frame"), row.names = c(NA, -13L))
dat
ZIP_source1 ZIP_source2 Category_source2
1: 1016 1016 4
2: 1016 1016 4
3: 1016 1016 4
4: 1016 1016 4
5: 1016 1016 4
6: 1016 1016 4
7: 1016 NA NA
8: 1016 NA NA
9: 1016 NA NA
10: 1016 NA NA
11: 1027 NA NA
12: 1022 1022 4
13: 1022 1022 4
对于第 7 到 10 行,我从来源 1 知道邮政编码是什么。从来源 2 我知道这个邮政编码
属于第 4 类。执行此操作的最佳方法是什么?
期望的输出:
ZIP_source1 ZIP_source2 Category_source2
1: 1016 1016 4
2: 1016 1016 4
3: 1016 1016 4
4: 1016 1016 4
5: 1016 1016 4
6: 1016 1016 4
7: 1016 NA 4
8: 1016 NA 4
9: 1016 NA 4
10: 1016 NA 4
11: 1027 NA NA
12: 1022 1022 4
13: 1022 1022 4
我们可以使用fill
library(dplyr)
library(tidyr)
dat %>%
group_by(ZIP_source1) %>%
fill(Category_source2, .direction = "downup")
或使用nafill
library(data.table)
dat[, Category_source2 := nafill(nafill(Category_source2,
type = "locf"), type = "nocb"), ZIP_source1]
-输出
> dat
ZIP_source1 ZIP_source2 Category_source2
<num> <num> <num>
1: 1026 1026 4
2: 1026 1026 4
3: 1026 1026 4
4: 1026 1026 4
5: 1026 1026 4
6: 1026 1026 4
7: 1026 NA 4
8: 1026 NA 4
9: 1026 NA 4
10: 1026 NA 4
11: 1017 NA NA
12: 1012 1012 4
13: 1012 1012 4
我更愿意创建新的列来执行此操作,我将其称为 zip
和 category
,但如果需要,可以直接覆盖原始列。
# Get all zips where not NA in one column
dat <- dat %>%
mutate(
zip = coalesce(ZIP_source1, ZIP_source2)
)
# Create table of all categories
category_table <- dat %>%
select(Category_source2, zip) %>%
drop_na() %>%
group_by(zip) %>%
distinct() %>%
rename(category = Category_source2)
category_table
# category zip
# <dbl> <dbl>
# 1 4 1026
# 2 4 1012
# Join as new column
left_join(dat, category_table, by = "zip")
# left_join(dat, category_table, by = "zip")
# ZIP_source1 ZIP_source2 Category_source2 zip category
# 1 1026 1026 4 1026 4
# 2 1026 1026 4 1026 4
# 3 1026 1026 4 1026 4
# 4 1026 1026 4 1026 4
# 5 1026 1026 4 1026 4
# 6 1026 1026 4 1026 4
# 7 1026 NA NA 1026 4
# 8 1026 NA NA 1026 4
# 9 1026 NA NA 1026 4
# 10 1026 NA NA 1026 4
# 11 1017 NA NA 1017 NA
# 12 1012 1012 4 1012 4
# 13 1012 1012 4 1012 4
我有如下数据:
dat <- structure(list(ZIP_source1 = c(1026, 1026, 1026, 1026, 1026,
1026, 1026, 1026, 1026, 1026, 1017, 1012, 1012), ZIP_source2 = c(1026,
1026, 1026, 1026, 1026, 1026, NA, NA, NA, NA, NA, 1012, 1012),
Category_source2 = c(4, 4, 4, 4, 4, 4, NA, NA, NA, NA, NA, 4, 4)), class = c("data.table",
"data.frame"), row.names = c(NA, -13L))
dat
ZIP_source1 ZIP_source2 Category_source2
1: 1016 1016 4
2: 1016 1016 4
3: 1016 1016 4
4: 1016 1016 4
5: 1016 1016 4
6: 1016 1016 4
7: 1016 NA NA
8: 1016 NA NA
9: 1016 NA NA
10: 1016 NA NA
11: 1027 NA NA
12: 1022 1022 4
13: 1022 1022 4
对于第 7 到 10 行,我从来源 1 知道邮政编码是什么。从来源 2 我知道这个邮政编码 属于第 4 类。执行此操作的最佳方法是什么?
期望的输出:
ZIP_source1 ZIP_source2 Category_source2
1: 1016 1016 4
2: 1016 1016 4
3: 1016 1016 4
4: 1016 1016 4
5: 1016 1016 4
6: 1016 1016 4
7: 1016 NA 4
8: 1016 NA 4
9: 1016 NA 4
10: 1016 NA 4
11: 1027 NA NA
12: 1022 1022 4
13: 1022 1022 4
我们可以使用fill
library(dplyr)
library(tidyr)
dat %>%
group_by(ZIP_source1) %>%
fill(Category_source2, .direction = "downup")
或使用nafill
library(data.table)
dat[, Category_source2 := nafill(nafill(Category_source2,
type = "locf"), type = "nocb"), ZIP_source1]
-输出
> dat
ZIP_source1 ZIP_source2 Category_source2
<num> <num> <num>
1: 1026 1026 4
2: 1026 1026 4
3: 1026 1026 4
4: 1026 1026 4
5: 1026 1026 4
6: 1026 1026 4
7: 1026 NA 4
8: 1026 NA 4
9: 1026 NA 4
10: 1026 NA 4
11: 1017 NA NA
12: 1012 1012 4
13: 1012 1012 4
我更愿意创建新的列来执行此操作,我将其称为 zip
和 category
,但如果需要,可以直接覆盖原始列。
# Get all zips where not NA in one column
dat <- dat %>%
mutate(
zip = coalesce(ZIP_source1, ZIP_source2)
)
# Create table of all categories
category_table <- dat %>%
select(Category_source2, zip) %>%
drop_na() %>%
group_by(zip) %>%
distinct() %>%
rename(category = Category_source2)
category_table
# category zip
# <dbl> <dbl>
# 1 4 1026
# 2 4 1012
# Join as new column
left_join(dat, category_table, by = "zip")
# left_join(dat, category_table, by = "zip")
# ZIP_source1 ZIP_source2 Category_source2 zip category
# 1 1026 1026 4 1026 4
# 2 1026 1026 4 1026 4
# 3 1026 1026 4 1026 4
# 4 1026 1026 4 1026 4
# 5 1026 1026 4 1026 4
# 6 1026 1026 4 1026 4
# 7 1026 NA NA 1026 4
# 8 1026 NA NA 1026 4
# 9 1026 NA NA 1026 4
# 10 1026 NA NA 1026 4
# 11 1017 NA NA 1017 NA
# 12 1012 1012 4 1012 4
# 13 1012 1012 4 1012 4