创建一个新列,该列采用 R 中另一列的初始值
creating a new column that takes initial values of another column in R
我正在处理横截面数据集。我想创建一个名为“initial”的新列,它将包含另一列的初始值。进一步来说,
对于每个国家,初始列在数据可用的第一年采用另一列称为“比率”的值,其余所有年份的值都为 0。
示例代码:
country <- c(rep(c("A","B","C","D"),each=5))
year <- c(1980:1984, 1980: 1984, 1980:1984, 1980:1984)
ratio <- runif(n = 20, min = 0.20, max = 0.40)
mydata <- data.frame(country, year, ratio)
mydata$ratio[[1]] <- NA
mydata$ratio[6:7] <- NA
mydata$ratio[16:18] <- NA
我想要获得的输出如下所示:
有没有办法在 R 中最好使用 dplyr 包来做到这一点?
非常感谢!
这可能会有所改进,但我得到了您预期的输出:
library(dplyr)
mydata %>%
group_by(country) %>%
filter(!is.na(ratio)) %>%
filter(year == min(year)) %>%
rename(initial = ratio) %>%
full_join(., mydata) %>%
mutate(initial = ifelse(is.na(initial), 0, initial)) %>%
arrange(country, year) %>%
relocate(initial, .after = last_col())
输出:
country year ratio initial
<chr> <int> <dbl> <dbl>
1 A 1980 NA 0
2 A 1981 0.341 0.341
3 A 1982 0.330 0
4 A 1983 0.219 0
5 A 1984 0.269 0
6 B 1980 NA 0
7 B 1981 NA 0
8 B 1982 0.365 0.365
9 B 1983 0.210 0
10 B 1984 0.334 0
11 C 1980 0.284 0.284
12 C 1981 0.251 0
13 C 1982 0.358 0
14 C 1983 0.288 0
15 C 1984 0.261 0
16 D 1980 NA 0
17 D 1981 NA 0
18 D 1982 NA 0
19 D 1983 0.252 0.252
20 D 1984 0.301 0
利用 dplyr::first
你可以做:
library(dplyr)
mydata %>%
group_by(country) %>%
mutate(initial = first(ratio[!is.na(ratio)]),
initial = ifelse(is.na(ratio) | ratio != initial, 0, initial)) %>%
ungroup()
#> # A tibble: 20 × 4
#> country year ratio initial
#> <chr> <int> <dbl> <dbl>
#> 1 A 1980 NA 0
#> 2 A 1981 0.387 0.387
#> 3 A 1982 0.257 0
#> 4 A 1983 0.366 0
#> 5 A 1984 0.328 0
#> 6 B 1980 NA 0
#> 7 B 1981 NA 0
#> 8 B 1982 0.227 0.227
#> 9 B 1983 0.331 0
#> 10 B 1984 0.341 0
#> 11 C 1980 0.292 0.292
#> 12 C 1981 0.344 0
#> 13 C 1982 0.387 0
#> 14 C 1983 0.251 0
#> 15 C 1984 0.292 0
#> 16 D 1980 NA 0
#> 17 D 1981 NA 0
#> 18 D 1982 NA 0
#> 19 D 1983 0.295 0.295
#> 20 D 1984 0.312 0
数据
set.seed(42)
country <- c(rep(c("A","B","C","D"),each=5))
year <- c(1980:1984, 1980: 1984, 1980:1984, 1980:1984)
ratio <- runif(n = 20, min = 0.20, max = 0.40)
mydata <- data.frame(country, year, ratio)
mydata$ratio[[1]] <- NA
mydata$ratio[6:7] <- NA
mydata$ratio[16:18] <- NA
library(tidyverse)
mydata2 <- mydata %>%
group_by(country) %>%
filter(!is.na(ratio)) %>%
mutate(year_rank = rank(year)) %>%
mutate(initial = if_else(year_rank == 1, ratio, 0)) %>%
right_join(., mydata, by = c('country', 'year', 'ratio')) %>%
replace_na(list(initial = '0')) %>%
arrange(country, year) %>%
select(-year_rank)
您可以使用 match
获取组中第一个非 NA 值的索引。
library(dplyr)
mydata %>%
group_by(country) %>%
mutate(initial = replace(ratio, -match(TRUE, !is.na(ratio)), 0)) %>%
ungroup
# country year ratio initial
# <chr> <int> <dbl> <dbl>
# 1 A 1980 NA 0
# 2 A 1981 0.268 0.268
# 3 A 1982 0.302 0
# 4 A 1983 0.263 0
# 5 A 1984 0.222 0
# 6 B 1980 NA 0
# 7 B 1981 NA 0
# 8 B 1982 0.397 0.397
# 9 B 1983 0.265 0
#10 B 1984 0.249 0
#11 C 1980 0.302 0.302
#12 C 1981 0.219 0
#13 C 1982 0.339 0
#14 C 1983 0.228 0
#15 C 1984 0.393 0
#16 D 1980 NA 0
#17 D 1981 NA 0
#18 D 1982 NA 0
#19 D 1983 0.303 0.303
#20 D 1984 0.218 0
我正在处理横截面数据集。我想创建一个名为“initial”的新列,它将包含另一列的初始值。进一步来说, 对于每个国家,初始列在数据可用的第一年采用另一列称为“比率”的值,其余所有年份的值都为 0。 示例代码:
country <- c(rep(c("A","B","C","D"),each=5))
year <- c(1980:1984, 1980: 1984, 1980:1984, 1980:1984)
ratio <- runif(n = 20, min = 0.20, max = 0.40)
mydata <- data.frame(country, year, ratio)
mydata$ratio[[1]] <- NA
mydata$ratio[6:7] <- NA
mydata$ratio[16:18] <- NA
我想要获得的输出如下所示:
有没有办法在 R 中最好使用 dplyr 包来做到这一点?
非常感谢!
这可能会有所改进,但我得到了您预期的输出:
library(dplyr)
mydata %>%
group_by(country) %>%
filter(!is.na(ratio)) %>%
filter(year == min(year)) %>%
rename(initial = ratio) %>%
full_join(., mydata) %>%
mutate(initial = ifelse(is.na(initial), 0, initial)) %>%
arrange(country, year) %>%
relocate(initial, .after = last_col())
输出:
country year ratio initial
<chr> <int> <dbl> <dbl>
1 A 1980 NA 0
2 A 1981 0.341 0.341
3 A 1982 0.330 0
4 A 1983 0.219 0
5 A 1984 0.269 0
6 B 1980 NA 0
7 B 1981 NA 0
8 B 1982 0.365 0.365
9 B 1983 0.210 0
10 B 1984 0.334 0
11 C 1980 0.284 0.284
12 C 1981 0.251 0
13 C 1982 0.358 0
14 C 1983 0.288 0
15 C 1984 0.261 0
16 D 1980 NA 0
17 D 1981 NA 0
18 D 1982 NA 0
19 D 1983 0.252 0.252
20 D 1984 0.301 0
利用 dplyr::first
你可以做:
library(dplyr)
mydata %>%
group_by(country) %>%
mutate(initial = first(ratio[!is.na(ratio)]),
initial = ifelse(is.na(ratio) | ratio != initial, 0, initial)) %>%
ungroup()
#> # A tibble: 20 × 4
#> country year ratio initial
#> <chr> <int> <dbl> <dbl>
#> 1 A 1980 NA 0
#> 2 A 1981 0.387 0.387
#> 3 A 1982 0.257 0
#> 4 A 1983 0.366 0
#> 5 A 1984 0.328 0
#> 6 B 1980 NA 0
#> 7 B 1981 NA 0
#> 8 B 1982 0.227 0.227
#> 9 B 1983 0.331 0
#> 10 B 1984 0.341 0
#> 11 C 1980 0.292 0.292
#> 12 C 1981 0.344 0
#> 13 C 1982 0.387 0
#> 14 C 1983 0.251 0
#> 15 C 1984 0.292 0
#> 16 D 1980 NA 0
#> 17 D 1981 NA 0
#> 18 D 1982 NA 0
#> 19 D 1983 0.295 0.295
#> 20 D 1984 0.312 0
数据
set.seed(42)
country <- c(rep(c("A","B","C","D"),each=5))
year <- c(1980:1984, 1980: 1984, 1980:1984, 1980:1984)
ratio <- runif(n = 20, min = 0.20, max = 0.40)
mydata <- data.frame(country, year, ratio)
mydata$ratio[[1]] <- NA
mydata$ratio[6:7] <- NA
mydata$ratio[16:18] <- NA
library(tidyverse)
mydata2 <- mydata %>%
group_by(country) %>%
filter(!is.na(ratio)) %>%
mutate(year_rank = rank(year)) %>%
mutate(initial = if_else(year_rank == 1, ratio, 0)) %>%
right_join(., mydata, by = c('country', 'year', 'ratio')) %>%
replace_na(list(initial = '0')) %>%
arrange(country, year) %>%
select(-year_rank)
您可以使用 match
获取组中第一个非 NA 值的索引。
library(dplyr)
mydata %>%
group_by(country) %>%
mutate(initial = replace(ratio, -match(TRUE, !is.na(ratio)), 0)) %>%
ungroup
# country year ratio initial
# <chr> <int> <dbl> <dbl>
# 1 A 1980 NA 0
# 2 A 1981 0.268 0.268
# 3 A 1982 0.302 0
# 4 A 1983 0.263 0
# 5 A 1984 0.222 0
# 6 B 1980 NA 0
# 7 B 1981 NA 0
# 8 B 1982 0.397 0.397
# 9 B 1983 0.265 0
#10 B 1984 0.249 0
#11 C 1980 0.302 0.302
#12 C 1981 0.219 0
#13 C 1982 0.339 0
#14 C 1983 0.228 0
#15 C 1984 0.393 0
#16 D 1980 NA 0
#17 D 1981 NA 0
#18 D 1982 NA 0
#19 D 1983 0.303 0.303
#20 D 1984 0.218 0