创建一个新列,该列采用 R 中另一列的初始值

creating a new column that takes initial values of another column in R

我正在处理一个横截面数据集。我想创建一个名为“initial”的新列，用来保存另一列的初始值。具体来说，对于每个国家，“initial”列在有数据的第一年取另一列“ratio”的值，在其余所有年份取 0。示例代码：

# Toy panel: four countries (A-D), each observed yearly from 1980 to 1984.
country <- rep(c("A", "B", "C", "D"), each = 5)
year <- rep(1980:1984, times = 4)
# No seed is set here, so the ratio values differ from run to run.
ratio <- runif(n = 20, min = 0.20, max = 0.40)
mydata <- data.frame(country, year, ratio)
# Blank out the leading observations of A (row 1), B (rows 6-7) and
# D (rows 16-18) so the first year with data differs between countries.
mydata$ratio[c(1, 6:7, 16:18)] <- NA

我想要获得的输出如下所示:

有没有办法在 R 中做到这一点（最好使用 dplyr 包）？

非常感谢!

这段代码或许还有改进的空间，但它能得到您预期的输出：

library(dplyr)

# Build `initial`: for each country, the ratio of the first year that has
# data; 0 in every other year.
mydata %>%
  group_by(country) %>%
  # Keep only the earliest year with a non-missing ratio in each country.
  filter(!is.na(ratio)) %>%
  filter(year == min(year)) %>%
  ungroup() %>%
  rename(initial = ratio) %>%
  # Join back to the full data. The keys are spelled out so the join does
  # not depend on (or print a message about) implicitly detected columns.
  full_join(mydata, by = c("country", "year")) %>%
  # Rows that were not the first available year got NA from the join.
  mutate(initial = coalesce(initial, 0)) %>%
  arrange(country, year) %>%
  relocate(initial, .after = last_col())

输出:

  country  year  ratio initial
   <chr>   <int>  <dbl>   <dbl>
 1 A        1980 NA       0    
 2 A        1981  0.341   0.341
 3 A        1982  0.330   0    
 4 A        1983  0.219   0    
 5 A        1984  0.269   0    
 6 B        1980 NA       0    
 7 B        1981 NA       0    
 8 B        1982  0.365   0.365
 9 B        1983  0.210   0    
10 B        1984  0.334   0    
11 C        1980  0.284   0.284
12 C        1981  0.251   0    
13 C        1982  0.358   0    
14 C        1983  0.288   0    
15 C        1984  0.261   0    
16 D        1980 NA       0    
17 D        1981 NA       0    
18 D        1982 NA       0    
19 D        1983  0.252   0.252
20 D        1984  0.301   0   

利用 dplyr::first 你可以做:


library(dplyr)

# Build `initial`: for each country, the ratio of the first row that has
# data; 0 everywhere else.
mydata %>%
  group_by(country) %>%
  mutate(
    # Identify the initial row by its position, not by its value: comparing
    # `ratio` against the first value would also flag any later year that
    # happens to have the same ratio.
    # `first()` of an empty vector is NA, so a country with no data at all
    # falls through to `missing = 0`.
    initial = if_else(
      row_number() == first(which(!is.na(ratio))),
      ratio,
      0,
      missing = 0
    )
  ) %>%
  ungroup()
#> # A tibble: 20 × 4
#>    country  year  ratio initial
#>    <chr>   <int>  <dbl>   <dbl>
#>  1 A        1980 NA       0    
#>  2 A        1981  0.387   0.387
#>  3 A        1982  0.257   0    
#>  4 A        1983  0.366   0    
#>  5 A        1984  0.328   0    
#>  6 B        1980 NA       0    
#>  7 B        1981 NA       0    
#>  8 B        1982  0.227   0.227
#>  9 B        1983  0.331   0    
#> 10 B        1984  0.341   0    
#> 11 C        1980  0.292   0.292
#> 12 C        1981  0.344   0    
#> 13 C        1982  0.387   0    
#> 14 C        1983  0.251   0    
#> 15 C        1984  0.292   0    
#> 16 D        1980 NA       0    
#> 17 D        1981 NA       0    
#> 18 D        1982 NA       0    
#> 19 D        1983  0.295   0.295
#> 20 D        1984  0.312   0

数据

# Reproducible version of the sample data: the RNG seed is fixed.
set.seed(42)

country <- rep(LETTERS[1:4], each = 5)
year <- rep(1980:1984, times = 4)
ratio <- runif(n = 20, min = 0.20, max = 0.40)
mydata <- data.frame(country, year, ratio)
# Set the leading observations of A (row 1), B (rows 6-7) and D (rows 16-18)
# to missing so the first year with data differs between countries.
is.na(mydata$ratio) <- c(1, 6:7, 16:18)
library(tidyverse)

# Build `initial`: for each country, the ratio of the first year that has
# data; 0 in every other year.
mydata2 <- mydata %>%
  group_by(country) %>%
  # Rank years among the rows that actually have a ratio.
  filter(!is.na(ratio)) %>%
  mutate(year_rank = rank(year)) %>%
  mutate(initial = if_else(year_rank == 1, ratio, 0)) %>%
  # Bring back the rows dropped by the filter; they get NA in `initial`.
  right_join(mydata, by = c("country", "year", "ratio")) %>%
  # The replacement must be numeric: a character "0" does not match the
  # double column (type error in current tidyr, silent coercion to
  # character in older versions).
  replace_na(list(initial = 0)) %>%
  arrange(country, year) %>%
  select(-year_rank) %>%
  # Do not leave the stored result grouped by country.
  ungroup()

您可以使用 match 获取组中第一个非 NA 值的索引。

library(dplyr)

# Build `initial`: for each country, the ratio of the first row that has
# data; 0 everywhere else.
mydata %>%
  group_by(country) %>%
  mutate(
    # `match()` gives the position of the first non-NA ratio in the group.
    # It is NA when the country has no data at all; `missing = 0` then
    # yields 0 for every row instead of leaving `initial` as NA.
    initial = if_else(
      row_number() == match(TRUE, !is.na(ratio)),
      ratio,
      0,
      missing = 0
    )
  ) %>%
  ungroup()

#  country  year  ratio initial
#   <chr>   <int>  <dbl>   <dbl>
# 1 A        1980 NA       0    
# 2 A        1981  0.268   0.268
# 3 A        1982  0.302   0    
# 4 A        1983  0.263   0    
# 5 A        1984  0.222   0    
# 6 B        1980 NA       0    
# 7 B        1981 NA       0    
# 8 B        1982  0.397   0.397
# 9 B        1983  0.265   0    
#10 B        1984  0.249   0    
#11 C        1980  0.302   0.302
#12 C        1981  0.219   0    
#13 C        1982  0.339   0    
#14 C        1983  0.228   0    
#15 C        1984  0.393   0    
#16 D        1980 NA       0    
#17 D        1981 NA       0    
#18 D        1982 NA       0    
#19 D        1983  0.303   0.303
#20 D        1984  0.218   0