Row_number() 结合数据集中的缺失数据 (NA)
Row_number() combined with missing data (NA) in dataset
我有以下 df:
# Example data: one row per person and year, with the car recorded for that
# year. Person "C" has no 2010 row.
car <- data.frame(
  year = c(
    2010, 2011, 2012, 2013, 2014, 2015,
    2010, 2011, 2012, 2013, 2014, 2015,
    2011, 2012, 2013, 2014, 2015
  ),
  person = rep(c("A", "B", "C"), times = c(6, 6, 5)),
  car = c(
    "BMW", "BMW", "AUDI", "AUDI", "AUDI", "Mercedes",
    "Citroen", "Citroen", "Citroen", "Toyota", "Toyota", "Peugeot",
    "Volkswagen", "Volkswagen", "Mercedes", "Mercedes", "Tesla"
  ),
  stringsAsFactors = FALSE
)
我正在尝试重现这个(即创建 how_long_does_the_person_have_the_car 变量)
# Target data: the same records plus the desired ownership counter.
# The counter is a character column because it mixes numbers with the
# "0/NA" placeholder, so every value is written as a string here.
car <- data.frame(
  year = c(
    2010, 2011, 2012, 2013, 2014, 2015,
    2010, 2011, 2012, 2013, 2014, 2015,
    2011, 2012, 2013, 2014, 2015
  ),
  person = rep(c("A", "B", "C"), times = c(6, 6, 5)),
  car = c(
    "BMW", "BMW", "AUDI", "AUDI", "AUDI", "Mercedes",
    "Citroen", "Citroen", "Citroen", "Toyota", "Toyota", "Peugeot",
    "Volkswagen", "Volkswagen", "Mercedes", "Mercedes", "Tesla"
  ),
  how_long_does_the_person_have_the_car = c(
    "1", "2", "1", "2", "3", "1",
    "1", "2", "3", "1", "2", "1",
    "0/NA", "0/NA", "1", "2", "1"
  ),
  stringsAsFactors = FALSE
)
# Desired result: "0/NA" marks the rows whose ownership length is unknown.
tibble::tribble(
  ~year, ~person, ~car, ~how_long_does_the_person_have_the_car,
  2010, "A", "BMW", "1",
  2011, "A", "BMW", "2",
  2012, "A", "AUDI", "1",
  2013, "A", "AUDI", "2",
  2014, "A", "AUDI", "3",
  2015, "A", "Mercedes", "1",
  2010, "B", "Citroen", "1",
  2011, "B", "Citroen", "2",
  2012, "B", "Citroen", "3",
  2013, "B", "Toyota", "1",
  2014, "B", "Toyota", "2",
  2015, "B", "Peugeot", "1",
  2011, "C", "Volkswagen", "0/NA",
  2012, "C", "Volkswagen", "0/NA",
  2013, "C", "Mercedes", "1",
  2014, "C", "Mercedes", "2",
  2015, "C", "Tesla", "1"
)
how_long_does_the_person_have_the_car 变量是对 car 变量的累计计数(即此人已经拥有这辆车多少年),每当汽车或人发生变化时就重新从 1 开始计数。问题是当我使用下面这段代码时:
# Number the rows within each person/car combination, in row order.
car <- car |>
  group_by(person, car) |>
  mutate(
    how_long_does_the_person_have_the_car = row_number()
  )
我得到这个输出:
# Output of the grouped row_number() attempt: person C's Volkswagen rows
# are numbered 1 and 2.
tibble::tribble(
~year, ~person, ~car, ~how_long_does_the_person_have_the_car,
2010, "A", "BMW", 1,
2011, "A", "BMW", 2,
2012, "A", "AUDI", 1,
2013, "A", "AUDI", 2,
2014, "A", "AUDI", 3,
2015, "A", "Mercedes", 1,
2010, "B", "Citroen", 1,
2011, "B", "Citroen", 2,
2012, "B", "Citroen", 3,
2013, "B", "Toyota", 1,
2014, "B", "Toyota", 2,
2015, "B", "Peugeot", 1,
2011, "C", "Volkswagen", 1,
2012, "C", "Volkswagen", 2,
2013, "C", "Mercedes", 1,
2014, "C", "Mercedes", 2,
2015, "C", "Tesla", 1
)
但是,由于缺少 C 在 2010 年的数据,我希望 C 在 2011 年和 2012 年的值为 0、NA 或其他标记值。因为不知道 C 在 2010 年拥有哪辆车,我们无法判断在 2011/2012 年时 C 拥有这辆大众是 1 年和 2 年,还是 2 年和 3 年。到 2013 年,累计值可以重新从 1 开始计数,因为我们知道 C 在 2013 年买了一辆梅赛德斯。
我对你的问题有点困惑,因为你已经有了汽车拥有的累计年数,只是其中有些数据缺失。仅仅把这些缺失的数据转换为显式的 NA 还不够吗?
# Turn the "0/NA" placeholder into a real missing value.
is.na(car$how_long_does_the_person_have_the_car) <-
  car$how_long_does_the_person_have_the_car == "0/NA"
这样您就能得到所需的输出。如果确实需要重新累计或排名,可以按这个所有权变量进行排名(假设该变量已经存在),但这只会得到相同的结果:
# Assumes the ownership column already holds explicit NA values.
# Rank by that column within each person/car group.
car <- car |>
  group_by(person, car) |>
  mutate(
    how_long_does_the_person_have_the_car =
      row_number(how_long_does_the_person_have_the_car)
  )
这两个都给出相同的输出:
# A tibble: 17 × 4
# Groups: person, car [9]
year person car how_long_does_the_person_have_the_car
<dbl> <chr> <chr> <int>
1 2010 A BMW 1
2 2011 A BMW 2
3 2012 A AUDI 1
4 2013 A AUDI 2
5 2014 A AUDI 3
6 2015 A Mercedes 1
7 2010 B Citroen 1
8 2011 B Citroen 2
9 2012 B Citroen 3
10 2013 B Toyota 1
11 2014 B Toyota 2
12 2015 B Peugeot 1
13 2011 C Volkswagen NA
14 2012 C Volkswagen NA
15 2013 C Mercedes 1
16 2014 C Mercedes 2
17 2015 C Tesla 1
一种可能的解决方案是:先补全缺失的年份(补出的行的行号为 NA),再利用这些 NA 把所在分组的值整体替换为 NA,即
library(dplyr)
library(tidyr)
# Number rows per person/car, then add every missing year/person combination
# so that gaps show up as rows whose `res` is NA.
car |>
  group_by(person, car) |>
  mutate(res = row_number()) |>
  ungroup() |>
  complete(year, person) |>
  arrange(person, year, car) |>
  # Give each added row a car: fill upwards first, then downwards.
  fill(car, .direction = "updown") |>
  group_by(person, car) |>
  # Blank out the whole group when it contains an added (NA) row.
  mutate(res1 = if (anyNA(res)) NA_integer_ else res) |>
  # Drop the added rows again and remove the helper column.
  filter(!is.na(res)) |>
  select(-res)
# A tibble: 17 x 4
# Groups: person, car [9]
year person car res1
<dbl> <chr> <chr> <int>
1 2010 A BMW 1
2 2011 A BMW 2
3 2012 A AUDI 1
4 2013 A AUDI 2
5 2014 A AUDI 3
6 2015 A Mercedes 1
7 2010 B Citroen 1
8 2011 B Citroen 2
9 2012 B Citroen 3
10 2013 B Toyota 1
11 2014 B Toyota 2
12 2015 B Peugeot 1
13 2011 C Volkswagen NA
14 2012 C Volkswagen NA
15 2013 C Mercedes 1
16 2014 C Mercedes 2
17 2015 C Tesla 1
可能的选项:
library(tidyverse)
# Example data: one row per person and year, with the car recorded for that
# year. Person "C" has no 2010 row.
car <- data.frame(
  year = c(
    2010, 2011, 2012, 2013, 2014, 2015,
    2010, 2011, 2012, 2013, 2014, 2015,
    2011, 2012, 2013, 2014, 2015
  ),
  person = rep(c("A", "B", "C"), times = c(6, 6, 5)),
  car = c(
    "BMW", "BMW", "AUDI", "AUDI", "AUDI", "Mercedes",
    "Citroen", "Citroen", "Citroen", "Toyota", "Toyota", "Peugeot",
    "Volkswagen", "Volkswagen", "Mercedes", "Mercedes", "Tesla"
  ),
  stringsAsFactors = FALSE
)
# `min_year` is the earliest year in the whole data set and stays in the
# result. The result is left grouped by person.
car |>
  mutate(min_year = min(year)) |>
  group_by(person, car) |>
  mutate(how_long = seq_len(n())) |>
  group_by(person) |>
  mutate(
    # A person whose first record is later than `min_year` gets NA for
    # every row of their first car.
    how_long = if_else(
      car == first(car) & first(year) > min_year,
      NA_integer_,
      how_long
    )
  )
#> # A tibble: 17 × 5
#> # Groups: person [3]
#> year person car min_year how_long
#> <dbl> <chr> <chr> <dbl> <int>
#> 1 2010 A BMW 2010 1
#> 2 2011 A BMW 2010 2
#> 3 2012 A AUDI 2010 1
#> 4 2013 A AUDI 2010 2
#> 5 2014 A AUDI 2010 3
#> 6 2015 A Mercedes 2010 1
#> 7 2010 B Citroen 2010 1
#> 8 2011 B Citroen 2010 2
#> 9 2012 B Citroen 2010 3
#> 10 2013 B Toyota 2010 1
#> 11 2014 B Toyota 2010 2
#> 12 2015 B Peugeot 2010 1
#> 13 2011 C Volkswagen 2010 NA
#> 14 2012 C Volkswagen 2010 NA
#> 15 2013 C Mercedes 2010 1
#> 16 2014 C Mercedes 2010 2
#> 17 2015 C Tesla 2010 1
创建于 2022-06-02,由 reprex package (v2.0.1) 生成
我有以下 df:
# Example data: one row per person and year, with the car recorded for that
# year. Person "C" has no 2010 row.
car <- data.frame(
  year = c(
    2010, 2011, 2012, 2013, 2014, 2015,
    2010, 2011, 2012, 2013, 2014, 2015,
    2011, 2012, 2013, 2014, 2015
  ),
  person = rep(c("A", "B", "C"), times = c(6, 6, 5)),
  car = c(
    "BMW", "BMW", "AUDI", "AUDI", "AUDI", "Mercedes",
    "Citroen", "Citroen", "Citroen", "Toyota", "Toyota", "Peugeot",
    "Volkswagen", "Volkswagen", "Mercedes", "Mercedes", "Tesla"
  ),
  stringsAsFactors = FALSE
)
我正在尝试重现这个(即创建 how_long_does_the_person_have_the_car 变量)
# Target data: the same records plus the desired ownership counter.
# The counter is a character column because it mixes numbers with the
# "0/NA" placeholder, so every value is written as a string here.
car <- data.frame(
  year = c(
    2010, 2011, 2012, 2013, 2014, 2015,
    2010, 2011, 2012, 2013, 2014, 2015,
    2011, 2012, 2013, 2014, 2015
  ),
  person = rep(c("A", "B", "C"), times = c(6, 6, 5)),
  car = c(
    "BMW", "BMW", "AUDI", "AUDI", "AUDI", "Mercedes",
    "Citroen", "Citroen", "Citroen", "Toyota", "Toyota", "Peugeot",
    "Volkswagen", "Volkswagen", "Mercedes", "Mercedes", "Tesla"
  ),
  how_long_does_the_person_have_the_car = c(
    "1", "2", "1", "2", "3", "1",
    "1", "2", "3", "1", "2", "1",
    "0/NA", "0/NA", "1", "2", "1"
  ),
  stringsAsFactors = FALSE
)
# Desired result: "0/NA" marks the rows whose ownership length is unknown.
tibble::tribble(
  ~year, ~person, ~car, ~how_long_does_the_person_have_the_car,
  2010, "A", "BMW", "1",
  2011, "A", "BMW", "2",
  2012, "A", "AUDI", "1",
  2013, "A", "AUDI", "2",
  2014, "A", "AUDI", "3",
  2015, "A", "Mercedes", "1",
  2010, "B", "Citroen", "1",
  2011, "B", "Citroen", "2",
  2012, "B", "Citroen", "3",
  2013, "B", "Toyota", "1",
  2014, "B", "Toyota", "2",
  2015, "B", "Peugeot", "1",
  2011, "C", "Volkswagen", "0/NA",
  2012, "C", "Volkswagen", "0/NA",
  2013, "C", "Mercedes", "1",
  2014, "C", "Mercedes", "2",
  2015, "C", "Tesla", "1"
)
how_long_does_the_person_have_the_car 变量是对 car 变量的累计计数(即此人已经拥有这辆车多少年),每当汽车或人发生变化时就重新从 1 开始计数。问题是当我使用下面这段代码时:
# Number the rows within each person/car combination, in row order.
car <- car |>
  group_by(person, car) |>
  mutate(
    how_long_does_the_person_have_the_car = row_number()
  )
我得到这个输出:
# Output of the grouped row_number() attempt: person C's Volkswagen rows
# are numbered 1 and 2.
tibble::tribble(
~year, ~person, ~car, ~how_long_does_the_person_have_the_car,
2010, "A", "BMW", 1,
2011, "A", "BMW", 2,
2012, "A", "AUDI", 1,
2013, "A", "AUDI", 2,
2014, "A", "AUDI", 3,
2015, "A", "Mercedes", 1,
2010, "B", "Citroen", 1,
2011, "B", "Citroen", 2,
2012, "B", "Citroen", 3,
2013, "B", "Toyota", 1,
2014, "B", "Toyota", 2,
2015, "B", "Peugeot", 1,
2011, "C", "Volkswagen", 1,
2012, "C", "Volkswagen", 2,
2013, "C", "Mercedes", 1,
2014, "C", "Mercedes", 2,
2015, "C", "Tesla", 1
)
但是,由于缺少 C 在 2010 年的数据,我希望 C 在 2011 年和 2012 年的值为 0、NA 或其他标记值。因为不知道 C 在 2010 年拥有哪辆车,我们无法判断在 2011/2012 年时 C 拥有这辆大众是 1 年和 2 年,还是 2 年和 3 年。到 2013 年,累计值可以重新从 1 开始计数,因为我们知道 C 在 2013 年买了一辆梅赛德斯。
我对你的问题有点困惑,因为你已经有了汽车拥有的累计年数,只是其中有些数据缺失。仅仅把这些缺失的数据转换为显式的 NA 还不够吗?
# Turn the "0/NA" placeholder into a real missing value.
is.na(car$how_long_does_the_person_have_the_car) <-
  car$how_long_does_the_person_have_the_car == "0/NA"
这样您就能得到所需的输出。如果确实需要重新累计或排名,可以按这个所有权变量进行排名(假设该变量已经存在),但这只会得到相同的结果:
# Assumes the ownership column already holds explicit NA values.
# Rank by that column within each person/car group.
car <- car |>
  group_by(person, car) |>
  mutate(
    how_long_does_the_person_have_the_car =
      row_number(how_long_does_the_person_have_the_car)
  )
这两个都给出相同的输出:
# A tibble: 17 × 4
# Groups: person, car [9]
year person car how_long_does_the_person_have_the_car
<dbl> <chr> <chr> <int>
1 2010 A BMW 1
2 2011 A BMW 2
3 2012 A AUDI 1
4 2013 A AUDI 2
5 2014 A AUDI 3
6 2015 A Mercedes 1
7 2010 B Citroen 1
8 2011 B Citroen 2
9 2012 B Citroen 3
10 2013 B Toyota 1
11 2014 B Toyota 2
12 2015 B Peugeot 1
13 2011 C Volkswagen NA
14 2012 C Volkswagen NA
15 2013 C Mercedes 1
16 2014 C Mercedes 2
17 2015 C Tesla 1
一种可能的解决方案是:先补全缺失的年份(补出的行的行号为 NA),再利用这些 NA 把所在分组的值整体替换为 NA,即
library(dplyr)
library(tidyr)
# Number rows per person/car, then add every missing year/person combination
# so that gaps show up as rows whose `res` is NA.
car |>
  group_by(person, car) |>
  mutate(res = row_number()) |>
  ungroup() |>
  complete(year, person) |>
  arrange(person, year, car) |>
  # Give each added row a car: fill upwards first, then downwards.
  fill(car, .direction = "updown") |>
  group_by(person, car) |>
  # Blank out the whole group when it contains an added (NA) row.
  mutate(res1 = if (anyNA(res)) NA_integer_ else res) |>
  # Drop the added rows again and remove the helper column.
  filter(!is.na(res)) |>
  select(-res)
# A tibble: 17 x 4
# Groups: person, car [9]
year person car res1
<dbl> <chr> <chr> <int>
1 2010 A BMW 1
2 2011 A BMW 2
3 2012 A AUDI 1
4 2013 A AUDI 2
5 2014 A AUDI 3
6 2015 A Mercedes 1
7 2010 B Citroen 1
8 2011 B Citroen 2
9 2012 B Citroen 3
10 2013 B Toyota 1
11 2014 B Toyota 2
12 2015 B Peugeot 1
13 2011 C Volkswagen NA
14 2012 C Volkswagen NA
15 2013 C Mercedes 1
16 2014 C Mercedes 2
17 2015 C Tesla 1
可能的选项:
library(tidyverse)
# Example data: one row per person and year, with the car recorded for that
# year. Person "C" has no 2010 row.
car <- data.frame(
  year = c(
    2010, 2011, 2012, 2013, 2014, 2015,
    2010, 2011, 2012, 2013, 2014, 2015,
    2011, 2012, 2013, 2014, 2015
  ),
  person = rep(c("A", "B", "C"), times = c(6, 6, 5)),
  car = c(
    "BMW", "BMW", "AUDI", "AUDI", "AUDI", "Mercedes",
    "Citroen", "Citroen", "Citroen", "Toyota", "Toyota", "Peugeot",
    "Volkswagen", "Volkswagen", "Mercedes", "Mercedes", "Tesla"
  ),
  stringsAsFactors = FALSE
)
# `min_year` is the earliest year in the whole data set and stays in the
# result. The result is left grouped by person.
car |>
  mutate(min_year = min(year)) |>
  group_by(person, car) |>
  mutate(how_long = seq_len(n())) |>
  group_by(person) |>
  mutate(
    # A person whose first record is later than `min_year` gets NA for
    # every row of their first car.
    how_long = if_else(
      car == first(car) & first(year) > min_year,
      NA_integer_,
      how_long
    )
  )
#> # A tibble: 17 × 5
#> # Groups: person [3]
#> year person car min_year how_long
#> <dbl> <chr> <chr> <dbl> <int>
#> 1 2010 A BMW 2010 1
#> 2 2011 A BMW 2010 2
#> 3 2012 A AUDI 2010 1
#> 4 2013 A AUDI 2010 2
#> 5 2014 A AUDI 2010 3
#> 6 2015 A Mercedes 2010 1
#> 7 2010 B Citroen 2010 1
#> 8 2011 B Citroen 2010 2
#> 9 2012 B Citroen 2010 3
#> 10 2013 B Toyota 2010 1
#> 11 2014 B Toyota 2010 2
#> 12 2015 B Peugeot 2010 1
#> 13 2011 C Volkswagen 2010 NA
#> 14 2012 C Volkswagen 2010 NA
#> 15 2013 C Mercedes 2010 1
#> 16 2014 C Mercedes 2010 2
#> 17 2015 C Tesla 2010 1
创建于 2022-06-02,由 reprex package (v2.0.1) 生成