在 R 中填充数据框中的行
Fill up rows in a dataframe in R
我有一个数据框,其中包含客户从 5 月 (202105) 到 2021 年 10 月 (202110) 的购买信息。客户不是每个月都购买,但我想用零填充该信息。我的数据如下所示:
soex <- data.frame(client_id = c("aaa","bbb","bbb","ccc","ccc","ddd","eee","eee","eee"),
v1 = c("xxx","xxx","xxx","yyy","yyy","xxx","yyy","xxx","yyy"),
first_buy = c("202105","202107","202107","202106","202106","202110","202107","202107","202107"),
sales_date = c("202105","202107","202109","202106","202110","202110","202107","202108","202109"),
qt_prod1 = c(10,60,30,2,45,11,14,167,145),
qt_prod2 = c(12,324,433,221,312,312,312,123,121))
client_id v1 first_buy sales_date qt_prod1 qt_prod2
1 aaa xxx 202105 202105 10 12
2 bbb xxx 202107 202107 60 324
3 bbb xxx 202107 202109 30 433
4 ccc yyy 202106 202106 2 221
5 ccc yyy 202106 202110 45 312
6 ddd xxx 202110 202110 11 312
7 eee yyy 202107 202107 14 312
8 eee xxx 202107 202108 167 123
9 eee yyy 202107 202109 145 121
- client_id = 客户编号
- v1 = 随机变量
- first_buy = 首次购买的年月
- sales_date = 购买的年份和月份。第一个始终与 first_buy
相同
- qi_prod1(和 2)= 购买的产品数量
我需要的是一个如下所示的数据框:
ideal <- data.frame(client_id = c("aaa","aaa","aaa","aaa","aaa","aaa","bbb","bbb","bbb","bbb","ccc","ccc","ccc","ccc","ccc","ddd","eee","eee","eee","eee"),
v1 = c("xxx","xxx","xxx","xxx","xxx","xxx","xxx","xxx","xxx","xxx","yyy","yyy","yyy","yyy","yyy","xxx","yyy","xxx","yyy","yyy"),
first_buy = c("202105","202105","202105","202105","202105","202105","202107","202107","202107","202107","202106","202106","202106","202106","202106","202110","202107","202107","202107","202107"),
sales_date = c("202105","202106","202107","202108","202109","202110","202107","202108","202109","202110","202106","202107","202108","202109","202110","202110","202107","202108","202109","202110"),
qt_prod1 = c(10,0,0,0,0,0,60,0,0,30,2,0,0,0,45,11,14,167,145,0),
qt_prod2 = c(12,0,0,0,0,0,324,0,0,433,221,0,0,0,312,312,312,123,121,0))
client_id v1 first_buy sales_date qt_prod1 qt_prod2
1 aaa xxx 202105 202105 10 12
2 aaa xxx 202105 202106 0 0
3 aaa xxx 202105 202107 0 0
4 aaa xxx 202105 202108 0 0
5 aaa xxx 202105 202109 0 0
6 aaa xxx 202105 202110 0 0
7 bbb xxx 202107 202107 60 324
8 bbb xxx 202107 202108 0 0
9 bbb xxx 202107 202109 0 0
10 bbb xxx 202107 202110 30 433
11 ccc yyy 202106 202106 2 221
12 ccc yyy 202106 202107 0 0
13 ccc yyy 202106 202108 0 0
14 ccc yyy 202106 202109 0 0
15 ccc yyy 202106 202110 45 312
16 ddd xxx 202110 202110 11 312
17 eee yyy 202107 202107 14 312
18 eee xxx 202107 202108 167 123
19 eee yyy 202107 202109 145 121
20 eee yyy 202107 202110 0 0
当我不得不考虑 first_buy 变量时,我的问题就来了。正如您在“bbb”客户端中看到的那样,我不希望数据从 2021 年 5 月开始……我希望它从 first_buy 月开始,直到 202110。
我的另一个问题是用sales_date的信息填充V1变量的信息。如果你看,比如“eee”客户的案例,她在202110没有购买,但是在V1她有202109的信息
谢谢,
一个tidyverse
选项-
- 将
sales_date
更改为日期对象
- 为每个
client_id
创建一个每月的日期序列
fill
first_buy
和 v1
列。
library(dplyr)
library(tidyr)
library(lubridate)
soex %>%
mutate(sales_date = ymd(paste0(sales_date, '01'))) %>%
group_by(client_id) %>%
complete(sales_date = seq(min(sales_date), ymd('20211001'), by = 'month'),
fill = list(qt_prod1 = 0, qt_prod2 = 0)) %>%
mutate(sales_date = format(sales_date, '%Y%m')) %>%
fill(first_buy, v1) %>%
ungroup()
# client_id sales_date v1 first_buy qt_prod1 qt_prod2
# <chr> <chr> <chr> <chr> <dbl> <dbl>
# 1 aaa 202105 xxx 202105 10 12
# 2 aaa 202106 xxx 202105 0 0
# 3 aaa 202107 xxx 202105 0 0
# 4 aaa 202108 xxx 202105 0 0
# 5 aaa 202109 xxx 202105 0 0
# 6 aaa 202110 xxx 202105 0 0
# 7 bbb 202107 xxx 202107 60 324
# 8 bbb 202108 xxx 202107 0 0
# 9 bbb 202109 xxx 202107 30 433
#10 bbb 202110 xxx 202107 0 0
#11 ccc 202106 yyy 202106 2 221
#12 ccc 202107 yyy 202106 0 0
#13 ccc 202108 yyy 202106 0 0
#14 ccc 202109 yyy 202106 0 0
#15 ccc 202110 yyy 202106 45 312
#16 ddd 202110 xxx 202110 11 312
#17 eee 202107 yyy 202107 14 312
#18 eee 202108 xxx 202107 167 123
#19 eee 202109 yyy 202107 145 121
#20 eee 202110 yyy 202107 0 0
我有一个数据框,其中包含客户从 5 月 (202105) 到 2021 年 10 月 (202110) 的购买信息。客户不是每个月都购买,但我想用零填充该信息。我的数据如下所示:
soex <- data.frame(client_id = c("aaa","bbb","bbb","ccc","ccc","ddd","eee","eee","eee"),
v1 = c("xxx","xxx","xxx","yyy","yyy","xxx","yyy","xxx","yyy"),
first_buy = c("202105","202107","202107","202106","202106","202110","202107","202107","202107"),
sales_date = c("202105","202107","202109","202106","202110","202110","202107","202108","202109"),
qt_prod1 = c(10,60,30,2,45,11,14,167,145),
qt_prod2 = c(12,324,433,221,312,312,312,123,121))
client_id v1 first_buy sales_date qt_prod1 qt_prod2
1 aaa xxx 202105 202105 10 12
2 bbb xxx 202107 202107 60 324
3 bbb xxx 202107 202109 30 433
4 ccc yyy 202106 202106 2 221
5 ccc yyy 202106 202110 45 312
6 ddd xxx 202110 202110 11 312
7 eee yyy 202107 202107 14 312
8 eee xxx 202107 202108 167 123
9 eee yyy 202107 202109 145 121
- client_id = 客户编号
- v1 = 随机变量
- first_buy = 首次购买的年月
- sales_date = 购买的年份和月份。第一个始终与 first_buy 相同
- qi_prod1(和 2)= 购买的产品数量
我需要的是一个如下所示的数据框:
ideal <- data.frame(client_id = c("aaa","aaa","aaa","aaa","aaa","aaa","bbb","bbb","bbb","bbb","ccc","ccc","ccc","ccc","ccc","ddd","eee","eee","eee","eee"),
v1 = c("xxx","xxx","xxx","xxx","xxx","xxx","xxx","xxx","xxx","xxx","yyy","yyy","yyy","yyy","yyy","xxx","yyy","xxx","yyy","yyy"),
first_buy = c("202105","202105","202105","202105","202105","202105","202107","202107","202107","202107","202106","202106","202106","202106","202106","202110","202107","202107","202107","202107"),
sales_date = c("202105","202106","202107","202108","202109","202110","202107","202108","202109","202110","202106","202107","202108","202109","202110","202110","202107","202108","202109","202110"),
qt_prod1 = c(10,0,0,0,0,0,60,0,0,30,2,0,0,0,45,11,14,167,145,0),
qt_prod2 = c(12,0,0,0,0,0,324,0,0,433,221,0,0,0,312,312,312,123,121,0))
client_id v1 first_buy sales_date qt_prod1 qt_prod2
1 aaa xxx 202105 202105 10 12
2 aaa xxx 202105 202106 0 0
3 aaa xxx 202105 202107 0 0
4 aaa xxx 202105 202108 0 0
5 aaa xxx 202105 202109 0 0
6 aaa xxx 202105 202110 0 0
7 bbb xxx 202107 202107 60 324
8 bbb xxx 202107 202108 0 0
9 bbb xxx 202107 202109 0 0
10 bbb xxx 202107 202110 30 433
11 ccc yyy 202106 202106 2 221
12 ccc yyy 202106 202107 0 0
13 ccc yyy 202106 202108 0 0
14 ccc yyy 202106 202109 0 0
15 ccc yyy 202106 202110 45 312
16 ddd xxx 202110 202110 11 312
17 eee yyy 202107 202107 14 312
18 eee xxx 202107 202108 167 123
19 eee yyy 202107 202109 145 121
20 eee yyy 202107 202110 0 0
当我不得不考虑 first_buy 变量时,我的问题就来了。正如您在“bbb”客户端中看到的那样,我不希望数据从 2021 年 5 月开始……我希望它从 first_buy 月开始,直到 202110。
我的另一个问题是用sales_date的信息填充V1变量的信息。如果你看,比如“eee”客户的案例,她在202110没有购买,但是在V1她有202109的信息
谢谢,
一个tidyverse
选项-
- 将
sales_date
更改为日期对象 - 为每个
client_id
创建一个每月的日期序列 fill
first_buy
和v1
列。
library(dplyr)
library(tidyr)
library(lubridate)
soex %>%
mutate(sales_date = ymd(paste0(sales_date, '01'))) %>%
group_by(client_id) %>%
complete(sales_date = seq(min(sales_date), ymd('20211001'), by = 'month'),
fill = list(qt_prod1 = 0, qt_prod2 = 0)) %>%
mutate(sales_date = format(sales_date, '%Y%m')) %>%
fill(first_buy, v1) %>%
ungroup()
# client_id sales_date v1 first_buy qt_prod1 qt_prod2
# <chr> <chr> <chr> <chr> <dbl> <dbl>
# 1 aaa 202105 xxx 202105 10 12
# 2 aaa 202106 xxx 202105 0 0
# 3 aaa 202107 xxx 202105 0 0
# 4 aaa 202108 xxx 202105 0 0
# 5 aaa 202109 xxx 202105 0 0
# 6 aaa 202110 xxx 202105 0 0
# 7 bbb 202107 xxx 202107 60 324
# 8 bbb 202108 xxx 202107 0 0
# 9 bbb 202109 xxx 202107 30 433
#10 bbb 202110 xxx 202107 0 0
#11 ccc 202106 yyy 202106 2 221
#12 ccc 202107 yyy 202106 0 0
#13 ccc 202108 yyy 202106 0 0
#14 ccc 202109 yyy 202106 0 0
#15 ccc 202110 yyy 202106 45 312
#16 ddd 202110 xxx 202110 11 312
#17 eee 202107 yyy 202107 14 312
#18 eee 202108 xxx 202107 167 123
#19 eee 202109 yyy 202107 145 121
#20 eee 202110 yyy 202107 0 0