尝试在 R 数据框的分组内按顺序编号
Trying to label sequentially within groups of dataframe R
我有我的数据框的一个子集:
# Example data from the question: two rows for retailer 1 / store 166 and
# eighteen rows for retailer 2 / store 167.
df <- data.frame(
  retailer_id = c(1, 1, rep(2, 18)),
  store_id = c(166, 166, rep(167, 18)),
  quad_id = c(
    # retailer 1, store 166
    2017010104, 2017012904,
    # retailer 2, store 167 (2017)
    2017010104, 2017012904, 2017022604, 2017032604, 2017042304,
    2017052104, 2017061804, 2017071604, 2017081304, 2017091004,
    2017100804, 2017110504, 2017120304, 2017123104,
    # retailer 2, store 167 (2018)
    2018012804, 2018022504, 2018032504, 2018042204
  )
)
其中 2017010104 对应日期 01/01/2017 等等。我试图参考年份按顺序标记这些不同的 quad_ids。因此,例如我正在尝试获取输出:
# Desired result: same data plus Snum, a counter that restarts at 1 for each
# retailer / store / year combination.
df <- data.frame(
  retailer_id = c(1, 1, rep(2, 18)),
  store_id = c(166, 166, rep(167, 18)),
  quad_id = c(
    # retailer 1, store 166
    2017010104, 2017012904,
    # retailer 2, store 167 (2017)
    2017010104, 2017012904, 2017022604, 2017032604, 2017042304,
    2017052104, 2017061804, 2017071604, 2017081304, 2017091004,
    2017100804, 2017110504, 2017120304, 2017123104,
    # retailer 2, store 167 (2018)
    2018012804, 2018022504, 2018032504, 2018042204
  ),
  # Kept as double, like the literal values it replaces.
  Snum = as.numeric(c(1:2, 1:14, 1:4))
)
可以看到,对于 retailer_id = 2、store_id = 167,2017 年的各周被标记为 1–14;进入 2018 年后,编号重新从 1 开始依次递增,直到该分组中出现 2019 年的周为止(届时再次从 1 开始)。
我试过了:
# Attempt from the question. data.table() and `:=` come from the data.table
# package, so it has to be attached before this snippet can run.
library(data.table)

DT <- data.table(df)
# Adds Snum by reference: a running row counter within each
# retailer_id / store_id pair (the year is not part of the grouping).
DT[, Snum := seq_len(.N), by = .(retailer_id, store_id)]
然而,这并没有按年份重新编号,而只是在每个 retailer_id / store_id 分组内连续编号。有没有办法解决这个问题?(此示例仅包含两个零售商和两个商店,而我的实际数据框包含数百个不同的零售商和商店。)
下面是一个使用 tidyverse 的解决方案:
# Example data: two rows for retailer 1 / store 166 and eighteen rows for
# retailer 2 / store 167.
df <- data.frame(
  retailer_id = c(1, 1, rep(2, 18)),
  store_id = c(166, 166, rep(167, 18)),
  quad_id = c(
    # retailer 1, store 166
    2017010104, 2017012904,
    # retailer 2, store 167 (2017)
    2017010104, 2017012904, 2017022604, 2017032604, 2017042304,
    2017052104, 2017061804, 2017071604, 2017081304, 2017091004,
    2017100804, 2017110504, 2017120304, 2017123104,
    # retailer 2, store 167 (2018)
    2018012804, 2018022504, 2018032504, 2018042204
  )
)
library(tidyverse)
#' Extract the leading four-digit year of an identifier.
#'
#' @param x Vector of identifiers whose text form starts with the year, such
#'   as 2017010104. Non-character input is converted with `as.character()`.
#' @return An integer vector the same length as `x`; `NA` for elements that do
#'   not start with four digits.
getYear <- function(x) {
  txt <- as.character(x)
  # A regex written as "\d" is not a valid escape inside an R string literal;
  # an explicit digit class avoids the escaping problem altogether.
  has_year <- grepl("^[0-9]{4}", txt)
  as.integer(ifelse(has_year, substr(txt, 1L, 4L), NA_character_))
}
# Tag every row with the year taken from quad_id, then number the rows inside
# each (year, retailer_id, store_id) group in their current order.
# row_number() is used instead of 1:n() because 1:n() evaluates to c(1, 0)
# when a chunk has no rows. The result is left grouped.
tmp <- df %>%
  mutate(year = getYear(quad_id)) %>%
  group_by(year, retailer_id, store_id) %>%
  mutate(Snum = row_number())
> tmp
# A tibble: 20 x 5
# Groups: year, retailer_id, store_id [3]
retailer_id store_id quad_id year Snum
<dbl> <dbl> <dbl> <int> <int>
1 1 166 2017010104 2017 1
2 1 166 2017012904 2017 2
3 2 167 2017010104 2017 1
4 2 167 2017012904 2017 2
5 2 167 2017022604 2017 3
6 2 167 2017032604 2017 4
7 2 167 2017042304 2017 5
8 2 167 2017052104 2017 6
9 2 167 2017061804 2017 7
10 2 167 2017071604 2017 8
11 2 167 2017081304 2017 9
12 2 167 2017091004 2017 10
13 2 167 2017100804 2017 11
14 2 167 2017110504 2017 12
15 2 167 2017120304 2017 13
16 2 167 2017123104 2017 14
17 2 167 2018012804 2018 1
18 2 167 2018022504 2018 2
19 2 167 2018032504 2018 3
20 2 167 2018042204 2018 4
请注意:Snum 是按照各分组内现有的行顺序编号的,因此如果您的数据未按 retailer_id、store_id 和 year 排序,则会导致问题。
我们可以使用 stringr 包中的 str_match 和正则表达式 '^[[:digit:]]{4}' 来匹配前四位数字:
library(dplyr)
library(stringr)
# Number the rows within every retailer / store / year combination.
# Grouping by the year alone would keep counting across stores, so
# retailer_id and store_id are part of the grouping as well.
# str_match() returns a matrix; `[, 1]` keeps the full match as a plain
# character vector to serve as the year key.
df %>%
  group_by(
    retailer_id,
    store_id,
    year = str_match(quad_id, '^[[:digit:]]{4}')[, 1]
  ) %>%
  mutate(Snum = row_number()) %>%
  # Drop the grouping and the helper column so only Snum is added.
  ungroup() %>%
  select(-year)

# Output:
#>    retailer_id store_id    quad_id  Snum
#>          <dbl>    <dbl>      <dbl> <int>
#>  1           1      166 2017010104     1
#>  2           1      166 2017012904     2
#>  3           2      167 2017010104     1
#>  4           2      167 2017012904     2
#>  5           2      167 2017022604     3
#>  6           2      167 2017032604     4
#>  7           2      167 2017042304     5
#>  8           2      167 2017052104     6
#>  9           2      167 2017061804     7
#> 10           2      167 2017071604     8
#> 11           2      167 2017081304     9
#> 12           2      167 2017091004    10
#> 13           2      167 2017100804    11
#> 14           2      167 2017110504    12
#> 15           2      167 2017120304    13
#> 16           2      167 2017123104    14
#> 17           2      167 2018012804     1
#> 18           2      167 2018022504     2
#> 19           2      167 2018032504     3
#> 20           2      167 2018042204     4
我有我的数据框的一个子集:
# Example data from the question: two rows for retailer 1 / store 166 and
# eighteen rows for retailer 2 / store 167.
df <- data.frame(
  retailer_id = c(1, 1, rep(2, 18)),
  store_id = c(166, 166, rep(167, 18)),
  quad_id = c(
    # retailer 1, store 166
    2017010104, 2017012904,
    # retailer 2, store 167 (2017)
    2017010104, 2017012904, 2017022604, 2017032604, 2017042304,
    2017052104, 2017061804, 2017071604, 2017081304, 2017091004,
    2017100804, 2017110504, 2017120304, 2017123104,
    # retailer 2, store 167 (2018)
    2018012804, 2018022504, 2018032504, 2018042204
  )
)
其中 2017010104 对应日期 01/01/2017 等等。我试图参考年份按顺序标记这些不同的 quad_ids。因此,例如我正在尝试获取输出:
# Desired result: same data plus Snum, a counter that restarts at 1 for each
# retailer / store / year combination.
df <- data.frame(
  retailer_id = c(1, 1, rep(2, 18)),
  store_id = c(166, 166, rep(167, 18)),
  quad_id = c(
    # retailer 1, store 166
    2017010104, 2017012904,
    # retailer 2, store 167 (2017)
    2017010104, 2017012904, 2017022604, 2017032604, 2017042304,
    2017052104, 2017061804, 2017071604, 2017081304, 2017091004,
    2017100804, 2017110504, 2017120304, 2017123104,
    # retailer 2, store 167 (2018)
    2018012804, 2018022504, 2018032504, 2018042204
  ),
  # Kept as double, like the literal values it replaces.
  Snum = as.numeric(c(1:2, 1:14, 1:4))
)
可以看到,对于 retailer_id = 2、store_id = 167,2017 年的各周被标记为 1–14;进入 2018 年后,编号重新从 1 开始依次递增,直到该分组中出现 2019 年的周为止(届时再次从 1 开始)。
我试过了:
# Attempt from the question. data.table() and `:=` come from the data.table
# package, so it has to be attached before this snippet can run.
library(data.table)

DT <- data.table(df)
# Adds Snum by reference: a running row counter within each
# retailer_id / store_id pair (the year is not part of the grouping).
DT[, Snum := seq_len(.N), by = .(retailer_id, store_id)]
然而,这并没有按年份重新编号,而只是在每个 retailer_id / store_id 分组内连续编号。有没有办法解决这个问题?(此示例仅包含两个零售商和两个商店,而我的实际数据框包含数百个不同的零售商和商店。)
下面是一个使用 tidyverse 的解决方案:
# Example data: two rows for retailer 1 / store 166 and eighteen rows for
# retailer 2 / store 167.
df <- data.frame(
  retailer_id = c(1, 1, rep(2, 18)),
  store_id = c(166, 166, rep(167, 18)),
  quad_id = c(
    # retailer 1, store 166
    2017010104, 2017012904,
    # retailer 2, store 167 (2017)
    2017010104, 2017012904, 2017022604, 2017032604, 2017042304,
    2017052104, 2017061804, 2017071604, 2017081304, 2017091004,
    2017100804, 2017110504, 2017120304, 2017123104,
    # retailer 2, store 167 (2018)
    2018012804, 2018022504, 2018032504, 2018042204
  )
)
library(tidyverse)
#' Extract the leading four-digit year of an identifier.
#'
#' @param x Vector of identifiers whose text form starts with the year, such
#'   as 2017010104. Non-character input is converted with `as.character()`.
#' @return An integer vector the same length as `x`; `NA` for elements that do
#'   not start with four digits.
getYear <- function(x) {
  txt <- as.character(x)
  # A regex written as "\d" is not a valid escape inside an R string literal;
  # an explicit digit class avoids the escaping problem altogether.
  has_year <- grepl("^[0-9]{4}", txt)
  as.integer(ifelse(has_year, substr(txt, 1L, 4L), NA_character_))
}
# Tag every row with the year taken from quad_id, then number the rows inside
# each (year, retailer_id, store_id) group in their current order.
# row_number() is used instead of 1:n() because 1:n() evaluates to c(1, 0)
# when a chunk has no rows. The result is left grouped.
tmp <- df %>%
  mutate(year = getYear(quad_id)) %>%
  group_by(year, retailer_id, store_id) %>%
  mutate(Snum = row_number())
> tmp
# A tibble: 20 x 5
# Groups: year, retailer_id, store_id [3]
retailer_id store_id quad_id year Snum
<dbl> <dbl> <dbl> <int> <int>
1 1 166 2017010104 2017 1
2 1 166 2017012904 2017 2
3 2 167 2017010104 2017 1
4 2 167 2017012904 2017 2
5 2 167 2017022604 2017 3
6 2 167 2017032604 2017 4
7 2 167 2017042304 2017 5
8 2 167 2017052104 2017 6
9 2 167 2017061804 2017 7
10 2 167 2017071604 2017 8
11 2 167 2017081304 2017 9
12 2 167 2017091004 2017 10
13 2 167 2017100804 2017 11
14 2 167 2017110504 2017 12
15 2 167 2017120304 2017 13
16 2 167 2017123104 2017 14
17 2 167 2018012804 2018 1
18 2 167 2018022504 2018 2
19 2 167 2018032504 2018 3
20 2 167 2018042204 2018 4
请注意:Snum 是按照各分组内现有的行顺序编号的,因此如果您的数据未按 retailer_id、store_id 和 year 排序,则会导致问题。
我们可以使用 stringr 包中的 str_match 和正则表达式 '^[[:digit:]]{4}' 来匹配前四位数字:
library(dplyr)
library(stringr)
# Number the rows within every retailer / store / year combination.
# Grouping by the year alone would keep counting across stores, so
# retailer_id and store_id are part of the grouping as well.
# str_match() returns a matrix; `[, 1]` keeps the full match as a plain
# character vector to serve as the year key.
df %>%
  group_by(
    retailer_id,
    store_id,
    year = str_match(quad_id, '^[[:digit:]]{4}')[, 1]
  ) %>%
  mutate(Snum = row_number()) %>%
  # Drop the grouping and the helper column so only Snum is added.
  ungroup() %>%
  select(-year)

# Output:
#>    retailer_id store_id    quad_id  Snum
#>          <dbl>    <dbl>      <dbl> <int>
#>  1           1      166 2017010104     1
#>  2           1      166 2017012904     2
#>  3           2      167 2017010104     1
#>  4           2      167 2017012904     2
#>  5           2      167 2017022604     3
#>  6           2      167 2017032604     4
#>  7           2      167 2017042304     5
#>  8           2      167 2017052104     6
#>  9           2      167 2017061804     7
#> 10           2      167 2017071604     8
#> 11           2      167 2017081304     9
#> 12           2      167 2017091004    10
#> 13           2      167 2017100804    11
#> 14           2      167 2017110504    12
#> 15           2      167 2017120304    13
#> 16           2      167 2017123104    14
#> 17           2      167 2018012804     1
#> 18           2      167 2018022504     2
#> 19           2      167 2018032504     3
#> 20           2      167 2018042204     4