在单独的列中每小时过去后获取字符列的模式?
Getting the mode of a character column after every hour elapsed in separate column?
我有一个非常大的数据集 - 大约 2000 万个观测值,这是它的基本结构 -
date time string
1 01/01/2020 20:00:00 A
2 01/01/2020 20:13:12 B
3 01/01/2020 20:37:45 C
4 01/01/2020 20:39:07 D
5 01/01/2020 20:41:29 A
6 01/01/2020 20:46:48 E
7 01/01/2020 21:00:00 J
我想要一个新列,也许是“mode”,它会计算“string”列中最常出现的文本字符串,但仅以小时为间隔。所以 table 最终会变成这样 -
date time string mode
1 01/01/2020 20:00:00 A
2 01/01/2020 20:13:12 B
3 01/01/2020 20:37:45 C
4 01/01/2020 20:39:07 D
5 01/01/2020 20:41:29 A
6 01/01/2020 20:46:48 E
7 01/01/2020 21:00:00 J A
8 01/01/2020 21:20:12 I
9 01/01/2020 21:38:32 I
10 01/01/2020 21:43:12 A
11 01/01/2020 21:49:50 I
12 01/01/2020 21:54:50 B
13 01/01/2020 22:00:00 A I
14 01/01/2020 22:03:45 B
因此每次小时数字递增时,都会对字符串列采用一种模式,然后模式测量会在下一个小时长的间隔内重置。
我是 R 的新手,所以很遗憾,我没有任何尝试或错误消息可以显示。我看过许多其他类似的主题/线程,但没有遇到任何帮助我找到可行解决方案的东西。当然不会要求任何人为我编写代码 - 任何建议都将不胜感激。
一个选项是使用 here
中的 Mode
函数
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
通过 paste
对 'date'、'time' 列创建分组变量,将其转换为具有 dmy_hms
的日期时间 class(来自 lubridate
),然后使用 ceiling_date
将“1 小时”指定为 unit
,通过在 'string' 列上应用 Mode
创建 'mode' 列,然后使用case_when
到 return 该值仅在每个组的最后一行
library(dplyr)
library(lubridate
library(stringr)
df1 %>%
group_by(grp = ceiling_date(dmy_hms(str_c(date, time, sep=" ")),
'1 hour')) %>%
mutate(mode = case_when(row_number() == n() ~ Mode(string),
TRUE ~ "")) %>%
ungroup %>%
select(-grp)
-输出
# A tibble: 14 x 4
# date time string mode
# <chr> <chr> <chr> <chr>
# 1 01/01/2020 20:04:01 A ""
# 2 01/01/2020 20:13:12 B ""
# 3 01/01/2020 20:37:45 C ""
# 4 01/01/2020 20:39:07 D ""
# 5 01/01/2020 20:41:29 A ""
# 6 01/01/2020 20:46:48 E ""
# 7 01/01/2020 21:00:00 J "A"
# 8 01/01/2020 21:20:12 I ""
# 9 01/01/2020 21:38:32 I ""
#10 01/01/2020 21:43:12 A ""
#11 01/01/2020 21:49:50 I ""
#12 01/01/2020 21:54:50 B ""
#13 01/01/2020 22:00:00 A "I"
#14 01/01/2020 22:03:45 B "B"
数据
df1 <- structure(list(date = c("01/01/2020", "01/01/2020", "01/01/2020",
"01/01/2020", "01/01/2020", "01/01/2020", "01/01/2020", "01/01/2020",
"01/01/2020", "01/01/2020", "01/01/2020", "01/01/2020", "01/01/2020",
"01/01/2020"), time = c("20:04:01", "20:13:12", "20:37:45", "20:39:07",
"20:41:29", "20:46:48", "21:00:00", "21:20:12", "21:38:32", "21:43:12",
"21:49:50", "21:54:50", "22:00:00", "22:03:45"), string = c("A",
"B", "C", "D", "A", "E", "J", "I", "I", "A", "I", "B", "A", "B"
)), class = "data.frame", row.names = c("1", "2", "3", "4", "5",
"6", "7", "8", "9", "10", "11", "12", "13", "14"))
使用library(data.table)
我们可以做到
setDT(df1)[, hour := paste(date, sub(':.+','', time))]
df1[, n := seq(.N), by = .(hour, string)]
df1[, mode := string[which.max(n)], by=hour]
我有一个非常大的数据集 - 大约 2000 万个观测值,这是它的基本结构 -
date time string
1 01/01/2020 20:00:00 A
2 01/01/2020 20:13:12 B
3 01/01/2020 20:37:45 C
4 01/01/2020 20:39:07 D
5 01/01/2020 20:41:29 A
6 01/01/2020 20:46:48 E
7 01/01/2020 21:00:00 J
我想要一个新列,也许是“mode”,它会计算“string”列中最常出现的文本字符串,但仅以小时为间隔。所以 table 最终会变成这样 -
date time string mode
1 01/01/2020 20:00:00 A
2 01/01/2020 20:13:12 B
3 01/01/2020 20:37:45 C
4 01/01/2020 20:39:07 D
5 01/01/2020 20:41:29 A
6 01/01/2020 20:46:48 E
7 01/01/2020 21:00:00 J A
8 01/01/2020 21:20:12 I
9 01/01/2020 21:38:32 I
10 01/01/2020 21:43:12 A
11 01/01/2020 21:49:50 I
12 01/01/2020 21:54:50 B
13 01/01/2020 22:00:00 A I
14 01/01/2020 22:03:45 B
因此每次小时数字递增时,都会对字符串列采用一种模式,然后模式测量会在下一个小时长的间隔内重置。
我是 R 的新手,所以很遗憾,我没有任何尝试或错误消息可以显示。我看过许多其他类似的主题/线程,但没有遇到任何帮助我找到可行解决方案的东西。当然不会要求任何人为我编写代码 - 任何建议都将不胜感激。
一个选项是使用 here
中的Mode
函数
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
通过 paste
对 'date'、'time' 列创建分组变量,将其转换为具有 dmy_hms
的日期时间 class(来自 lubridate
),然后使用 ceiling_date
将“1 小时”指定为 unit
,通过在 'string' 列上应用 Mode
创建 'mode' 列,然后使用case_when
到 return 该值仅在每个组的最后一行
library(dplyr)
library(lubridate
library(stringr)
df1 %>%
group_by(grp = ceiling_date(dmy_hms(str_c(date, time, sep=" ")),
'1 hour')) %>%
mutate(mode = case_when(row_number() == n() ~ Mode(string),
TRUE ~ "")) %>%
ungroup %>%
select(-grp)
-输出
# A tibble: 14 x 4
# date time string mode
# <chr> <chr> <chr> <chr>
# 1 01/01/2020 20:04:01 A ""
# 2 01/01/2020 20:13:12 B ""
# 3 01/01/2020 20:37:45 C ""
# 4 01/01/2020 20:39:07 D ""
# 5 01/01/2020 20:41:29 A ""
# 6 01/01/2020 20:46:48 E ""
# 7 01/01/2020 21:00:00 J "A"
# 8 01/01/2020 21:20:12 I ""
# 9 01/01/2020 21:38:32 I ""
#10 01/01/2020 21:43:12 A ""
#11 01/01/2020 21:49:50 I ""
#12 01/01/2020 21:54:50 B ""
#13 01/01/2020 22:00:00 A "I"
#14 01/01/2020 22:03:45 B "B"
数据
df1 <- structure(list(date = c("01/01/2020", "01/01/2020", "01/01/2020",
"01/01/2020", "01/01/2020", "01/01/2020", "01/01/2020", "01/01/2020",
"01/01/2020", "01/01/2020", "01/01/2020", "01/01/2020", "01/01/2020",
"01/01/2020"), time = c("20:04:01", "20:13:12", "20:37:45", "20:39:07",
"20:41:29", "20:46:48", "21:00:00", "21:20:12", "21:38:32", "21:43:12",
"21:49:50", "21:54:50", "22:00:00", "22:03:45"), string = c("A",
"B", "C", "D", "A", "E", "J", "I", "I", "A", "I", "B", "A", "B"
)), class = "data.frame", row.names = c("1", "2", "3", "4", "5",
"6", "7", "8", "9", "10", "11", "12", "13", "14"))
使用library(data.table)
我们可以做到
setDT(df1)[, hour := paste(date, sub(':.+','', time))]
df1[, n := seq(.N), by = .(hour, string)]
df1[, mode := string[which.max(n)], by=hour]