r 沿向量搜索并计算平均值
r search along a vector and calculate the mean
我的数据如下:
# Attach data.table with library(): it stops with an error when the package is
# missing, whereas require() only returns FALSE and lets later code fail.
library(data.table)

# Example data: x holds the run labels, y the values to be averaged.
DT <- data.table(
  x = c(19, 19, 19, 21, 21, 19, 19, 22, 22, 22),
  y = c(53, 54, 55, 32, 44, 45, 49, 56, 57, 58)
)
我想沿着 x 搜索,并计算 y 的均值。
但是,当使用下面的代码时:
# Group by the distinct values of x: rows sharing the same x are pooled into
# one group even when they are not adjacent.
DT[, list(my = mean(y)), by = x]
我得到的是所有 x 值相同的行的整体均值,即使这些行并不相邻。
我想沿着 x 搜索,每当 x 的值发生变化时,就重新计算一个均值。对于上面提供的示例,期望的输出是:
# Expected result: one row per run of consecutive equal x values.
DTans <- data.table(
  x  = c(19, 21, 19, 22),
  my = c(54, 38, 47, 57)
)
我们可以使用 rleid 创建另一个分组变量,按组计算 'y' 的 mean,然后将 'indx' 赋值为 NULL 以删除该辅助列。
library(data.table) # v 1.9.5+

# rleid(x) gives every run of consecutive equal x values its own id, so
# grouping by it (together with x) yields one mean per run. The helper column
# is then dropped by reference. `:=` returns its result invisibly, so the
# trailing `[]` is required for the table to be printed at the console.
DT[, .(my = mean(y)), by = .(indx = rleid(x), x)][, indx := NULL][]
#     x my
# 1: 19 54
# 2: 21 38
# 3: 19 47
# 4: 22 57
基准测试
# Reproducible benchmark data: 1e7 rows, x drawn from 1..100 and y from
# 1..10000, both sampled with replacement.
set.seed(24)
foo <- function(x) {
  sample(x, size = 1e7L, replace = TRUE)
}
DT <- data.table(x = foo(100L), y = foo(10000L))
# Base-R run detection plus tapply(): flags the first row of every run of
# equal x values, then averages y within each run. Reads the global DT.
josilber <- function() {
  # 1 marks the start of a run, 0 a continuation; the first row always starts one.
  run_start <- c(1, diff(DT$x) != 0)
  run_id <- cumsum(run_start)
  run_means <- tapply(DT$y, run_id, mean)
  # The result is returned invisibly.
  invisible(data.table(x = DT$x[run_start == 1], my = run_means))
}
# data.table grouping on a run id built with diff() and cumsum(): the id
# increases by one each time x differs from the previous row. Reads the
# global DT.
Roland <- function() {
  DT[,
    list(my = mean(y), x = x[1]),
    by = cumsum(c(1, diff(x) != 0))
  ]
}
# data.table grouping on rleid(x), which numbers the runs of consecutive equal
# x values. Reads the global DT.
akrun <- function() {
  run_means <- DT[, list(my = mean(y)), by = list(indx = rleid(x), x)]
  # Drop the helper run id by reference; `:=` returns the table invisibly.
  run_means[, indx := NULL]
}
# Base-R approach: rle() yields the value and length of every run of equal x
# values; each row is labelled with the index of its run and tapply() averages
# y per run. Reads the global DT and returns a data.frame with columns x and my.
bgoldst <- function() {
  runs <- rle(DT$x)
  # seq_along() rather than 1:length(): with no runs it yields an empty id
  # vector instead of c(1, 0), which rep() would reject.
  run_id <- rep(seq_along(runs$lengths), runs$lengths)
  data.frame(x = runs$values, my = tapply(DT$y, run_id, mean))
}
# Time a single run of each implementation. The commented lines below each
# call are the recorded system.time() output (user / system / elapsed seconds).
system.time(josilber())
# user system elapsed
#159.405 1.759 161.110
system.time(bgoldst())
# user system elapsed
#162.628 0.782 163.380
system.time(Roland())
# user system elapsed
# 18.633 0.052 18.678
system.time(akrun())
# user system elapsed
# 1.242 0.003 1.246
您可以先识别出由连续相同元素构成的各个分组,然后求出每个分组对应的 x 值和 y 的均值:
# 1 flags the first row of each run of equal x values, 0 a continuation.
new.group <- c(1, diff(DT$x) != 0)
new.group
# [1] 1 0 0 1 0 1 0 1 0 0

# The running sum of the flags is the run id; take the first x and the mean of
# y within each run.
DT[, .(x = x[1L], my = mean(y)), by = .(indx = cumsum(new.group))]
#    indx  x my
# 1:    1 19 54
# 2:    2 21 38
# 3:    3 19 47
# 4:    4 22 57
# Base-R alternative: rle() gives the run values and lengths; each row is
# labelled with the index of its run and tapply() averages y per run.
# seq_along() is used because 1:length() would produce c(1, 0) when there are
# no runs, which rep() rejects.
with(
  rle(DT$x),
  data.frame(
    x = values,
    my = tapply(DT$y, rep(seq_along(lengths), lengths), mean)
  )
)
##    x my
## 1 19 54
## 2 21 38
## 3 19 47
## 4 22 57
我的数据如下:
# Attach data.table with library(): it stops with an error when the package is
# missing, whereas require() only returns FALSE and lets later code fail.
library(data.table)

# Example data: x holds the run labels, y the values to be averaged.
DT <- data.table(
  x = c(19, 19, 19, 21, 21, 19, 19, 22, 22, 22),
  y = c(53, 54, 55, 32, 44, 45, 49, 56, 57, 58)
)
我想沿着 x 搜索,并计算 y 的均值。但是,当使用下面的代码时:
# Group by the distinct values of x: rows sharing the same x are pooled into
# one group even when they are not adjacent.
DT[, list(my = mean(y)), by = x]
我得到的是所有 x 值相同的行的整体均值,即使这些行并不相邻。我想沿着 x 搜索,每当 x 的值发生变化时,就重新计算一个均值。对于上面提供的示例,期望的输出是:
# Expected result: one row per run of consecutive equal x values.
DTans <- data.table(
  x  = c(19, 21, 19, 22),
  my = c(54, 38, 47, 57)
)
我们可以使用 rleid 创建另一个分组变量,按组计算 'y' 的 mean,然后将 'indx' 赋值为 NULL 以删除该辅助列。
library(data.table) # v 1.9.5+

# rleid(x) gives every run of consecutive equal x values its own id, so
# grouping by it (together with x) yields one mean per run. The helper column
# is then dropped by reference. `:=` returns its result invisibly, so the
# trailing `[]` is required for the table to be printed at the console.
DT[, .(my = mean(y)), by = .(indx = rleid(x), x)][, indx := NULL][]
#     x my
# 1: 19 54
# 2: 21 38
# 3: 19 47
# 4: 22 57
基准测试
# Reproducible benchmark data: 1e7 rows, x drawn from 1..100 and y from
# 1..10000, both sampled with replacement.
set.seed(24)
foo <- function(x) {
  sample(x, size = 1e7L, replace = TRUE)
}
DT <- data.table(x = foo(100L), y = foo(10000L))
# Base-R run detection plus tapply(): flags the first row of every run of
# equal x values, then averages y within each run. Reads the global DT.
josilber <- function() {
  # 1 marks the start of a run, 0 a continuation; the first row always starts one.
  run_start <- c(1, diff(DT$x) != 0)
  run_id <- cumsum(run_start)
  run_means <- tapply(DT$y, run_id, mean)
  # The result is returned invisibly.
  invisible(data.table(x = DT$x[run_start == 1], my = run_means))
}
# data.table grouping on a run id built with diff() and cumsum(): the id
# increases by one each time x differs from the previous row. Reads the
# global DT.
Roland <- function() {
  DT[,
    list(my = mean(y), x = x[1]),
    by = cumsum(c(1, diff(x) != 0))
  ]
}
# data.table grouping on rleid(x), which numbers the runs of consecutive equal
# x values. Reads the global DT.
akrun <- function() {
  run_means <- DT[, list(my = mean(y)), by = list(indx = rleid(x), x)]
  # Drop the helper run id by reference; `:=` returns the table invisibly.
  run_means[, indx := NULL]
}
# Base-R approach: rle() yields the value and length of every run of equal x
# values; each row is labelled with the index of its run and tapply() averages
# y per run. Reads the global DT and returns a data.frame with columns x and my.
bgoldst <- function() {
  runs <- rle(DT$x)
  # seq_along() rather than 1:length(): with no runs it yields an empty id
  # vector instead of c(1, 0), which rep() would reject.
  run_id <- rep(seq_along(runs$lengths), runs$lengths)
  data.frame(x = runs$values, my = tapply(DT$y, run_id, mean))
}
# Time a single run of each implementation. The commented lines below each
# call are the recorded system.time() output (user / system / elapsed seconds).
system.time(josilber())
# user system elapsed
#159.405 1.759 161.110
system.time(bgoldst())
# user system elapsed
#162.628 0.782 163.380
system.time(Roland())
# user system elapsed
# 18.633 0.052 18.678
system.time(akrun())
# user system elapsed
# 1.242 0.003 1.246
您可以先识别出由连续相同元素构成的各个分组,然后求出每个分组对应的 x 值和 y 的均值:
# 1 flags the first row of each run of equal x values, 0 a continuation.
new.group <- c(1, diff(DT$x) != 0)
new.group
# [1] 1 0 0 1 0 1 0 1 0 0

# The running sum of the flags is the run id; take the first x and the mean of
# y within each run.
DT[, .(x = x[1L], my = mean(y)), by = .(indx = cumsum(new.group))]
#    indx  x my
# 1:    1 19 54
# 2:    2 21 38
# 3:    3 19 47
# 4:    4 22 57
# Base-R alternative: rle() gives the run values and lengths; each row is
# labelled with the index of its run and tapply() averages y per run.
# seq_along() is used because 1:length() would produce c(1, 0) when there are
# no runs, which rep() rejects.
with(
  rle(DT$x),
  data.frame(
    x = values,
    my = tapply(DT$y, rep(seq_along(lengths), lengths), mean)
  )
)
##    x my
## 1 19 54
## 2 21 38
## 3 19 47
## 4 22 57