如何有条件地检查和替换 xts 对象中的数据?
How to conditionally check and replace data in xts object?
这是一个可重现的数据集。问题是在一系列 NA 之间找到 1 或 2 个连续的非 NA 值并将它们分配为 NA。如果超过 2 个,则无需执行任何操作。
set.seed(55)
data <- rnorm(10)
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + 0:9*60
R <- xts(x = data, order.by = dates)
colnames(R) <- "R-factor"
R[c(1, 3, 6, 10)] <- NA
R
输出:
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 -1.812376850
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 -1.119221005
2019-03-18 10:34:00 0.001908206
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.505343855
2019-03-18 10:37:00 -0.099234393
2019-03-18 10:38:00 0.305353199
2019-03-18 10:39:00 NA
预期结果:
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 NA
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 NA
2019-03-18 10:34:00 NA
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.505343855
2019-03-18 10:37:00 -0.099234393
2019-03-18 10:38:00 0.305353199
2019-03-18 10:39:00 NA
我写了一个带有 for-loop 的函数,它适用于小型数据集,但速度非常慢。原始数据由100,000+个数据点组成,这个函数在超过10分钟后无法执行
任何人都可以帮助我避免循环以使其更快吗?
我猜,周围有更优雅的解决方案,但这将时间缩短了一半
R_df=as.data.frame(R)
R_df$shift_1=c(R_df$`R-factor`[-1],NA) #shift value one up
R_df$shift_2=c(NA,R_df$`R-factor`[-nrow(R_df)]) #shift value one down
# create new filtered variable
R_df$`R-factor_new`=ifelse(is.na(R_df$`R-factor`),NA,
ifelse((!is.na(R_df$shift_1))|(!is.na(R_df$shift_2)),
R_df$`R-factor`,NA)
> test replications elapsed relative user.self sys.self user.child sys.child
> 2 ifelseapproach 1000 0.83 1.000 0.65 0.19 NA NA
> 1 original 1000 1.81 2.181 1.76 0.01 NA NA
也许可以根据
试试这个
library(tidyverse)
set.seed(55)
x <- 100000
data <- rnorm(x)
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + (seq_len(x))*60
time_table1 <- tibble(time = dates,data = data)
time_table <- time_table1 %>%
mutate(random = rnorm(x),
new = if_else(random > data,NA_real_,data)) %>%
select(-data,-random) %>%
rename(data= new)
lengths_na <- time_table$data %>% is.na %>% rle %>% pluck('lengths')
the_operation <- . %>%
mutate(lengths_na =lengths_na %>% seq_along %>% rep(lengths_na)) %>%
group_by(lengths_na) %>%
add_tally() %>%
ungroup() %>%
mutate(replace_sequence = if_else(condition = n < 3,true = NA_real_,false = data))
microbenchmark::microbenchmark(time_table %>% the_operation)
效果还不错
Unit: milliseconds
expr min lq mean median uq max neval
time_table %>% the_operation 141.9009 176.2988 203.3744 190.183 214.1691 412.3161 100
也许这样读起来更简单
library(tidyverse)
set.seed(55)
# Create the data
x <- 100
data <- rnorm(x)
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + (seq_len(x))*60
time_table1 <- tibble(time = dates,data = data)
# Fake some na's
time_table <- time_table1 %>%
mutate(random = rnorm(x),
new = if_else(random > data,NA_real_,data)) %>%
select(-data,-random) %>%
rename(data= new)
# The rle function counts the occurrences of the same value in a vector,
# We create a T/F vector using is.na function
# meaning that we can count the lenght of sequences with or without na's
lengths_na <- time_table$data %>% is.na %>% rle %>% pluck('lengths')
# This operation here can be done outside of the df
new_col <- lengths_na %>%
seq_along %>% # Counts to the size of this vector
rep(lengths_na) # Reps the lengths of the sequences populating the vector
result <- time_table %>%
mutate(new_col =new_col) %>%
group_by(new_col) %>% # Operates the logic on this group look into the tidyverse
add_tally() %>% # Counts how many instance there are on each group
ungroup() %>% # Not actually needed but good manners
mutate(replace_sequence = if_else(condition = n < 3,true = NA_real_,false = data))
创建一个函数 Fillin
如果长度小于或等于 3,则 returns NA(如果第一个元素不是 NA,则为 2,这样我们就可以处理第一组,即使它不以 NA 开头),否则 returns 它的论点。使用 cumsum
对运行进行分组并将 Fillin
应用于每个组。
Fillin <- function(x) if (length(x) <= 3 - !is.na(x[1])) NA else x
Rc <- coredata(R)
R[] <- ave(Rc, cumsum(is.na(Rc)), FUN = Fillin)
给予:
> R
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 NA
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 NA
2019-03-18 10:34:00 NA
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.50534386
2019-03-18 10:37:00 -0.09923439
2019-03-18 10:38:00 0.30535320
2019-03-18 10:39:00 NA
性能
此解决方案的运行速度与使用 rle
的速度大致相同。
library(microbenchmark)
microbenchmark(
Fill = { Fillin <- function(x) if (length(x) <= 3 - !is.na(x[1])) NA else x
Rc <- coredata(R)
R[] <- ave(Rc, cumsum(is.na(Rc)), FUN = Fillin)
},
RLrep = { rleR <- rle(c(is.na(R[,1])))
is.na(R) <- with(rleR, rep(lengths < 3 , lengths ) )
}
)
给予:
Unit: microseconds
expr min lq mean median uq max neval cld
Fill 490.9 509.5 626.550 527.7 596.45 3411.1 100 a
RLrep 523.5 540.8 604.061 550.8 592.00 1244.4 100 a
这可能比提供的大多数其他解决方案都要快。 rep
函数本质上是 rle
函数的反函数。它采用两个向量参数,并将第一个值的计数扩展到第二个的长度,这允许基于运行长度进行测试,然后用 is.na <-
替换。实际上有两个不同的函数:rle(x) returns 一个长度为 (x) 的逻辑向量,然后 is.na(x)<-
根据 x 中的逻辑值将 NA 分配给 x 中的项目该函数右侧的向量。:
rleR <- rle(c(is.na(R[,1]))) #get the position and lengths of nonNA's and NA's
is.na(R) <- with(rleR, rep(lengths < 3 , lengths ) ) #set NAs
#--------------
> R
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 NA
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 NA
2019-03-18 10:34:00 NA
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.50534386
2019-03-18 10:37:00 -0.09923439
2019-03-18 10:38:00 0.30535320
2019-03-18 10:39:00 NA
Warning message:
timezone of object (CET) is different than current timezone ().
microbenchmark(
Fill = {Fillin <- function(x) if (length(x) <= 3 - !is.na(x[1])) NA else x
ave(R, cumsum(is.na(R)), FUN = Fillin)},
RLrep = {rleR <- rle(c(is.na(R[,1])))
is.na(R) <- with(rleR, rep(lengths < 3 , lengths ) )})
#----------------------
Unit: microseconds
expr min lq mean median uq max neval cld
Fill 1668.788 1784.6275 1942.5261 1844.5825 2005.0960 4911.762 100 b
RLrep 102.174 113.9565 144.3477 131.4735 156.6715 368.665 100 a
这是一个可重现的数据集。问题是在一系列 NA 之间找到 1 或 2 个连续的非 NA 值并将它们分配为 NA。如果超过 2 个,则无需执行任何操作。
set.seed(55)
data <- rnorm(10)
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + 0:9*60
R <- xts(x = data, order.by = dates)
colnames(R) <- "R-factor"
R[c(1, 3, 6, 10)] <- NA
R
输出:
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 -1.812376850
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 -1.119221005
2019-03-18 10:34:00 0.001908206
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.505343855
2019-03-18 10:37:00 -0.099234393
2019-03-18 10:38:00 0.305353199
2019-03-18 10:39:00 NA
预期结果:
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 NA
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 NA
2019-03-18 10:34:00 NA
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.505343855
2019-03-18 10:37:00 -0.099234393
2019-03-18 10:38:00 0.305353199
2019-03-18 10:39:00 NA
我写了一个带有 for-loop 的函数,它适用于小型数据集,但速度非常慢。原始数据由100,000+个数据点组成,这个函数在超过10分钟后无法执行
任何人都可以帮助我避免循环以使其更快吗?
我猜,周围有更优雅的解决方案,但这将时间缩短了一半
R_df=as.data.frame(R)
R_df$shift_1=c(R_df$`R-factor`[-1],NA) #shift value one up
R_df$shift_2=c(NA,R_df$`R-factor`[-nrow(R_df)]) #shift value one down
# create new filtered variable
R_df$`R-factor_new`=ifelse(is.na(R_df$`R-factor`),NA,
ifelse((!is.na(R_df$shift_1))|(!is.na(R_df$shift_2)),
R_df$`R-factor`,NA)
> test replications elapsed relative user.self sys.self user.child sys.child
> 2 ifelseapproach 1000 0.83 1.000 0.65 0.19 NA NA
> 1 original 1000 1.81 2.181 1.76 0.01 NA NA
也许可以根据
library(tidyverse)
set.seed(55)
x <- 100000
data <- rnorm(x)
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + (seq_len(x))*60
time_table1 <- tibble(time = dates,data = data)
time_table <- time_table1 %>%
mutate(random = rnorm(x),
new = if_else(random > data,NA_real_,data)) %>%
select(-data,-random) %>%
rename(data= new)
lengths_na <- time_table$data %>% is.na %>% rle %>% pluck('lengths')
the_operation <- . %>%
mutate(lengths_na =lengths_na %>% seq_along %>% rep(lengths_na)) %>%
group_by(lengths_na) %>%
add_tally() %>%
ungroup() %>%
mutate(replace_sequence = if_else(condition = n < 3,true = NA_real_,false = data))
microbenchmark::microbenchmark(time_table %>% the_operation)
效果还不错
Unit: milliseconds
expr min lq mean median uq max neval
time_table %>% the_operation 141.9009 176.2988 203.3744 190.183 214.1691 412.3161 100
也许这样读起来更简单
library(tidyverse)
set.seed(55)
# Create the data
x <- 100
data <- rnorm(x)
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + (seq_len(x))*60
time_table1 <- tibble(time = dates,data = data)
# Fake some na's
time_table <- time_table1 %>%
mutate(random = rnorm(x),
new = if_else(random > data,NA_real_,data)) %>%
select(-data,-random) %>%
rename(data= new)
# The rle function counts the occurrences of the same value in a vector,
# We create a T/F vector using is.na function
# meaning that we can count the lenght of sequences with or without na's
lengths_na <- time_table$data %>% is.na %>% rle %>% pluck('lengths')
# This operation here can be done outside of the df
new_col <- lengths_na %>%
seq_along %>% # Counts to the size of this vector
rep(lengths_na) # Reps the lengths of the sequences populating the vector
result <- time_table %>%
mutate(new_col =new_col) %>%
group_by(new_col) %>% # Operates the logic on this group look into the tidyverse
add_tally() %>% # Counts how many instance there are on each group
ungroup() %>% # Not actually needed but good manners
mutate(replace_sequence = if_else(condition = n < 3,true = NA_real_,false = data))
创建一个函数 Fillin
如果长度小于或等于 3,则 returns NA(如果第一个元素不是 NA,则为 2,这样我们就可以处理第一组,即使它不以 NA 开头),否则 returns 它的论点。使用 cumsum
对运行进行分组并将 Fillin
应用于每个组。
Fillin <- function(x) if (length(x) <= 3 - !is.na(x[1])) NA else x
Rc <- coredata(R)
R[] <- ave(Rc, cumsum(is.na(Rc)), FUN = Fillin)
给予:
> R
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 NA
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 NA
2019-03-18 10:34:00 NA
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.50534386
2019-03-18 10:37:00 -0.09923439
2019-03-18 10:38:00 0.30535320
2019-03-18 10:39:00 NA
性能
此解决方案的运行速度与使用 rle
的速度大致相同。
library(microbenchmark)
microbenchmark(
Fill = { Fillin <- function(x) if (length(x) <= 3 - !is.na(x[1])) NA else x
Rc <- coredata(R)
R[] <- ave(Rc, cumsum(is.na(Rc)), FUN = Fillin)
},
RLrep = { rleR <- rle(c(is.na(R[,1])))
is.na(R) <- with(rleR, rep(lengths < 3 , lengths ) )
}
)
给予:
Unit: microseconds
expr min lq mean median uq max neval cld
Fill 490.9 509.5 626.550 527.7 596.45 3411.1 100 a
RLrep 523.5 540.8 604.061 550.8 592.00 1244.4 100 a
这可能比提供的大多数其他解决方案都要快。 rep
函数本质上是 rle
函数的反函数。它采用两个向量参数,并将第一个值的计数扩展到第二个的长度,这允许基于运行长度进行测试,然后用 is.na <-
替换。实际上有两个不同的函数:rle(x) returns 一个长度为 (x) 的逻辑向量,然后 is.na(x)<-
根据 x 中的逻辑值将 NA 分配给 x 中的项目该函数右侧的向量。:
rleR <- rle(c(is.na(R[,1]))) #get the position and lengths of nonNA's and NA's
is.na(R) <- with(rleR, rep(lengths < 3 , lengths ) ) #set NAs
#--------------
> R
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 NA
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 NA
2019-03-18 10:34:00 NA
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.50534386
2019-03-18 10:37:00 -0.09923439
2019-03-18 10:38:00 0.30535320
2019-03-18 10:39:00 NA
Warning message:
timezone of object (CET) is different than current timezone ().
microbenchmark(
Fill = {Fillin <- function(x) if (length(x) <= 3 - !is.na(x[1])) NA else x
ave(R, cumsum(is.na(R)), FUN = Fillin)},
RLrep = {rleR <- rle(c(is.na(R[,1])))
is.na(R) <- with(rleR, rep(lengths < 3 , lengths ) )})
#----------------------
Unit: microseconds
expr min lq mean median uq max neval cld
Fill 1668.788 1784.6275 1942.5261 1844.5825 2005.0960 4911.762 100 b
RLrep 102.174 113.9565 144.3477 131.4735 156.6715 368.665 100 a