如何创建一个新变量,一旦达到序列中的最大值,它就会保留最大值?
How to create a new variable that retains the maximum value once it is reached in a sequence?
我有这个示例数据集:
library(dplyr)
library(ggplot2)
# Example data set built from inline CSV text: one row per sampling date with
# its measured value. The series rises to 8, dips back to 7, then climbs to 8
# again. After reading, sampling_date is converted from month/day/year text
# to Date.
data.1 <-read.csv(text = "
sampling_date,value
3/4/2021,5
3/5/2021,5.181818182
3/6/2021,5.363636364
3/7/2021,5.545454545
3/8/2021,5.727272727
3/9/2021,5.909090909
3/10/2021,6.090909091
3/11/2021,6.272727273
3/12/2021,6.454545455
3/13/2021,6.636363636
3/14/2021,6.818181818
3/15/2021,7
3/16/2021,7.047619048
3/17/2021,7.095238095
3/18/2021,7.142857143
3/19/2021,7.19047619
3/20/2021,7.238095238
3/21/2021,7.285714286
3/22/2021,7.333333333
3/23/2021,7.380952381
3/24/2021,7.428571429
3/25/2021,7.476190476
3/26/2021,7.523809524
3/27/2021,7.571428571
3/28/2021,7.619047619
3/29/2021,7.666666667
3/30/2021,7.714285714
3/31/2021,7.761904762
4/1/2021,7.80952381
4/2/2021,7.857142857
4/3/2021,7.904761905
4/4/2021,7.952380952
4/5/2021,8
4/6/2021,8
4/7/2021,8
4/8/2021,7.9
4/9/2021,7.8
4/10/2021,7.7
4/11/2021,7.6
4/12/2021,7.5
4/13/2021,7.4
4/14/2021,7.3
4/15/2021,7.2
4/16/2021,7.1
4/17/2021,7
4/18/2021,7.090909091
4/19/2021,7.181818182
4/20/2021,7.272727273
4/21/2021,7.363636364
4/22/2021,7.454545455
4/23/2021,7.545454545
4/24/2021,7.636363636
4/25/2021,7.727272727
4/26/2021,7.818181818
4/27/2021,7.909090909
4/28/2021,8
4/29/2021,8
4/30/2021,8
5/1/2021,8
5/2/2021,8
") %>% mutate(sampling_date = as.Date(sampling_date, format = '%m/%d/%Y'))
该图显示它达到最大值,然后下降,然后再次增加。
# Scatter plot of the raw series: measured value against sampling date.
ggplot(data = data.1, mapping = aes(x = sampling_date, y = value)) +
  geom_point(shape = 0.5)
这对我正在研究的过程来说有些异常(一旦达到最大值就不能减少),所以我想用最大值替换达到最大值后的值。这发生在某些样本中,但不会发生在其他样本中,因此如果达到最大值并一直保持到序列结束,则不应进行任何更改。
我需要的输出是这样的:
# Desired result, built from inline CSV text: the same rows as data.1 plus a
# value2 column. value2 equals value up to the first time the series reaches
# 8 and stays at 8 for every later row, including the rows where value dips.
# After reading, sampling_date is converted from month/day/year text to Date.
data.2 <-read.csv(text = "
sampling_date,value,value2
3/4/2021,5,5
3/5/2021,5.181818182,5.181818182
3/6/2021,5.363636364,5.363636364
3/7/2021,5.545454545,5.545454545
3/8/2021,5.727272727,5.727272727
3/9/2021,5.909090909,5.909090909
3/10/2021,6.090909091,6.090909091
3/11/2021,6.272727273,6.272727273
3/12/2021,6.454545455,6.454545455
3/13/2021,6.636363636,6.636363636
3/14/2021,6.818181818,6.818181818
3/15/2021,7,7
3/16/2021,7.047619048,7.047619048
3/17/2021,7.095238095,7.095238095
3/18/2021,7.142857143,7.142857143
3/19/2021,7.19047619,7.19047619
3/20/2021,7.238095238,7.238095238
3/21/2021,7.285714286,7.285714286
3/22/2021,7.333333333,7.333333333
3/23/2021,7.380952381,7.380952381
3/24/2021,7.428571429,7.428571429
3/25/2021,7.476190476,7.476190476
3/26/2021,7.523809524,7.523809524
3/27/2021,7.571428571,7.571428571
3/28/2021,7.619047619,7.619047619
3/29/2021,7.666666667,7.666666667
3/30/2021,7.714285714,7.714285714
3/31/2021,7.761904762,7.761904762
4/1/2021,7.80952381,7.80952381
4/2/2021,7.857142857,7.857142857
4/3/2021,7.904761905,7.904761905
4/4/2021,7.952380952,7.952380952
4/5/2021,8,8
4/6/2021,8,8
4/7/2021,8,8
4/8/2021,7.9,8
4/9/2021,7.8,8
4/10/2021,7.7,8
4/11/2021,7.6,8
4/12/2021,7.5,8
4/13/2021,7.4,8
4/14/2021,7.3,8
4/15/2021,7.2,8
4/16/2021,7.1,8
4/17/2021,7,8
4/18/2021,7.090909091,8
4/19/2021,7.181818182,8
4/20/2021,7.272727273,8
4/21/2021,7.363636364,8
4/22/2021,7.454545455,8
4/23/2021,7.545454545,8
4/24/2021,7.636363636,8
4/25/2021,7.727272727,8
4/26/2021,7.818181818,8
4/27/2021,7.909090909,8
4/28/2021,8,8
4/29/2021,8,8
4/30/2021,8,8
5/1/2021,8,8
5/2/2021,8,8
") %>% mutate(sampling_date = as.Date(sampling_date, format = '%m/%d/%Y'))
两组数据的图如下(红色是我想要的序列)
# Overlay the raw series (data.1) with the desired series (data.2$value2,
# drawn as semi-transparent red circles) on the same date axis.
ggplot(data = data.1) +
  geom_point(mapping = aes(x = sampling_date, y = value), shape = 0.5) +
  geom_point(
    data = data.2,
    mapping = aes(x = sampling_date, y = value2),
    shape = 1,
    color = "red",
    alpha = 0.5
  )
这是我尝试过的方法,但它创建的新变量全部具有最大值,即 8。
# Asker's attempt (kept as-is to show the failure): the condition compares
# sampling_date, a Date vector, against which.max(data.1[,2]), which is the
# row position of the first maximum, not a date. A Date compares as its
# number of days since 1970-01-01, so every 2021 date exceeds that small
# position, the condition is TRUE on every row, and value2 becomes max(value)
# everywhere.
data.1.new.version <- data.1 %>%
mutate(value2 = if_else(sampling_date[]>which.max(data.1[,2]), max(value), value))
如有任何帮助,我们将不胜感激。
我想你要找的是基础 R 中的 cummax
(累积最大值,cumulative maximum):
# Replace each value with the running maximum seen so far, so the series can
# never drop once a peak has been reached, then plot the result by date.
data.1 %>%
  mutate(value = cummax(value)) %>%
  ggplot(mapping = aes(x = sampling_date, y = value)) +
  geom_point()
我有这个示例数据集:
library(dplyr)
library(ggplot2)
# Example data set built from inline CSV text: one row per sampling date with
# its measured value. The series rises to 8, dips back to 7, then climbs to 8
# again. After reading, sampling_date is converted from month/day/year text
# to Date.
data.1 <-read.csv(text = "
sampling_date,value
3/4/2021,5
3/5/2021,5.181818182
3/6/2021,5.363636364
3/7/2021,5.545454545
3/8/2021,5.727272727
3/9/2021,5.909090909
3/10/2021,6.090909091
3/11/2021,6.272727273
3/12/2021,6.454545455
3/13/2021,6.636363636
3/14/2021,6.818181818
3/15/2021,7
3/16/2021,7.047619048
3/17/2021,7.095238095
3/18/2021,7.142857143
3/19/2021,7.19047619
3/20/2021,7.238095238
3/21/2021,7.285714286
3/22/2021,7.333333333
3/23/2021,7.380952381
3/24/2021,7.428571429
3/25/2021,7.476190476
3/26/2021,7.523809524
3/27/2021,7.571428571
3/28/2021,7.619047619
3/29/2021,7.666666667
3/30/2021,7.714285714
3/31/2021,7.761904762
4/1/2021,7.80952381
4/2/2021,7.857142857
4/3/2021,7.904761905
4/4/2021,7.952380952
4/5/2021,8
4/6/2021,8
4/7/2021,8
4/8/2021,7.9
4/9/2021,7.8
4/10/2021,7.7
4/11/2021,7.6
4/12/2021,7.5
4/13/2021,7.4
4/14/2021,7.3
4/15/2021,7.2
4/16/2021,7.1
4/17/2021,7
4/18/2021,7.090909091
4/19/2021,7.181818182
4/20/2021,7.272727273
4/21/2021,7.363636364
4/22/2021,7.454545455
4/23/2021,7.545454545
4/24/2021,7.636363636
4/25/2021,7.727272727
4/26/2021,7.818181818
4/27/2021,7.909090909
4/28/2021,8
4/29/2021,8
4/30/2021,8
5/1/2021,8
5/2/2021,8
") %>% mutate(sampling_date = as.Date(sampling_date, format = '%m/%d/%Y'))
该图显示它达到最大值,然后下降,然后再次增加。
# Scatter plot of the raw series: measured value against sampling date.
ggplot(data = data.1, mapping = aes(x = sampling_date, y = value)) +
  geom_point(shape = 0.5)
这对我正在研究的过程来说有些异常(一旦达到最大值就不能减少),所以我想用最大值替换达到最大值后的值。这发生在某些样本中,但不会发生在其他样本中,因此如果达到最大值并一直保持到序列结束,则不应进行任何更改。
我需要的输出是这样的:
# Desired result, built from inline CSV text: the same rows as data.1 plus a
# value2 column. value2 equals value up to the first time the series reaches
# 8 and stays at 8 for every later row, including the rows where value dips.
# After reading, sampling_date is converted from month/day/year text to Date.
data.2 <-read.csv(text = "
sampling_date,value,value2
3/4/2021,5,5
3/5/2021,5.181818182,5.181818182
3/6/2021,5.363636364,5.363636364
3/7/2021,5.545454545,5.545454545
3/8/2021,5.727272727,5.727272727
3/9/2021,5.909090909,5.909090909
3/10/2021,6.090909091,6.090909091
3/11/2021,6.272727273,6.272727273
3/12/2021,6.454545455,6.454545455
3/13/2021,6.636363636,6.636363636
3/14/2021,6.818181818,6.818181818
3/15/2021,7,7
3/16/2021,7.047619048,7.047619048
3/17/2021,7.095238095,7.095238095
3/18/2021,7.142857143,7.142857143
3/19/2021,7.19047619,7.19047619
3/20/2021,7.238095238,7.238095238
3/21/2021,7.285714286,7.285714286
3/22/2021,7.333333333,7.333333333
3/23/2021,7.380952381,7.380952381
3/24/2021,7.428571429,7.428571429
3/25/2021,7.476190476,7.476190476
3/26/2021,7.523809524,7.523809524
3/27/2021,7.571428571,7.571428571
3/28/2021,7.619047619,7.619047619
3/29/2021,7.666666667,7.666666667
3/30/2021,7.714285714,7.714285714
3/31/2021,7.761904762,7.761904762
4/1/2021,7.80952381,7.80952381
4/2/2021,7.857142857,7.857142857
4/3/2021,7.904761905,7.904761905
4/4/2021,7.952380952,7.952380952
4/5/2021,8,8
4/6/2021,8,8
4/7/2021,8,8
4/8/2021,7.9,8
4/9/2021,7.8,8
4/10/2021,7.7,8
4/11/2021,7.6,8
4/12/2021,7.5,8
4/13/2021,7.4,8
4/14/2021,7.3,8
4/15/2021,7.2,8
4/16/2021,7.1,8
4/17/2021,7,8
4/18/2021,7.090909091,8
4/19/2021,7.181818182,8
4/20/2021,7.272727273,8
4/21/2021,7.363636364,8
4/22/2021,7.454545455,8
4/23/2021,7.545454545,8
4/24/2021,7.636363636,8
4/25/2021,7.727272727,8
4/26/2021,7.818181818,8
4/27/2021,7.909090909,8
4/28/2021,8,8
4/29/2021,8,8
4/30/2021,8,8
5/1/2021,8,8
5/2/2021,8,8
") %>% mutate(sampling_date = as.Date(sampling_date, format = '%m/%d/%Y'))
两组数据的图如下(红色是我想要的序列)
# Overlay the raw series (data.1) with the desired series (data.2$value2,
# drawn as semi-transparent red circles) on the same date axis.
ggplot(data = data.1) +
  geom_point(mapping = aes(x = sampling_date, y = value), shape = 0.5) +
  geom_point(
    data = data.2,
    mapping = aes(x = sampling_date, y = value2),
    shape = 1,
    color = "red",
    alpha = 0.5
  )
这是我尝试过的方法,但它创建的新变量全部具有最大值,即 8。
# Asker's attempt (kept as-is to show the failure): the condition compares
# sampling_date, a Date vector, against which.max(data.1[,2]), which is the
# row position of the first maximum, not a date. A Date compares as its
# number of days since 1970-01-01, so every 2021 date exceeds that small
# position, the condition is TRUE on every row, and value2 becomes max(value)
# everywhere.
data.1.new.version <- data.1 %>%
mutate(value2 = if_else(sampling_date[]>which.max(data.1[,2]), max(value), value))
如有任何帮助,我们将不胜感激。
我想你要找的是基础 R 中的 cummax
(累积最大值,cumulative maximum):
# Replace each value with the running maximum seen so far, so the series can
# never drop once a peak has been reached, then plot the result by date.
data.1 %>%
  mutate(value = cummax(value)) %>%
  ggplot(mapping = aes(x = sampling_date, y = value)) +
  geom_point()