如何从两个特定值之间的某个范围内计算平均值?
How to count average from a certain range between two specific values?
我有一个数据框列表,想从两个值之间的某个范围内计算平均值和标准差。例如:
t value Z
0 27.21666 0.473294 <NA>
1 27.31775 0.484845 {"type":"M","msg":"FIRST","time":27.2498,"dist":0.410454}
2 27.41881 0.457187 <NA>
3 27.51916 0.429909 <NA>
4 27.62018 0.401957 <NA>
5 27.72040 0.373661 <NA>
6 27.82057 0.346553 <NA>
7 27.92117 0.319149 <NA>
8 28.02310 0.291235 <NA>
9 28.12421 0.264531 <NA>
10 28.22576 0.237448 <NA>
11 28.32797 0.210861 <NA>
12 28.42993 0.183152 <NA>
13 28.53069 0.156354 <NA>
14 28.63218 0.128166 <NA>
15 28.73285 0.100545 <NA>
16 28.83326 0.072625 <NA>
17 28.93670 0.042560 <NA>
18 29.03824 0.012583 <NA>
19 29.13856 -0.018004 <NA>
20 29.23870 -0.049970 <NA>
21 29.33917 -0.081763 <NA>
22 29.44034 -0.113793 <NA>
23 29.54193 -0.148269 <NA>
24 29.64338 -0.182154 <NA>
25 29.74523 -0.216842 <NA>
26 29.84533 -0.252688 <NA>
27 29.94582 -0.289105 <NA>
28 30.04596 -0.325591 <NA>
29 30.14973 -0.365179 <NA>
30 30.24985 -0.403439 <NA>
31 30.35072 -0.444497 <NA>
32 30.45133 -0.485201 <NA>
33 30.55292 -0.528480 <NA>
34 30.65318 -0.572731 <NA>
35 30.75435 -0.619512 <NA>
36 30.85619 -0.667557 <NA>
37 30.95747 -0.717658 <NA>
38 31.06031 -0.769489 <NA>
39 31.16094 -0.823372 <NA>
40 31.26099 -0.876394 {"type":"M","msg":"LAST0","time":31.2458,"dist":0.521487}
41 31.36135 -0.932901 <NA>
42 31.46322 -0.991948 <NA>
43 31.56358 -1.049800 <NA>
44 31.66395 -1.109068 <NA>
45 31.76431 -1.166309 <NA>
46 31.86441 -1.219613 <NA>
47 31.96495 -1.266242 <NA>
48 32.06650 -1.304891 <NA>
49 32.16776 -1.331460 <NA>
50 32.26796 -1.344302 <NA>
我想从起点(= Z 列的值为“FIRST”的点)和端点( = Z 列中包含“LAST0”值的点)。我怎么能在 R 中做到这一点?另外,我怎么能简单地 delete/filter 输出 Z 列中值“FIRST”之前和值“LAST0”之后带有“NA”的所有行?
作为奖励,如果在一个数据框中引入了多个相似的值“FIRST”和“LAST0”序列,我如何才能对所有不同的实例执行此操作?
编辑:在下面添加了一个 dput()
> dput (tbl[[1]])
structure(list(t = c(27.216666, 27.317755, 27.418812, 27.519156, 27.620184,
27.720402, 27.820574, 27.921171, 28.023102, 28.124214, 28.225763,
28.327967, 28.429934, 28.530693, 28.632181, 28.732851, 28.83326,
28.936697, 29.038239, 29.138559, 29.238697, 29.339165, 29.440336,
29.541931, 29.643377, 29.745228, 29.845327, 29.945824, 30.045958,
30.149729, 30.249853, 30.350716, 30.451326, 30.552917, 30.653179,
30.754349, 30.856194, 30.957468, 31.06031, 31.16094, 31.260986,
31.361355, 31.463219, 31.563583, 31.663948, 31.764309, 31.864408,
31.964945, 32.066498, 32.167763, 32.267956, 32.368561, 32.469727,
32.569916, 32.669949, 32.773418, 32.874977, 32.976883, 33.078552,
33.181709, 33.282593, 33.385487, 33.486103, 33.588486, 33.690254,
33.793388, 33.893703, 33.993759), value = c(0.473294, 0.484845, 0.457187,
0.429909, 0.401957, 0.373661, 0.346553, 0.319149, 0.291235, 0.264531,
0.237448, 0.210861, 0.183152, 0.156354, 0.128166, 0.100545, 0.072625,
0.04256, 0.012583, -0.018004, -0.04997, -0.081763, -0.113793,
-0.148269, -0.182154, -0.216842, -0.252688, -0.289105, -0.325591,
-0.365179, -0.403439, -0.444497, -0.485201, -0.52848, -0.572731,
-0.619512, -0.667557, -0.717658, -0.769489, -0.823372, -0.876394,
-0.932901, -0.991948, -1.0498, -1.109068, -1.166309, -1.219613,
-1.266242, -1.304891, -1.33146, -1.344302, -1.343445, -1.328416,
-1.298611, -1.253834, -1.193182, -1.119025, -1.030697, -0.929913,
-0.817493, -0.698029, -0.56889, -0.437583, -0.301298, -0.165372,
-0.028239, 0.102254, 0.22927), Z = c(NA, "{\"type\":\"M\",\"msg\":\"FIRST\",\"time\":27.2498,\"dist\":0.410454}",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, "{\"type\":\"M\",\"msg\":\"LAST0\",\"time\":31.2458,\"dist\":0.521487}",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-68L), class = c("tbl_df", "tbl", "data.frame"))
考虑这个数据框:
t value Z
0 27.21666 0.473294 <NA>
1 27.31775 0.484845 {msg:FIRST}
2 27.41881 0.457187 <NA>
3 27.51916 0.429909 <NA>
4 27.62018 0.401957 <NA>
5 27.72040 0.373661 <NA>
6 27.82057 0.346553 <NA>
7 27.92117 0.319149 {msg:LAST0}
32 30.45133 -0.485201 <NA>
33 30.55292 -0.528480 <NA>
34 30.65318 -0.572731 <NA>
35 30.75435 -0.619512 <NA>
36 30.85619 -0.667557 {msg:FIRST}
37 30.95747 -0.717658 <NA>
38 31.06031 -0.769489 <NA>
39 31.16094 -0.823372 <NA>
40 31.26099 -0.876394 {msg:LAST0}
41 31.36135 -0.932901 <NA>
50 32.26796 -1.344302 <NA>
使用 tidyverse 函数,您可以先使用 cumsum 识别 FIRST 和 LAST0 之间的组,然后过滤掉不在这两个值之间的行,然后使用 mutate 计算均值和标准差。
library(dplyr)
df %>%
mutate(cum = cumsum(grepl("FIRST", Z) + lag(grepl("LAST0", Z), default = 0))) %>%
filter(cum %% 2 == 1) %>%
group_by(cum = as.numeric(as.factor(cum))) %>%
mutate(mean = mean(value),
sd = sd(value))
# A tibble: 12 x 6
# Groups: cum [2]
t value Z cum mean sd
<dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 27.3 0.485 {msg:FIRST} 1 0.402 0.0598
2 27.4 0.457 <NA> 1 0.402 0.0598
3 27.5 0.430 <NA> 1 0.402 0.0598
4 27.6 0.402 <NA> 1 0.402 0.0598
5 27.7 0.374 <NA> 1 0.402 0.0598
6 27.8 0.347 <NA> 1 0.402 0.0598
7 27.9 0.319 {msg:LAST0} 1 0.402 0.0598
8 30.9 -0.668 {msg:FIRST} 2 -0.771 0.0828
9 31.0 -0.718 <NA> 2 -0.771 0.0828
10 31.1 -0.769 <NA> 2 -0.771 0.0828
11 31.2 -0.823 <NA> 2 -0.771 0.0828
12 31.3 -0.876 {msg:LAST0} 2 -0.771 0.0828
数据
structure(list(t = c(27.21666, 27.31775, 27.41881, 27.51916,
27.62018, 27.7204, 27.82057, 27.92117, 30.45133, 30.55292, 30.65318,
30.75435, 30.85619, 30.95747, 31.06031, 31.16094, 31.26099, 31.36135,
32.26796), value = c(0.473294, 0.484845, 0.457187, 0.429909,
0.401957, 0.373661, 0.346553, 0.319149, -0.485201, -0.52848,
-0.572731, -0.619512, -0.667557, -0.717658, -0.769489, -0.823372,
-0.876394, -0.932901, -1.344302), Z = c("<NA>", "{msg:FIRST}",
"<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "{msg:LAST0}", "<NA>",
"<NA>", "<NA>", "<NA>", "{msg:FIRST}", "<NA>", "<NA>", "<NA>",
"{msg:LAST0}", "<NA>", "<NA>")), class = "data.frame", row.names = c("0",
"1", "2", "3", "4", "5", "6", "7", "32", "33", "34", "35", "36",
"37", "38", "39", "40", "41", "50"))
我有一个数据框列表,想从两个值之间的某个范围内计算平均值和标准差。例如:
t value Z
0 27.21666 0.473294 <NA>
1 27.31775 0.484845 {"type":"M","msg":"FIRST","time":27.2498,"dist":0.410454}
2 27.41881 0.457187 <NA>
3 27.51916 0.429909 <NA>
4 27.62018 0.401957 <NA>
5 27.72040 0.373661 <NA>
6 27.82057 0.346553 <NA>
7 27.92117 0.319149 <NA>
8 28.02310 0.291235 <NA>
9 28.12421 0.264531 <NA>
10 28.22576 0.237448 <NA>
11 28.32797 0.210861 <NA>
12 28.42993 0.183152 <NA>
13 28.53069 0.156354 <NA>
14 28.63218 0.128166 <NA>
15 28.73285 0.100545 <NA>
16 28.83326 0.072625 <NA>
17 28.93670 0.042560 <NA>
18 29.03824 0.012583 <NA>
19 29.13856 -0.018004 <NA>
20 29.23870 -0.049970 <NA>
21 29.33917 -0.081763 <NA>
22 29.44034 -0.113793 <NA>
23 29.54193 -0.148269 <NA>
24 29.64338 -0.182154 <NA>
25 29.74523 -0.216842 <NA>
26 29.84533 -0.252688 <NA>
27 29.94582 -0.289105 <NA>
28 30.04596 -0.325591 <NA>
29 30.14973 -0.365179 <NA>
30 30.24985 -0.403439 <NA>
31 30.35072 -0.444497 <NA>
32 30.45133 -0.485201 <NA>
33 30.55292 -0.528480 <NA>
34 30.65318 -0.572731 <NA>
35 30.75435 -0.619512 <NA>
36 30.85619 -0.667557 <NA>
37 30.95747 -0.717658 <NA>
38 31.06031 -0.769489 <NA>
39 31.16094 -0.823372 <NA>
40 31.26099 -0.876394 {"type":"M","msg":"LAST0","time":31.2458,"dist":0.521487}
41 31.36135 -0.932901 <NA>
42 31.46322 -0.991948 <NA>
43 31.56358 -1.049800 <NA>
44 31.66395 -1.109068 <NA>
45 31.76431 -1.166309 <NA>
46 31.86441 -1.219613 <NA>
47 31.96495 -1.266242 <NA>
48 32.06650 -1.304891 <NA>
49 32.16776 -1.331460 <NA>
50 32.26796 -1.344302 <NA>
我想从起点(= Z 列的值为“FIRST”的点)和端点( = Z 列中包含“LAST0”值的点)。我怎么能在 R 中做到这一点?另外,我怎么能简单地 delete/filter 输出 Z 列中值“FIRST”之前和值“LAST0”之后带有“NA”的所有行?
作为奖励,如果在一个数据框中引入了多个相似的值“FIRST”和“LAST0”序列,我如何才能对所有不同的实例执行此操作?
编辑:在下面添加了一个 dput()
> dput (tbl[[1]])
structure(list(t = c(27.216666, 27.317755, 27.418812, 27.519156, 27.620184,
27.720402, 27.820574, 27.921171, 28.023102, 28.124214, 28.225763,
28.327967, 28.429934, 28.530693, 28.632181, 28.732851, 28.83326,
28.936697, 29.038239, 29.138559, 29.238697, 29.339165, 29.440336,
29.541931, 29.643377, 29.745228, 29.845327, 29.945824, 30.045958,
30.149729, 30.249853, 30.350716, 30.451326, 30.552917, 30.653179,
30.754349, 30.856194, 30.957468, 31.06031, 31.16094, 31.260986,
31.361355, 31.463219, 31.563583, 31.663948, 31.764309, 31.864408,
31.964945, 32.066498, 32.167763, 32.267956, 32.368561, 32.469727,
32.569916, 32.669949, 32.773418, 32.874977, 32.976883, 33.078552,
33.181709, 33.282593, 33.385487, 33.486103, 33.588486, 33.690254,
33.793388, 33.893703, 33.993759), value = c(0.473294, 0.484845, 0.457187,
0.429909, 0.401957, 0.373661, 0.346553, 0.319149, 0.291235, 0.264531,
0.237448, 0.210861, 0.183152, 0.156354, 0.128166, 0.100545, 0.072625,
0.04256, 0.012583, -0.018004, -0.04997, -0.081763, -0.113793,
-0.148269, -0.182154, -0.216842, -0.252688, -0.289105, -0.325591,
-0.365179, -0.403439, -0.444497, -0.485201, -0.52848, -0.572731,
-0.619512, -0.667557, -0.717658, -0.769489, -0.823372, -0.876394,
-0.932901, -0.991948, -1.0498, -1.109068, -1.166309, -1.219613,
-1.266242, -1.304891, -1.33146, -1.344302, -1.343445, -1.328416,
-1.298611, -1.253834, -1.193182, -1.119025, -1.030697, -0.929913,
-0.817493, -0.698029, -0.56889, -0.437583, -0.301298, -0.165372,
-0.028239, 0.102254, 0.22927), Z = c(NA, "{\"type\":\"M\",\"msg\":\"FIRST\",\"time\":27.2498,\"dist\":0.410454}",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, "{\"type\":\"M\",\"msg\":\"LAST0\",\"time\":31.2458,\"dist\":0.521487}",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-68L), class = c("tbl_df", "tbl", "data.frame"))
考虑这个数据框:
t value Z
0 27.21666 0.473294 <NA>
1 27.31775 0.484845 {msg:FIRST}
2 27.41881 0.457187 <NA>
3 27.51916 0.429909 <NA>
4 27.62018 0.401957 <NA>
5 27.72040 0.373661 <NA>
6 27.82057 0.346553 <NA>
7 27.92117 0.319149 {msg:LAST0}
32 30.45133 -0.485201 <NA>
33 30.55292 -0.528480 <NA>
34 30.65318 -0.572731 <NA>
35 30.75435 -0.619512 <NA>
36 30.85619 -0.667557 {msg:FIRST}
37 30.95747 -0.717658 <NA>
38 31.06031 -0.769489 <NA>
39 31.16094 -0.823372 <NA>
40 31.26099 -0.876394 {msg:LAST0}
41 31.36135 -0.932901 <NA>
50 32.26796 -1.344302 <NA>
使用 tidyverse 函数,您可以先使用 cumsum 识别 FIRST 和 LAST0 之间的组,然后过滤掉不在这两个值之间的行,然后使用 mutate 计算均值和标准差。
library(dplyr)
df %>%
mutate(cum = cumsum(grepl("FIRST", Z) + lag(grepl("LAST0", Z), default = 0))) %>%
filter(cum %% 2 == 1) %>%
group_by(cum = as.numeric(as.factor(cum))) %>%
mutate(mean = mean(value),
sd = sd(value))
# A tibble: 12 x 6
# Groups: cum [2]
t value Z cum mean sd
<dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 27.3 0.485 {msg:FIRST} 1 0.402 0.0598
2 27.4 0.457 <NA> 1 0.402 0.0598
3 27.5 0.430 <NA> 1 0.402 0.0598
4 27.6 0.402 <NA> 1 0.402 0.0598
5 27.7 0.374 <NA> 1 0.402 0.0598
6 27.8 0.347 <NA> 1 0.402 0.0598
7 27.9 0.319 {msg:LAST0} 1 0.402 0.0598
8 30.9 -0.668 {msg:FIRST} 2 -0.771 0.0828
9 31.0 -0.718 <NA> 2 -0.771 0.0828
10 31.1 -0.769 <NA> 2 -0.771 0.0828
11 31.2 -0.823 <NA> 2 -0.771 0.0828
12 31.3 -0.876 {msg:LAST0} 2 -0.771 0.0828
数据
structure(list(t = c(27.21666, 27.31775, 27.41881, 27.51916,
27.62018, 27.7204, 27.82057, 27.92117, 30.45133, 30.55292, 30.65318,
30.75435, 30.85619, 30.95747, 31.06031, 31.16094, 31.26099, 31.36135,
32.26796), value = c(0.473294, 0.484845, 0.457187, 0.429909,
0.401957, 0.373661, 0.346553, 0.319149, -0.485201, -0.52848,
-0.572731, -0.619512, -0.667557, -0.717658, -0.769489, -0.823372,
-0.876394, -0.932901, -1.344302), Z = c("<NA>", "{msg:FIRST}",
"<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "{msg:LAST0}", "<NA>",
"<NA>", "<NA>", "<NA>", "{msg:FIRST}", "<NA>", "<NA>", "<NA>",
"{msg:LAST0}", "<NA>", "<NA>")), class = "data.frame", row.names = c("0",
"1", "2", "3", "4", "5", "6", "7", "32", "33", "34", "35", "36",
"37", "38", "39", "40", "41", "50"))