R:删除不完整的数据序列
R: Delete incomplete data sequences
我已经记录了通过三个阀门进入一台测量机的三个土壤 CO2 采样点的测量值。每个阀门有三个测量值(来自每个采样点)。这台机器是太阳能供电的,所以在测量过程中有很多电力损失。当电源恢复时,阀门 1 和测量 1 再次开始测量。有时,但并非总是如此(例如第 27 行),有一排电池。巴特。巴特。标志(显示空电池)。我只想选择从 1. 到 2. valve 具有完美序列的数据。
我有这样的数据:
# Measurement Valve Value
#1 1 1 0.123
#2 2 1 0.124
#3 3 1 0.125
#4 1 2 0.126
#5 2 2 0.127
#6 3 2 0.128
#7 1 3 0.129
#8 2 3 0.13
#9 3 3 0.131
#10 batt. batt. batt.
#11 1 1 0.132
#12 2 1 0.133
#13 3 1 0.134
#14 1 2 0.135
#15 2 2 0.136
#16 3 2 0.137
#17 1 3 0.138
#18 2 3 0.139
#19 bat. bat. bat.
#20 1 1 0.141
#21 bat. bat. bat.
#22 1 1 0.141
#23 2 1 0.142
#24 3 1 0.143
#25 bat. bat. bat.
#26 1 1 0.141
#27 1 1 0.141
#28 2 1 0.142
#29 3 1 0.143
#30 1 2 0.144
#31 2 2 0.145
#32 3 2 0.146
#33 1 3 0.147
#34 2 3 0.148
#35 3 3 0.149
我想获取这样的数据:
# Measurement Valve Value
#1 1 1 0.123
#2 2 1 0.124
#3 3 1 0.125
#4 1 2 0.126
#5 2 2 0.127
#6 3 2 0.128
#7 1 1 0.132
#8 2 1 0.133
#9 3 1 0.134
#10 1 2 0.135
#11 2 2 0.136
#12 3 2 0.137
#13 1 1 0.141
#14 2 1 0.142
#15 3 1 0.143
#16 1 2 0.144
#17 2 2 0.145
#18 3 2 0.146
输入数据:
DF = structure(list(Measurement = c("1", "2", "3", "1", "2", "3",
"1", "2", "3", "batt.", "1", "2", "3", "1", "2", "3", "1", "2",
"bat.", "1", "bat.", "1", "2", "3", "bat.", "1", "1", "2", "3",
"1", "2", "3", "1", "2", "3"), Valve = c("1", "1", "1", "2",
"2", "2", "3", "3", "3", "batt.", "1", "1", "1", "2", "2", "2",
"3", "3", "bat.", "1", "bat.", "1", "1", "1", "bat.", "1", "1",
"1", "1", "2", "2", "2", "3", "3", "3"), Value = c("0.123", "0.124",
"0.125", "0.126", "0.127", "0.128", "0.129", "0.13", "0.131",
"batt.", "0.132", "0.133", "0.134", "0.135", "0.136", "0.137",
"0.138", "0.139", "bat.", "0.141", "bat.", "0.141", "0.142",
"0.143", "bat.", "0.141", "0.141", "0.142", "0.143", "0.144",
"0.145", "0.146", "0.147", "0.148", "0.149")), .Names = c("Measurement",
"Valve", "Value"), row.names = c(NA, -35L), class = "data.frame")
那么,您可以创建一个分组列:
library(data.table)
setDT(DF)
DF[, g := cumsum(grepl("bat",DF$Value))]
Measurement Valve Value g
1: 1 1 0.123 0
2: 2 1 0.124 0
3: 3 1 0.125 0
4: 1 2 0.126 0
5: 2 2 0.127 0
6: 3 2 0.128 0
7: 1 3 0.129 0
8: 2 3 0.13 0
9: 3 3 0.131 0
10: batt. batt. batt. 1
11: 1 1 0.132 1
12: 2 1 0.133 1
13: 3 1 0.134 1
14: 1 2 0.135 1
15: 2 2 0.136 1
16: 3 2 0.137 1
17: 1 3 0.138 1
18: 2 3 0.139 1
19: bat. bat. bat. 2
20: 1 1 0.141 2
21: bat. bat. bat. 3
22: 1 1 0.141 3
23: 2 1 0.142 3
24: 3 1 0.143 3
25: bat. bat. bat. 4
26: 1 1 0.141 4
27: 1 1 0.141 4
28: 2 1 0.142 4
29: 3 1 0.143 4
30: 1 2 0.144 4
31: 2 2 0.145 4
32: 3 2 0.146 4
33: 1 3 0.147 4
34: 2 3 0.148 4
35: 3 3 0.149 4
Measurement Valve Value g
然后 select 组
DF2 <- DF[, if (all(c("1","2") %in% Valve)) unique(.SD[Valve %in% c("1","2")]), by = g]
g Measurement Valve Value
1: 0 1 1 0.123
2: 0 2 1 0.124
3: 0 3 1 0.125
4: 0 1 2 0.126
5: 0 2 2 0.127
6: 0 3 2 0.128
7: 1 1 1 0.132
8: 1 2 1 0.133
9: 1 3 1 0.134
10: 1 1 2 0.135
11: 1 2 2 0.136
12: 1 3 2 0.137
13: 4 1 1 0.141
14: 4 2 1 0.142
15: 4 3 1 0.143
16: 4 1 2 0.144
17: 4 2 2 0.145
18: 4 3 2 0.146
如果第 26 行和第 27 行可能有不同的值,您可以 select 第一个 unique(.SD[Valve %in% c("1","2")], by=c("Measurement", "Valve"))
。
我已经记录了通过三个阀门进入一台测量机的三个土壤 CO2 采样点的测量值。每个阀门有三个测量值(来自每个采样点)。这台机器是太阳能供电的,所以在测量过程中有很多电力损失。当电源恢复时,阀门 1 和测量 1 再次开始测量。有时,但并非总是如此(例如第 27 行),有一排电池。巴特。巴特。标志(显示空电池)。我只想选择从 1. 到 2. valve 具有完美序列的数据。
我有这样的数据:
# Measurement Valve Value
#1 1 1 0.123
#2 2 1 0.124
#3 3 1 0.125
#4 1 2 0.126
#5 2 2 0.127
#6 3 2 0.128
#7 1 3 0.129
#8 2 3 0.13
#9 3 3 0.131
#10 batt. batt. batt.
#11 1 1 0.132
#12 2 1 0.133
#13 3 1 0.134
#14 1 2 0.135
#15 2 2 0.136
#16 3 2 0.137
#17 1 3 0.138
#18 2 3 0.139
#19 bat. bat. bat.
#20 1 1 0.141
#21 bat. bat. bat.
#22 1 1 0.141
#23 2 1 0.142
#24 3 1 0.143
#25 bat. bat. bat.
#26 1 1 0.141
#27 1 1 0.141
#28 2 1 0.142
#29 3 1 0.143
#30 1 2 0.144
#31 2 2 0.145
#32 3 2 0.146
#33 1 3 0.147
#34 2 3 0.148
#35 3 3 0.149
我想获取这样的数据:
# Measurement Valve Value
#1 1 1 0.123
#2 2 1 0.124
#3 3 1 0.125
#4 1 2 0.126
#5 2 2 0.127
#6 3 2 0.128
#7 1 1 0.132
#8 2 1 0.133
#9 3 1 0.134
#10 1 2 0.135
#11 2 2 0.136
#12 3 2 0.137
#13 1 1 0.141
#14 2 1 0.142
#15 3 1 0.143
#16 1 2 0.144
#17 2 2 0.145
#18 3 2 0.146
输入数据:
DF = structure(list(Measurement = c("1", "2", "3", "1", "2", "3",
"1", "2", "3", "batt.", "1", "2", "3", "1", "2", "3", "1", "2",
"bat.", "1", "bat.", "1", "2", "3", "bat.", "1", "1", "2", "3",
"1", "2", "3", "1", "2", "3"), Valve = c("1", "1", "1", "2",
"2", "2", "3", "3", "3", "batt.", "1", "1", "1", "2", "2", "2",
"3", "3", "bat.", "1", "bat.", "1", "1", "1", "bat.", "1", "1",
"1", "1", "2", "2", "2", "3", "3", "3"), Value = c("0.123", "0.124",
"0.125", "0.126", "0.127", "0.128", "0.129", "0.13", "0.131",
"batt.", "0.132", "0.133", "0.134", "0.135", "0.136", "0.137",
"0.138", "0.139", "bat.", "0.141", "bat.", "0.141", "0.142",
"0.143", "bat.", "0.141", "0.141", "0.142", "0.143", "0.144",
"0.145", "0.146", "0.147", "0.148", "0.149")), .Names = c("Measurement",
"Valve", "Value"), row.names = c(NA, -35L), class = "data.frame")
那么,您可以创建一个分组列:
library(data.table)
setDT(DF)
DF[, g := cumsum(grepl("bat",DF$Value))]
Measurement Valve Value g
1: 1 1 0.123 0
2: 2 1 0.124 0
3: 3 1 0.125 0
4: 1 2 0.126 0
5: 2 2 0.127 0
6: 3 2 0.128 0
7: 1 3 0.129 0
8: 2 3 0.13 0
9: 3 3 0.131 0
10: batt. batt. batt. 1
11: 1 1 0.132 1
12: 2 1 0.133 1
13: 3 1 0.134 1
14: 1 2 0.135 1
15: 2 2 0.136 1
16: 3 2 0.137 1
17: 1 3 0.138 1
18: 2 3 0.139 1
19: bat. bat. bat. 2
20: 1 1 0.141 2
21: bat. bat. bat. 3
22: 1 1 0.141 3
23: 2 1 0.142 3
24: 3 1 0.143 3
25: bat. bat. bat. 4
26: 1 1 0.141 4
27: 1 1 0.141 4
28: 2 1 0.142 4
29: 3 1 0.143 4
30: 1 2 0.144 4
31: 2 2 0.145 4
32: 3 2 0.146 4
33: 1 3 0.147 4
34: 2 3 0.148 4
35: 3 3 0.149 4
Measurement Valve Value g
然后 select 组
DF2 <- DF[, if (all(c("1","2") %in% Valve)) unique(.SD[Valve %in% c("1","2")]), by = g]
g Measurement Valve Value
1: 0 1 1 0.123
2: 0 2 1 0.124
3: 0 3 1 0.125
4: 0 1 2 0.126
5: 0 2 2 0.127
6: 0 3 2 0.128
7: 1 1 1 0.132
8: 1 2 1 0.133
9: 1 3 1 0.134
10: 1 1 2 0.135
11: 1 2 2 0.136
12: 1 3 2 0.137
13: 4 1 1 0.141
14: 4 2 1 0.142
15: 4 3 1 0.143
16: 4 1 2 0.144
17: 4 2 2 0.145
18: 4 3 2 0.146
如果第 26 行和第 27 行可能有不同的值,您可以 select 第一个 unique(.SD[Valve %in% c("1","2")], by=c("Measurement", "Valve"))
。