根据不同的值范围和 NA,使用嵌套的 ifelse 创建序列将给出错误的结果
Creating a Sequence with nested ifelse, depending on different Value ranges and NAs, will give wrong result
我有一个这样的数据框:
time Value Seq.Count
1 0 0
2 0 0
3 3 0
4 4 0
5 10 0
6 10 0
7 10 0
8 7 0
9 6 0
10 NA 0
11 NA 0
12 NA 0
13 0 0
14 0 0
15 0 0
现在我希望每当 "Value" 列中的数字 X 从以下某一类别变为另一类别时,"Seq.Count" 列就加一:
X == 0, 0 < X < 10, X == 10, X 为 NA
所以我想最后得到类似下面的东西:
time Value Seq.Count
1 0 0
2 0 0
3 3 1
4 4 1
5 10 2
6 10 2
7 10 2
8 7 3
9 6 3
10 NA 4
11 NA 4
12 NA 4
13 0 5
14 0 5
15 0 5
我写了这段代码:
# Walk the rows from the second one on and derive Seq.Count[i] from the
# category of Value[i] compared with the category of Value[i-1].
# NOTE: when Value[i] is NA, the very first test `df$Value[i] == 10` is NA,
# and ifelse() returns NA for an NA test. The is.na() branch further down is
# therefore never reached, and the NA is carried forward through
# Seq.Count[i-1] on every later row.
for (i in 2:nrow(df)) {
# Current value is 10: count up only if the previous value was not 10
df$Seq.Count[i] <- ifelse(df$Value[i] == 10,
ifelse(df$Value[(i-1)] != 10, df$Seq.Count[i-1]+1, df$Seq.Count[i-1]),
# Current value is 0: count up only if the previous value was not 0
ifelse(df$Value[i] == 0,
ifelse(df$Value[(i-1)] != 0, df$Seq.Count[i-1]+1, df$Seq.Count[i-1]),
# Current value lies in [0.01, 9.99]; between() is not base R
# (presumably dplyr's or data.table's -- confirm which one is attached)
ifelse(between(df$Value[i], 0.01, 9.99),
ifelse(df$Value[i-1] == 0 | df$Value[i-1] == 10 | is.na(df$Value[i-1]),
df$Seq.Count[i-1]+1,df$Seq.Count[i-1]),
# Current value is NA: count up only if the previous value was not NA
ifelse(is.na(df$Value[i]),
ifelse(!is.na(df$Value[i-1]), df$Seq.Count[i-1]+1, df$Seq.Count[i-1]),
# Fallback: keep the previous count
df$Seq.Count[i-1]
)
)
)
)
}
现在这将给我以下内容:
time Value Seq.Count
1 0 0
2 0 0
3 3 1
4 4 1
5 10 2
6 10 2
7 10 2
8 7 3
9 6 3
10 NA NA
11 NA NA
12 NA NA
13 0 NA
14 0 NA
15 0 NA
在 "Value" 列中出现第一个 NA 后,"Seq.Count" 列的所有后续值都将是 NA
这是为什么?
根据代码中的这一行:
ifelse(is.na(df$Value[i]),
ifelse(!is.na(df$Value[i-1]), df$Seq.Count[i-1]+1, df$Seq.Count[i-1]), ...
当 is.na(df$Value[i]) 且 !is.na(df$Value[i-1]) 时,它应该只是取 Seq.Count[i-1] 的值然后加 1。
为什么这不起作用?
感谢您的帮助。
我想你需要这样的东西:使用 ifelse,并创建一个额外的列来保存前一个值以便比较
# library() fails loudly when data.table is missing; require() only warns
library(data.table)

test <- data.table(time = 1:15,
                   Value = c(0, 0, 3, 4, 10, 10, 10, 7, 6, NA, NA, NA, 0, 0, 0))

# Map each value to its category:
#   1 = zero, 2 = strictly between 0 and 10, 3 = ten, 4 = NA.
# Values outside [0, 10] get NA as their category.
value_group <- function(x) {
  ifelse(is.na(x), 4,
         ifelse(x == 0, 1,
                ifelse(x > 0 & x < 10, 2,
                       ifelse(x == 10, 3, NA))))
}

# Add a column with the previous value; shift() pads the first row with NA
# and, unlike Value[1:(nrow(test) - 1)], also works for a one-row table
test[, previous_value := shift(Value)]
# Check which group the previous value belongs to
test[, group_1 := value_group(previous_value)]
# Check which group the current value belongs to
test[, group_2 := value_group(Value)]
# Add 1 whenever the group changes. The first row has no predecessor, so it
# never counts as a change -- this keeps the count starting at 0 even when
# the series begins with NA (cumsum(...) - 1 would start at -1 there).
test[, Seq.count := cumsum((seq_len(.N) > 1L) & (group_1 != group_2))]
test
time Value previous_value group_1 group_2 Seq.count
1: 1 0 NA 4 1 0
2: 2 0 0 1 1 0
3: 3 3 0 1 2 1
4: 4 4 3 2 2 1
5: 5 10 4 2 3 2
6: 6 10 10 3 3 2
7: 7 10 10 3 3 2
8: 8 7 10 3 2 3
9: 9 6 7 2 2 3
10: 10 NA 6 2 4 4
11: 11 NA NA 4 4 4
12: 12 NA NA 4 4 4
13: 13 0 NA 4 1 5
14: 14 0 0 1 1 5
15: 15 0 0 1 1 5
这个解决方案怎么样?
# Classify every value exactly as the question asks:
#   0 = zero, 1 = strictly between 0 and 10, 2 = ten,
#   3 = NA (values outside [0, 10] are treated like NA).
# Binning with cut(breaks = c(0, 1, 9, 10)) would put 1 in the same class
# as 0 and anything in (9, 10) in the same class as 10.
value <- df$Value
tmp <- (value > 0) + (value >= 10)
tmp[is.na(value) | value < 0 | value > 10] <- 3
# A new sequence starts wherever the class differs from the previous row.
# The first row never counts as a change; the subsetting keeps the result
# the same length as the input even when it is empty.
Seq.Count <- cumsum(c(FALSE, diff(tmp) != 0)[seq_along(tmp)])
cbind(df[, -3], Seq.Count)
time Value Seq.Count
1 1 0 0
2 2 0 0
3 3 3 1
4 4 4 1
5 5 10 2
6 6 10 2
7 7 10 2
8 8 7 3
9 9 6 3
10 10 NA 4
11 11 NA 4
12 12 NA 4
13 13 0 5
14 14 0 5
15 15 0 5
详细说明 Marco Sandri 的回答
据我了解,您的取值可以分为几类,例如:
- x == 0,被区间 [0,0.9] 覆盖
- 1 <= x <= 9,被区间 (0.9,9] 覆盖
- x == 10,被区间 (9,10] 覆盖
- x 是 NA,即 NA 本身
解决您问题的代码是:
# Bin the values: [0,0.9] holds 0, (0.9,9] holds 1..9 and (9,10] holds 10
# (this binning assumes whole-number values).
w <- cut(df$Value, breaks = c(0, 0.9, 9, 10), include.lowest = TRUE)
# Turn NA into a level of its own so it can be compared like any other class
w1 <- addNA(w)
# TRUE where the class differs from the previous row. dplyr::lag() is named
# explicitly: a bare lag() resolves to stats::lag() unless dplyr is attached,
# and stats::lag() does not shift the elements.
r <- w1 != dplyr::lag(w1)
# The first row has no predecessor, so it is not a change
r[1] <- FALSE
# Running count of the changes
df$Seq.Count <- Reduce("+", r, accumulate = TRUE)
分步说明
第一步是根据上述类别对数据进行分类,cut() 函数可以做到这一点:
(w <- cut(df$Value,breaks=c(0,0.9,9,10),include.lowest=T))
[1] [0,0.9] [0,0.9] (0.9,9] (0.9,9] (9,10] (9,10] (9,10] (0.9,9] (0.9,9] <NA> <NA> <NA> [0,0.9] [0,0.9] [0,0.9]
Levels: [0,0.9] (0.9,9] (9,10]
您需要将 NA 作为结果的级别(类别)。这是通过addNA()完成的,如下:
(w1 <- addNA(w))
[1] [0,0.9] [0,0.9] (0.9,9] (0.9,9] (9,10] (9,10] (9,10] (0.9,9] (0.9,9] <NA> <NA> <NA> [0,0.9] [0,0.9] [0,0.9]
Levels: [0,0.9] (0.9,9] (9,10] <NA>
然后您需要知道该系列的 当前 元素与上一个 元素之间的类别是否发生了变化。为此,您需要生成该系列的 lag() 并将其与原始系列进行比较:
(r <- w1 != lag(w1))
[1] NA FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
# Change the first element to FALSE
r[1] <- F
r
[1] FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
请注意结果 r 的第一个元素是 NA。由于该序列的第一个元素没有可比较的对象(因为没有前一个元素),这种情况的结果应该是 FALSE。这就是将 r[1] 设置为 FALSE 的原因。
现在您需要对结果中的 TRUE 值求和,并保留累积的结果。这是通过 Reduce() 完成的:
(df$Seq.Count <- Reduce('+', r, accumulate = T))
[1] 0 0 1 1 2 2 2 3 3 4 4 4 5 5 5
编辑: 你也可以使用 cumsum() 而不是 Reduce()
我有一个这样的数据框:
time Value Seq.Count
1 0 0
2 0 0
3 3 0
4 4 0
5 10 0
6 10 0
7 10 0
8 7 0
9 6 0
10 NA 0
11 NA 0
12 NA 0
13 0 0
14 0 0
15 0 0
现在我希望每当 "Value" 列中的数字 X 从以下某一类别变为另一类别时,"Seq.Count" 列就加一:
X == 0, 0 < X < 10, X == 10, X 为 NA
所以我想最后得到类似下面的东西:
time Value Seq.Count
1 0 0
2 0 0
3 3 1
4 4 1
5 10 2
6 10 2
7 10 2
8 7 3
9 6 3
10 NA 4
11 NA 4
12 NA 4
13 0 5
14 0 5
15 0 5
我写了这段代码:
# Walk the rows from the second one on and derive Seq.Count[i] from the
# category of Value[i] compared with the category of Value[i-1].
# NOTE: when Value[i] is NA, the very first test `df$Value[i] == 10` is NA,
# and ifelse() returns NA for an NA test. The is.na() branch further down is
# therefore never reached, and the NA is carried forward through
# Seq.Count[i-1] on every later row.
for (i in 2:nrow(df)) {
# Current value is 10: count up only if the previous value was not 10
df$Seq.Count[i] <- ifelse(df$Value[i] == 10,
ifelse(df$Value[(i-1)] != 10, df$Seq.Count[i-1]+1, df$Seq.Count[i-1]),
# Current value is 0: count up only if the previous value was not 0
ifelse(df$Value[i] == 0,
ifelse(df$Value[(i-1)] != 0, df$Seq.Count[i-1]+1, df$Seq.Count[i-1]),
# Current value lies in [0.01, 9.99]; between() is not base R
# (presumably dplyr's or data.table's -- confirm which one is attached)
ifelse(between(df$Value[i], 0.01, 9.99),
ifelse(df$Value[i-1] == 0 | df$Value[i-1] == 10 | is.na(df$Value[i-1]),
df$Seq.Count[i-1]+1,df$Seq.Count[i-1]),
# Current value is NA: count up only if the previous value was not NA
ifelse(is.na(df$Value[i]),
ifelse(!is.na(df$Value[i-1]), df$Seq.Count[i-1]+1, df$Seq.Count[i-1]),
# Fallback: keep the previous count
df$Seq.Count[i-1]
)
)
)
)
}
现在这将给我以下内容:
time Value Seq.Count
1 0 0
2 0 0
3 3 1
4 4 1
5 10 2
6 10 2
7 10 2
8 7 3
9 6 3
10 NA NA
11 NA NA
12 NA NA
13 0 NA
14 0 NA
15 0 NA
在 "Value" 列中出现第一个 NA 后,"Seq.Count" 列的所有后续值都将是 NA
这是为什么?
根据代码中的这一行:
ifelse(is.na(df$Value[i]),
ifelse(!is.na(df$Value[i-1]), df$Seq.Count[i-1]+1, df$Seq.Count[i-1]), ...
当 is.na(df$Value[i]) 且 !is.na(df$Value[i-1]) 时,它应该只是取 Seq.Count[i-1] 的值然后加 1。
为什么这不起作用?
感谢您的帮助。
我想你需要这样的东西:使用 ifelse,并创建一个额外的列来保存前一个值以便比较
# library() fails loudly when data.table is missing; require() only warns
library(data.table)

test <- data.table(time = 1:15,
                   Value = c(0, 0, 3, 4, 10, 10, 10, 7, 6, NA, NA, NA, 0, 0, 0))

# Map each value to its category:
#   1 = zero, 2 = strictly between 0 and 10, 3 = ten, 4 = NA.
# Values outside [0, 10] get NA as their category.
value_group <- function(x) {
  ifelse(is.na(x), 4,
         ifelse(x == 0, 1,
                ifelse(x > 0 & x < 10, 2,
                       ifelse(x == 10, 3, NA))))
}

# Add a column with the previous value; shift() pads the first row with NA
# and, unlike Value[1:(nrow(test) - 1)], also works for a one-row table
test[, previous_value := shift(Value)]
# Check which group the previous value belongs to
test[, group_1 := value_group(previous_value)]
# Check which group the current value belongs to
test[, group_2 := value_group(Value)]
# Add 1 whenever the group changes. The first row has no predecessor, so it
# never counts as a change -- this keeps the count starting at 0 even when
# the series begins with NA (cumsum(...) - 1 would start at -1 there).
test[, Seq.count := cumsum((seq_len(.N) > 1L) & (group_1 != group_2))]
test
time Value previous_value group_1 group_2 Seq.count
1: 1 0 NA 4 1 0
2: 2 0 0 1 1 0
3: 3 3 0 1 2 1
4: 4 4 3 2 2 1
5: 5 10 4 2 3 2
6: 6 10 10 3 3 2
7: 7 10 10 3 3 2
8: 8 7 10 3 2 3
9: 9 6 7 2 2 3
10: 10 NA 6 2 4 4
11: 11 NA NA 4 4 4
12: 12 NA NA 4 4 4
13: 13 0 NA 4 1 5
14: 14 0 0 1 1 5
15: 15 0 0 1 1 5
这个解决方案怎么样?
# Classify every value exactly as the question asks:
#   0 = zero, 1 = strictly between 0 and 10, 2 = ten,
#   3 = NA (values outside [0, 10] are treated like NA).
# Binning with cut(breaks = c(0, 1, 9, 10)) would put 1 in the same class
# as 0 and anything in (9, 10) in the same class as 10.
value <- df$Value
tmp <- (value > 0) + (value >= 10)
tmp[is.na(value) | value < 0 | value > 10] <- 3
# A new sequence starts wherever the class differs from the previous row.
# The first row never counts as a change; the subsetting keeps the result
# the same length as the input even when it is empty.
Seq.Count <- cumsum(c(FALSE, diff(tmp) != 0)[seq_along(tmp)])
cbind(df[, -3], Seq.Count)
time Value Seq.Count
1 1 0 0
2 2 0 0
3 3 3 1
4 4 4 1
5 5 10 2
6 6 10 2
7 7 10 2
8 8 7 3
9 9 6 3
10 10 NA 4
11 11 NA 4
12 12 NA 4
13 13 0 5
14 14 0 5
15 15 0 5
详细说明 Marco Sandri 的回答
据我了解,您的取值可以分为几类,例如:
- x == 0,被区间 [0,0.9] 覆盖
- 1 <= x <= 9,被区间 (0.9,9] 覆盖
- x == 10,被区间 (9,10] 覆盖
- x 是 NA,即 NA 本身
解决您问题的代码是:
# Bin the values: [0,0.9] holds 0, (0.9,9] holds 1..9 and (9,10] holds 10
# (this binning assumes whole-number values).
w <- cut(df$Value, breaks = c(0, 0.9, 9, 10), include.lowest = TRUE)
# Turn NA into a level of its own so it can be compared like any other class
w1 <- addNA(w)
# TRUE where the class differs from the previous row. dplyr::lag() is named
# explicitly: a bare lag() resolves to stats::lag() unless dplyr is attached,
# and stats::lag() does not shift the elements.
r <- w1 != dplyr::lag(w1)
# The first row has no predecessor, so it is not a change
r[1] <- FALSE
# Running count of the changes
df$Seq.Count <- Reduce("+", r, accumulate = TRUE)
分步说明
第一步是根据上述类别对数据进行分类,cut() 函数可以做到这一点:
(w <- cut(df$Value,breaks=c(0,0.9,9,10),include.lowest=T))
[1] [0,0.9] [0,0.9] (0.9,9] (0.9,9] (9,10] (9,10] (9,10] (0.9,9] (0.9,9] <NA> <NA> <NA> [0,0.9] [0,0.9] [0,0.9]
Levels: [0,0.9] (0.9,9] (9,10]
您需要将 NA 作为结果的级别(类别)。这是通过addNA()完成的,如下:
(w1 <- addNA(w))
[1] [0,0.9] [0,0.9] (0.9,9] (0.9,9] (9,10] (9,10] (9,10] (0.9,9] (0.9,9] <NA> <NA> <NA> [0,0.9] [0,0.9] [0,0.9]
Levels: [0,0.9] (0.9,9] (9,10] <NA>
然后您需要知道该系列的 当前 元素与上一个 元素之间的类别是否发生了变化。为此,您需要生成该系列的 lag() 并将其与原始系列进行比较:
(r <- w1 != lag(w1))
[1] NA FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
# Change the first element to FALSE
r[1] <- F
r
[1] FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
请注意结果 r 的第一个元素是 NA。由于该序列的第一个元素没有可比较的对象(因为没有前一个元素),这种情况的结果应该是 FALSE。这就是将 r[1] 设置为 FALSE 的原因。
现在您需要对结果中的 TRUE 值求和,并保留累积的结果。这是通过 Reduce() 完成的:
(df$Seq.Count <- Reduce('+', r, accumulate = T))
[1] 0 0 1 1 2 2 2 3 3 4 4 4 5 5 5
编辑: 你也可以使用 cumsum() 而不是 Reduce()