如何从孤立森林(Isolation Forest)的结果中识别异常记录?
How can I identify the anomalous records from the Isolation Forest results?
我正在尝试使用 solitude 包中的孤立森林(Isolation Forest)算法来识别数据中的异常行。
我正在使用文档中的示例来了解该算法,此示例使用 Pima Indians Diabetes 数据集。
在示例的末尾,它给出了一个包含 id、average_depth 和 anomaly_score 三列的数据框,并按异常分数从高到低排序。
如何将模型的结果关联到原始数据集以查看异常分数最高的行?
这是包文档中的示例
# Isolation Forest walkthrough on the Pima Indians Diabetes data
# (the example from the solitude package documentation).
library("solitude")
library("tidyverse")
library("mlbench")

# Load the data ----
data(PimaIndiansDiabetes)
PimaIndiansDiabetes <- as_tibble(PimaIndiansDiabetes)
PimaIndiansDiabetes

# Split the predictors (label column removed) in half ----
splitter <- rsample::initial_split(
  select(PimaIndiansDiabetes, -diabetes),
  prop = 0.5
)
pima_train <- rsample::training(splitter)
pima_test <- rsample::testing(splitter)

# Fit the forest on the training half ----
iso <- isolationForest$new()
iso$fit(pima_train)

# Score the training rows, most anomalous first ----
scores_train <- arrange(iso$predict(pima_train), desc(anomaly_score))
scores_train

# 2-D UMAP embedding of the scaled training rows, with the scores
# attached by matching the embedding's row number to the score id ----
umap_train <- pima_train %>%
  scale() %>%
  uwot::umap() %>%
  setNames(c("V1", "V2")) %>%
  as_tibble() %>%
  rowid_to_column() %>%
  left_join(scores_train, by = c("rowid" = "id"))
umap_train

# Point size encodes the anomaly score
ggplot(umap_train, aes(V1, V2)) +
  geom_point(aes(size = anomaly_score))

# Score the held-out rows, most anomalous first ----
scores_test <- arrange(iso$predict(pima_test), desc(anomaly_score))
scores_test
嗯,这有点难。
如果此代码对您有帮助,请告诉我:
# Attach Isolation Forest scores to the rows of the original data set so the
# most anomalous records can be inspected together with all their columns.
library("solitude")
library("tidyverse")
library("mlbench")

# Load the data ----
data(PimaIndiansDiabetes)
PimaIndiansDiabetes <- as_tibble(PimaIndiansDiabetes)
PimaIndiansDiabetes

# Split the predictors (label column removed) in half ----
splitter <- PimaIndiansDiabetes %>%
  select(-diabetes) %>%
  rsample::initial_split(prop = 0.5)
pima_train <- rsample::training(splitter)
pima_test <- rsample::testing(splitter)

# Fit the forest on the training half ----
iso <- isolationForest$new()
iso$fit(pima_train)

# Score the training rows, most anomalous first ----
scores_train <- pima_train %>%
  iso$predict() %>%
  arrange(desc(anomaly_score))
scores_train

# 2-D UMAP embedding of the scaled training rows, with the scores
# attached by matching the embedding's row number to the score id ----
umap_train <- pima_train %>%
  scale() %>%
  uwot::umap() %>%
  setNames(c("V1", "V2")) %>%
  as_tibble() %>%
  rowid_to_column() %>%
  left_join(scores_train, by = c("rowid" = "id"))
umap_train

umap_train %>%
  ggplot(aes(V1, V2)) +
  geom_point(aes(size = anomaly_score))

# Score the held-out rows, most anomalous first ----
scores_test <- pima_test %>%
  iso$predict() %>%
  arrange(desc(anomaly_score))
scores_test

# Note: scores_test is deliberately not joined onto umap_train. The embedding
# rows are training rows, so test-set ids do not refer to them.

# Translate score ids into original row numbers ----
# The id column in the score tables is the row position inside the data that
# was scored (pima_train / pima_test), not a row number of the full data set.
# The tables are already sorted by score, so the original row numbers must be
# looked up through that id; assigning them positionally would pair each
# score with the wrong record.
PimaIndiansDiabetes$id <- seq_len(nrow(PimaIndiansDiabetes))
train_rows <- splitter$in_id               # original rows behind pima_train
test_rows <- rsample::complement(splitter) # original rows behind pima_test
scores_train$id <- train_rows[scores_train$id]
scores_test$id <- test_rows[scores_test$id]

# Join the scores back onto the full records ----
p1 <- PimaIndiansDiabetes %>% inner_join(scores_test, by = "id")
summary(p1)
p2 <- PimaIndiansDiabetes %>% inner_join(scores_train, by = "id")
summary(p2)

# Test and training halves together: one scored row per original record
p3 <- bind_rows(p1, p2)
as_tibble(p3)
summary(p3)
你应该得到类似下面的结果(由于数据是随机划分的,具体数值会有所不同):
> p3 <- rbind(p1,p2)
>
> as_tibble(p3)
# A tibble: 768 × 12
pregnant glucose pressure triceps insulin mass pedigree age diabetes id average_depth anomaly_score
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <int> <dbl> <dbl>
1 6 148 72 35 0 33.6 0.627 50 pos 1 4.72 0.727
2 8 183 64 0 0 23.3 0.672 32 pos 3 5.21 0.703
3 1 89 66 23 94 28.1 0.167 21 neg 4 6.25 0.655
4 3 78 50 32 88 31 0.248 26 pos 7 6.3 0.653
5 2 197 70 45 543 30.5 0.158 53 pos 9 6.46 0.646
6 8 125 96 0 0 0 0.232 54 pos 10 6.6 0.640
7 7 100 0 0 0 30 0.484 32 pos 16 6.75 0.633
8 0 118 84 47 230 45.8 0.551 31 pos 17 6.77 0.633
9 1 103 30 38 83 43.3 0.183 33 neg 19 6.78 0.632
10 9 119 80 35 0 29 0.263 29 pos 24 6.85 0.629
# … with 758 more rows
>
> summary(p3)
pregnant glucose pressure triceps insulin mass pedigree
Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.00 Min. :0.0780
1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437
Median : 3.000 Median :117.0 Median : 72.00 Median :23.00 Median : 30.5 Median :32.00 Median :0.3725
Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54 Mean : 79.8 Mean :31.99 Mean :0.4719
3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262
Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00 Max. :846.0 Max. :67.10 Max. :2.4200
age diabetes id average_depth anomaly_score
Min. :21.00 neg:500 Min. : 1.0 Min. :4.720 Min. :0.5820
1st Qu.:24.00 pos:268 1st Qu.:192.8 1st Qu.:7.680 1st Qu.:0.5832
Median :29.00 Median :384.5 Median :7.910 Median :0.5856
Mean :33.24 Mean :384.5 Mean :7.749 Mean :0.5922
3rd Qu.:41.00 3rd Qu.:576.2 3rd Qu.:7.970 3rd Qu.:0.5947
Max. :81.00 Max. :768.0 Max. :8.000 Max. :0.7266
我正在尝试使用 solitude 包中的孤立森林(Isolation Forest)算法来识别数据中的异常行。
我正在使用文档中的示例来了解该算法,此示例使用 Pima Indians Diabetes 数据集。
在示例的末尾,它给出了一个包含 id、average_depth 和 anomaly_score 三列的数据框,并按异常分数从高到低排序。
如何将模型的结果关联到原始数据集以查看异常分数最高的行?
这是包文档中的示例
# Isolation Forest walkthrough on the Pima Indians Diabetes data
# (the example from the solitude package documentation).
library("solitude")
library("tidyverse")
library("mlbench")

# Load the data ----
data(PimaIndiansDiabetes)
PimaIndiansDiabetes <- as_tibble(PimaIndiansDiabetes)
PimaIndiansDiabetes

# Split the predictors (label column removed) in half ----
splitter <- rsample::initial_split(
  select(PimaIndiansDiabetes, -diabetes),
  prop = 0.5
)
pima_train <- rsample::training(splitter)
pima_test <- rsample::testing(splitter)

# Fit the forest on the training half ----
iso <- isolationForest$new()
iso$fit(pima_train)

# Score the training rows, most anomalous first ----
scores_train <- arrange(iso$predict(pima_train), desc(anomaly_score))
scores_train

# 2-D UMAP embedding of the scaled training rows, with the scores
# attached by matching the embedding's row number to the score id ----
umap_train <- pima_train %>%
  scale() %>%
  uwot::umap() %>%
  setNames(c("V1", "V2")) %>%
  as_tibble() %>%
  rowid_to_column() %>%
  left_join(scores_train, by = c("rowid" = "id"))
umap_train

# Point size encodes the anomaly score
ggplot(umap_train, aes(V1, V2)) +
  geom_point(aes(size = anomaly_score))

# Score the held-out rows, most anomalous first ----
scores_test <- arrange(iso$predict(pima_test), desc(anomaly_score))
scores_test
嗯,这有点难。
如果此代码对您有帮助,请告诉我:
# Attach Isolation Forest scores to the rows of the original data set so the
# most anomalous records can be inspected together with all their columns.
library("solitude")
library("tidyverse")
library("mlbench")

# Load the data ----
data(PimaIndiansDiabetes)
PimaIndiansDiabetes <- as_tibble(PimaIndiansDiabetes)
PimaIndiansDiabetes

# Split the predictors (label column removed) in half ----
splitter <- PimaIndiansDiabetes %>%
  select(-diabetes) %>%
  rsample::initial_split(prop = 0.5)
pima_train <- rsample::training(splitter)
pima_test <- rsample::testing(splitter)

# Fit the forest on the training half ----
iso <- isolationForest$new()
iso$fit(pima_train)

# Score the training rows, most anomalous first ----
scores_train <- pima_train %>%
  iso$predict() %>%
  arrange(desc(anomaly_score))
scores_train

# 2-D UMAP embedding of the scaled training rows, with the scores
# attached by matching the embedding's row number to the score id ----
umap_train <- pima_train %>%
  scale() %>%
  uwot::umap() %>%
  setNames(c("V1", "V2")) %>%
  as_tibble() %>%
  rowid_to_column() %>%
  left_join(scores_train, by = c("rowid" = "id"))
umap_train

umap_train %>%
  ggplot(aes(V1, V2)) +
  geom_point(aes(size = anomaly_score))

# Score the held-out rows, most anomalous first ----
scores_test <- pima_test %>%
  iso$predict() %>%
  arrange(desc(anomaly_score))
scores_test

# Note: scores_test is deliberately not joined onto umap_train. The embedding
# rows are training rows, so test-set ids do not refer to them.

# Translate score ids into original row numbers ----
# The id column in the score tables is the row position inside the data that
# was scored (pima_train / pima_test), not a row number of the full data set.
# The tables are already sorted by score, so the original row numbers must be
# looked up through that id; assigning them positionally would pair each
# score with the wrong record.
PimaIndiansDiabetes$id <- seq_len(nrow(PimaIndiansDiabetes))
train_rows <- splitter$in_id               # original rows behind pima_train
test_rows <- rsample::complement(splitter) # original rows behind pima_test
scores_train$id <- train_rows[scores_train$id]
scores_test$id <- test_rows[scores_test$id]

# Join the scores back onto the full records ----
p1 <- PimaIndiansDiabetes %>% inner_join(scores_test, by = "id")
summary(p1)
p2 <- PimaIndiansDiabetes %>% inner_join(scores_train, by = "id")
summary(p2)

# Test and training halves together: one scored row per original record
p3 <- bind_rows(p1, p2)
as_tibble(p3)
summary(p3)
你应该得到类似下面的结果(由于数据是随机划分的,具体数值会有所不同):
> p3 <- rbind(p1,p2)
>
> as_tibble(p3)
# A tibble: 768 × 12
pregnant glucose pressure triceps insulin mass pedigree age diabetes id average_depth anomaly_score
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <int> <dbl> <dbl>
1 6 148 72 35 0 33.6 0.627 50 pos 1 4.72 0.727
2 8 183 64 0 0 23.3 0.672 32 pos 3 5.21 0.703
3 1 89 66 23 94 28.1 0.167 21 neg 4 6.25 0.655
4 3 78 50 32 88 31 0.248 26 pos 7 6.3 0.653
5 2 197 70 45 543 30.5 0.158 53 pos 9 6.46 0.646
6 8 125 96 0 0 0 0.232 54 pos 10 6.6 0.640
7 7 100 0 0 0 30 0.484 32 pos 16 6.75 0.633
8 0 118 84 47 230 45.8 0.551 31 pos 17 6.77 0.633
9 1 103 30 38 83 43.3 0.183 33 neg 19 6.78 0.632
10 9 119 80 35 0 29 0.263 29 pos 24 6.85 0.629
# … with 758 more rows
>
> summary(p3)
pregnant glucose pressure triceps insulin mass pedigree
Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.00 Min. :0.0780
1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437
Median : 3.000 Median :117.0 Median : 72.00 Median :23.00 Median : 30.5 Median :32.00 Median :0.3725
Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54 Mean : 79.8 Mean :31.99 Mean :0.4719
3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262
Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00 Max. :846.0 Max. :67.10 Max. :2.4200
age diabetes id average_depth anomaly_score
Min. :21.00 neg:500 Min. : 1.0 Min. :4.720 Min. :0.5820
1st Qu.:24.00 pos:268 1st Qu.:192.8 1st Qu.:7.680 1st Qu.:0.5832
Median :29.00 Median :384.5 Median :7.910 Median :0.5856
Mean :33.24 Mean :384.5 Mean :7.749 Mean :0.5922
3rd Qu.:41.00 3rd Qu.:576.2 3rd Qu.:7.970 3rd Qu.:0.5947
Max. :81.00 Max. :768.0 Max. :8.000 Max. :0.7266