如何从孤立森林(Isolation Forest)的结果中识别异常记录?

How can I identify the anomalous records from the Isolation Forest results?

我正在尝试使用 solitude 包中的孤立森林(Isolation Forest)算法来识别数据中的异常行。

我正在使用文档中的示例来了解该算法,此示例使用 Pima Indians Diabetes 数据集。

在示例的末尾,它给出了一个包含 id、average_depth 和 anomaly_score 三列的数据框,并按 anomaly_score 从高到低排序。

如何将模型的结果关联到原始数据集以查看异常分数最高的行?

这是包文档中的示例

library("solitude")
library("tidyverse")
library("mlbench")

# Load the Pima Indians Diabetes data and show it as a tibble.
data(PimaIndiansDiabetes)
PimaIndiansDiabetes <- as_tibble(PimaIndiansDiabetes)
PimaIndiansDiabetes

# Drop the outcome column and split the predictors 50/50 at random.
# No seed is set, so the split (and every score below) varies between runs.
splitter <- PimaIndiansDiabetes %>%
  select(-diabetes) %>%
  rsample::initial_split(prop = 0.5)
pima_train <- rsample::training(splitter)
pima_test <- rsample::testing(splitter)

# Fit the isolation forest on the training half only.
iso <- isolationForest$new()
iso$fit(pima_train)

# Score the training rows, highest anomaly score first.
scores_train <- pima_train %>%
  iso$predict() %>%
  arrange(desc(anomaly_score))

scores_train

# Two-dimensional UMAP embedding of the standardised training predictors.
umap_coords <- pima_train %>%
  scale() %>%
  uwot::umap()

# setNames() on a matrix sets element names, not column names, so name the
# columns explicitly instead of relying on as_tibble()'s deprecated fallback.
colnames(umap_coords) <- c("V1", "V2")

# Attach the scores to the embedding: rowid (row position in the embedding)
# is matched against the id column returned by predict().
umap_train <- umap_coords %>%
  as_tibble() %>%
  rowid_to_column() %>%
  left_join(scores_train, by = c("rowid" = "id"))

umap_train

# Plot the embedding with point size proportional to the anomaly score.
umap_train %>%
  ggplot(aes(V1, V2)) +
  geom_point(aes(size = anomaly_score))

# Score the held-out rows with the same fitted forest.
scores_test <- pima_test %>%
  iso$predict() %>%
  arrange(desc(anomaly_score))

scores_test

嗯,这有点难。

如果此代码对您有帮助,请告诉我:


library("solitude")
library("tidyverse")
library("mlbench")

# Load the Pima Indians Diabetes data and show it as a tibble.
data(PimaIndiansDiabetes)
PimaIndiansDiabetes <- as_tibble(PimaIndiansDiabetes)
PimaIndiansDiabetes

# Drop the outcome column and split the predictors 50/50 at random.
# No seed is set, so the split (and every score below) varies between runs.
splitter <- PimaIndiansDiabetes %>%
  select(-diabetes) %>%
  rsample::initial_split(prop = 0.5)
pima_train <- rsample::training(splitter)
pima_test <- rsample::testing(splitter)

# Fit the isolation forest on the training half only.
iso <- isolationForest$new()
iso$fit(pima_train)

# Score the training rows, highest anomaly score first.
scores_train <- pima_train %>%
  iso$predict() %>%
  arrange(desc(anomaly_score))

scores_train

# Two-dimensional UMAP embedding of the standardised training predictors.
umap_coords <- pima_train %>%
  scale() %>%
  uwot::umap()

# setNames() on a matrix sets element names, not column names, so name the
# columns explicitly instead of relying on as_tibble()'s deprecated fallback.
colnames(umap_coords) <- c("V1", "V2")

# Attach the scores to the embedding. This must run before the ids are
# remapped below: here `id` still means "row position within pima_train".
umap_train <- umap_coords %>%
  as_tibble() %>%
  rowid_to_column() %>%
  left_join(scores_train, by = c("rowid" = "id"))

umap_train

# Plot the embedding with point size proportional to the anomaly score.
umap_train %>%
  ggplot(aes(V1, V2)) +
  geom_point(aes(size = anomaly_score))

# Score the held-out rows with the same fitted forest. These ids index
# pima_test, so they must not be joined onto the training embedding.
scores_test <- pima_test %>%
  iso$predict() %>%
  arrange(desc(anomaly_score))

scores_test

# Give every original record a stable key to join on.
PimaIndiansDiabetes$id <- seq_len(nrow(PimaIndiansDiabetes))

# Original row numbers behind each split.
# NOTE(review): assumes rsample::training() returns rows in splitter$in_id
# order and rsample::testing() returns the remaining rows in ascending
# order -- confirm against the installed rsample version.
train_ids <- splitter$in_id
test_ids <- setdiff(PimaIndiansDiabetes$id, train_ids)

# The score tables were sorted by anomaly_score, so their rows are no longer
# in split order. Translate each per-split row id into the original row
# number by indexing, rather than overwriting the id column positionally
# (which would pair scores with the wrong records).
scores_train$id <- train_ids[scores_train$id]
scores_test$id <- test_ids[scores_test$id]

# Original test records with their anomaly scores.
p1 <- PimaIndiansDiabetes %>% inner_join(scores_test, by = "id")

summary(p1)

# Original training records with their anomaly scores.
p2 <- PimaIndiansDiabetes %>% inner_join(scores_train, by = "id")

summary(p2)

# All records with scores, most anomalous first.
p3 <- bind_rows(p1, p2) %>%
  arrange(desc(anomaly_score))

as_tibble(p3)

summary(p3)

你应该得到类似下面的结果(由于数据拆分是随机的且没有设置随机种子,具体数值每次运行都会不同):

> p3 <- rbind(p1,p2)
> 
> as_tibble(p3)
# A tibble: 768 × 12
   pregnant glucose pressure triceps insulin  mass pedigree   age diabetes    id average_depth anomaly_score
      <dbl>   <dbl>    <dbl>   <dbl>   <dbl> <dbl>    <dbl> <dbl> <fct>    <int>         <dbl>         <dbl>
 1        6     148       72      35       0  33.6    0.627    50 pos          1          4.72         0.727
 2        8     183       64       0       0  23.3    0.672    32 pos          3          5.21         0.703
 3        1      89       66      23      94  28.1    0.167    21 neg          4          6.25         0.655
 4        3      78       50      32      88  31      0.248    26 pos          7          6.3          0.653
 5        2     197       70      45     543  30.5    0.158    53 pos          9          6.46         0.646
 6        8     125       96       0       0   0      0.232    54 pos         10          6.6          0.640
 7        7     100        0       0       0  30      0.484    32 pos         16          6.75         0.633
 8        0     118       84      47     230  45.8    0.551    31 pos         17          6.77         0.633
 9        1     103       30      38      83  43.3    0.183    33 neg         19          6.78         0.632
10        9     119       80      35       0  29      0.263    29 pos         24          6.85         0.629
# … with 758 more rows
> 
> summary(p3)
    pregnant         glucose         pressure         triceps         insulin           mass          pedigree     
 Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00   Min.   :  0.0   Min.   : 0.00   Min.   :0.0780  
 1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00   1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437  
 Median : 3.000   Median :117.0   Median : 72.00   Median :23.00   Median : 30.5   Median :32.00   Median :0.3725  
 Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54   Mean   : 79.8   Mean   :31.99   Mean   :0.4719  
 3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00   3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262  
 Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00   Max.   :846.0   Max.   :67.10   Max.   :2.4200  
      age        diabetes        id        average_depth   anomaly_score   
 Min.   :21.00   neg:500   Min.   :  1.0   Min.   :4.720   Min.   :0.5820  
 1st Qu.:24.00   pos:268   1st Qu.:192.8   1st Qu.:7.680   1st Qu.:0.5832  
 Median :29.00             Median :384.5   Median :7.910   Median :0.5856  
 Mean   :33.24             Mean   :384.5   Mean   :7.749   Mean   :0.5922  
 3rd Qu.:41.00             3rd Qu.:576.2   3rd Qu.:7.970   3rd Qu.:0.5947  
 Max.   :81.00             Max.   :768.0   Max.   :8.000   Max.   :0.7266