PCA-LDA分析-R

Question

在此示例 (https://gist.github.com/thigm85/8424654) 中，在鸢尾花数据集上检查了 LDA 与 PCA。我怎样才能对 PCA 结果进行 LDA (PCA-LDA)？

代码：

require(MASS)
require(ggplot2)
require(scales)
require(gridExtra)

pca <- prcomp(iris[,-5],
              center = TRUE,
              scale. = TRUE) 

prop.pca = pca$sdev^2/sum(pca$sdev^2)

lda <- lda(Species ~ ., 
           iris, 
           prior = c(1,1,1)/3)

prop.lda = lda$svd^2/sum(lda$svd^2)

plda <- predict(object = lda,
                newdata = iris)

dataset = data.frame(species = iris[,"Species"],
                     pca = pca$x, lda = plda$x)

p1 <- ggplot(dataset) + geom_point(aes(lda.LD1, lda.LD2, colour = species, shape = species), size = 2.5) + 
  labs(x = paste("LD1 (", percent(prop.lda[1]), ")", sep=""),
       y = paste("LD2 (", percent(prop.lda[2]), ")", sep=""))

p2 <- ggplot(dataset) + geom_point(aes(pca.PC1, pca.PC2, colour = species, shape = species), size = 2.5) +
  labs(x = paste("PC1 (", percent(prop.pca[1]), ")", sep=""),
       y = paste("PC2 (", percent(prop.pca[2]), ")", sep=""))

grid.arrange(p1, p2)

Answer 1

这很简单，将lda应用于问题代码中princomp返回的主成分坐标。

pca_lda <- lda(pca$x, grouping = iris$Species)

现在需要对每个对象类型使用方法 predict 来获得分类的准确性。

pred_pca_lda <- predict(lda0, predict(pca, iris))

accuracy_lda <- mean(plda$class == iris$Species)
accuracy_pca_lda <- mean(pred_pca_lda$class == iris$Species)

accuracy_lda
#[1] 0.98
accuracy_pca_lda
#[1] 0.98

Answer 2

通常您会在执行 PCA 之前执行 PCA-LDA 来降低数据的维度。理想情况下，您决定要从 PCA 中保留的前 k 个组件。在您使用 iris 的示例中，我们采用前 2 个组件，否则它看起来与没有 PCA 时几乎相同。

这样试试：

pcdata = data.frame(pca$x[,1:2],Species=iris$Species)
pc_lda <- lda(Species ~ .,data=pcdata , prior = c(1,1,1)/3)
prop_pc_lda = pc_lda$svd^2/sum(pc_lda$svd^2)
pc_plda <- predict(object = pc_lda,newdata = pcdata)

dataset = data.frame(species = iris[,"Species"],pc_plda$x)

p3 <- ggplot(dataset) + geom_point(aes(LD1, LD2, colour = species, shape = species), size = 2.5) + 
  labs(x = paste("LD1 (", percent(prop_pc_lda[1]), ")", sep=""),
       y = paste("LD2 (", percent(prop_pc_lda[2]), ")", sep=""))

print(p3)

您在这里看不到太大差异，因为 PCA 的前 2 个分量捕获了鸢尾花数据集中的大部分差异。

Answer 3

对于PCA-LDA（也称为主成分判别分析，DAPC）重要的是找到欠拟合之间的最佳折衷和数据过拟合。这对于高维数据（许多变量 = 列）尤其棘手。在这种情况下，它变成了 机器学习 问题，需要使用 交叉验证 来解决。唯一要调整的 超参数 是保留 PC 的数量。

adegenet 包提供了 PCA-LDA 的方便实现（主要考虑遗传数据，但它可用于任何类型的数据）。 xvalDapc 函数在 bootstrap 个数据子集（training.set 参数）上拟合 n.rep LDA 模型，用于所有指定数量的保留 PC（n.pca 参数）。使用指定的性能指标（result 参数），它会在所有重复中找到具有最低相关均方误差 (MSE) 的 n.pca。

调整 n.pca 用于 DAPC

library(adegenet)

# center and scale data to unit variance
iris.s <- scale(iris[,1:4])

# tune PCA-LDA / DAPC hyperparameter (n.pca)
d.dapc <- xvalDapc(x = iris.s, grp = iris[,5],
                   scale = FALSE, center = TRUE, result = "groupMean",
                   training.set = 2/3, n.pca = 1:3, n.rep = 100)
(n.pca <- as.numeric(d.dapc$`Number of PCs Achieving Lowest MSE`))

请注意，缩放在这里很有意义，因为花瓣和萼片的长度远远大于宽度。对未缩放数据的分析将导致 PC 组件因花瓣和萼片长度（由于它们较大的方差）而大量加载，即使花瓣宽度是一个重要的区别特征（见下图）。

最终的 DAPC 模型 d.dapc$DAPC 适合完整的训练数据，使用最优 n.pca，发现是 3.

一旦使用交叉验证确定了最佳 n.pca，使用一些额外的结果（原始变量的加载 $var.load）重新拟合模型的快速方法是 dapc 函数（由 xvalDapc 使用）：

dd.dapc <- dapc(x = iris.s, grp = iris[,5], n.pca = n.pca, n.da = 2,
                scale = FALSE, center = TRUE, var.loadings = TRUE)

dapc 对象 class 包含各种插槽，其中最重要的是：

DAPC 分数 ($ind.var)
原始变量对 LD 函数的载荷或系数 ($var.load)
每个变量对LD函数的绝对贡献($var.contr)
特征值，用于计算解释方差百分比 ($eig)

计算解释方差百分比

> (expl_var <- dd.dapc$eig / sum(dd.dapc$eig))
[1] 0.991500694 0.008499306

比较dapc和prcomp/lda

iris.pca <- prcomp(iris.s, center = TRUE, scale. = FALSE)
iris.lda <- lda(x = iris.pca$x[,1:n.pca], grouping = iris[,5])

scores.own <- predict(iris.lda)$x

# check conformity
scores.dapc <- dd.dapc$ind.coord
all.equal(unname(scores.own), unname(scores.dapc)) # it's the same!
[1] TRUE

可视化

par(mfrow=c(2,1))
scatter(dd.dapc)
loadingplot(dd.dapc$var.contr, axis = 1, main = "Loadings on LD1")
par(mfrow=c(1,1))

Answer 4

这里有另一种方法 PCA-LDA a.k.a。 R 中的 DAPC，如果必须为 LDA 找到最佳数量的保留主成分（对于具有许多预测变量的大型数据集，通常必须这样做）。

此解决方案使用 tidymodels package for hyperparameter tuning, applied to the Glass dataset.

依赖项

library(mlbench)
data(Glass)

library(tidymodels)
library(discrim) # with tidymodels wrapper for MASS::lda

使用MASS引擎指定tidymodels LDA分类模型

mod <- discrim::discrim_linear(mode = "classification", engine = "MASS")
mod %>% translate() # this is what the discrim::discrim_linear wrapper will do

#Linear Discriminant Model Specification (classification)
#
#Computational engine: MASS 
#
#Model fit template:
#MASS::lda(formula = missing_arg(), data = missing_arg())

创建 tidymodels 预处理方法（scale+center，PCA）

# specify missing_arg() [formula, data] using a tidymodels recipe
rec <- recipe(formula = Type ~ ., data = Glass) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_pca(all_numeric_predictors(), num_comp = tune())

指定调整网格和控制参数

# tuning grid with hyperparameters in columns    
# column name(s) must match tune() above
tuneGrid <- expand.grid(num_comp = 1:ncol(rec$template))

# control tune_grid() process below
trControl <- control_grid(verbose = TRUE, allow_par = FALSE)

定义 tidymodels 工作流程（预处理步骤、模型拟合）

wflow <- workflow(preprocessor = rec, spec = mod)

指定交叉验证的折叠数

set.seed(8482)
folds <- vfold_cv(rec$template, v = 5, repeats = 1, strata = "Type", pool = 0.1)

拟合一系列 LDA 模型

# takes a while to process, decrease v and repeats above to speed up
fit_train <- wflow %>%
  tune_grid(resamples = folds,
            grid = tuneGrid,
            metrics = metric_set(accuracy),# or specify multiple metrics
            control = trControl)

提取并绘制性能指标

met_train <- fit_train %>% collect_metrics()

ggplot(met_train, aes(x = num_comp, y = mean)) +
  geom_line(color = "#3E4A89FF", size = 2, alpha = 0.6) +
  scale_x_continuous(breaks = 1:ncol(rec$template)) +
  facet_wrap(~.metric) +
  theme_bw()

7 台 PC 似乎足以分类：

使用最佳超参数更新工作流程

# show best 5
fit_train %>% show_best(metric = "accuracy")

## A tibble: 5 × 7
#  num_comp .metric  .estimator  mean     n std_err .config              
#     <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>                
#1        9 accuracy multiclass 0.626     5  0.0207 Preprocessor09_Model1
#2       10 accuracy multiclass 0.626     5  0.0207 Preprocessor10_Model1
#3        7 accuracy multiclass 0.626     5  0.0220 Preprocessor07_Model1
#4        8 accuracy multiclass 0.598     5  0.0225 Preprocessor08_Model1
#5        6 accuracy multiclass 0.579     5  0.0221 Preprocessor06_Model1

# select best, e.g. by applying the one-standard-error-rule
(bestTune <- fit_train %>% select_by_one_std_err(num_comp, metric = "accuracy"))

## A tibble: 1 × 9
#  num_comp .metric  .estimator  mean     n std_err .config               .best .bound
#     <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>                 <dbl>  <dbl>
#1        7 accuracy multiclass 0.626     5  0.0220 Preprocessor07_Model1 0.626  0.605


# finalize workflow
wflow_final <- wflow %>% finalize_workflow(bestTune)

# verify that the workflow was updated correctly
wflow$pre$actions$recipe$recipe$steps[[2]]$num_comp # should be tune()
# tune()

wflow_final$pre$actions$recipe$recipe$steps[[2]]$num_comp # should be 7
#7

将最终 PCA-LDA 模型拟合到完整（训练）数据集

fit_final <- wflow_final %>% fit(Glass)

class(fit_final$fit$fit$fit) # here is the MASS::lda object
#lda

可视化拟合 PCA-LDA 模型

# use predict() to get predicted class, posterior probs, and LDA scores
Glass.PCALDA <- tibble(Glass,
       predict(fit_final, new_data = Glass, type = "class"), # predicted class
       predict(fit_final, new_data = Glass, type = "prob"), # posterior prob. for classes
       as_tibble(predict(fit_final, new_data = Glass, type = "raw")$x)) # LD scores

# verify that tidymodels did it right
Own.PCALDA <- lda(prcomp(Glass[,-10], center = T, scale. = T)$x[,1:7],
                  grouping = Glass[,10])
Own.PCALDA$x <- predict(Own.PCALDA)$x

all.equal(as_tibble(Own.PCALDA$x),
          Glass.PCALDA %>% dplyr::select(starts_with("LD"))) # it's the same!
#TRUE

# plot
ggplot(Glass.PCALDA, aes(x = LD1, y = LD2)) +
  geom_point(aes(color = Type, shape = .pred_class)) + 
  theme_bw() +
  ggtitle("PCA-LDA (DAPC) on Glass dataset, using 7 PC")

比较 PCA-LDA 和 LDA

Own.LDA <- lda(scale(Glass[,-10], center = T, scale = T), grouping = Glass[,10])
Own.LDA$predict <- predict(Own.LDA)

Glass.LDA <- tibble(Glass,
                    .pred_class = Own.LDA$predict$class,
                    as_tibble(Own.LDA$predict$posterior) %>% rename_all(list(function(x){paste0(".pred_", x)})),
                    as_tibble(Own.LDA$predict$x))

# plot
ggplot(Glass.LDA, aes(x = LD1, y = LD2)) +
  geom_point(aes(color = Type, shape = .pred_class)) + 
  theme_bw() +
  ggtitle("LDA on Glass dataset")

# compare model accuracies
accuracy(Glass.PCALDA, truth = Type, estimate = .pred_class) # 66.8%
accuracy(Glass.LDA, truth = Type, estimate = .pred_class) # 67.3%

对于这个小数据集，通过 PCA 降维并不能带来更好的分类结果。然而，PCA-LDA 程序允许将 LDA 应用于具有高度多重共线性（许多高度相关的预测变量）的非常大的数据集。 PCA 步骤消除了多重共线性（这可能是 LDA 的一个问题），上面显示的交叉验证过程确定了分类的最佳降维（到 7 PC）。