DESEQ2: varianceStabilizingTransformation Error: every gene contains at least one zero, cannot compute log geometric means

DESEQ2: varianceStabilizingTransformation Error: every gene contains at least one zero, cannot compute log geometric means

我有一个包含 OTU table 的数据集,其中行 = 样本,列 = OTU,如下所示:

Otus <- data.frame(OTU_1 = c(0, 0, 1), OTU_2 = c(12, 0, 5), OTU_3 = c(0, 5, 3), row.names = c("S_1", "S_2", "S_3"))

我使用 phyloseq_to_deseq2 命令将它从 phyloseq 移动到 DESeq2 没有问题。现在它是一个 DESeq2 对象,我想通过以下命令使用方差稳定变换对 OTU table 进行归一化:

vsd <- varianceStabilizingTransformation(DESeq2_Object, blind = TRUE)

但是,我不断收到此错误:

Error in estimateSizeFactorsForMatrix(counts(object), locfunc = locfunc, : every gene contains at least one zero, cannot compute log geometric means

在对这个错误进行广泛研究后,我发现这意味着如果我的每个列(OTU_1、OTU_2、OTU_3 等)包含零,DESeq2 无法计算其几何平均值。这确实是我的情况,因为我的所有列都包含至少 1 个零,每一列。但是,我发现使用不同的命令来解决 完全相同的错误 来计算差异表达式。在这种情况下,解决方法是在 DESeq 命令中应用 sfType = poscounts,如下所示:

Diffs <- DESeq(DESeq2_Object, test = "Wald", fitType = "parametric", sfType = "poscounts")

但是,此命令不会填充我之后的方差稳定矩阵,它只会计算原始 OTU 的差异表达 table。

我查看了小插图并阅读了 varianceStabilizingTransformation 命令(使用 R),但它没有 sfType 标志来解决这个问题。

有没有办法绕过这个错误,这样我可以获得方差稳定矩阵?

方差稳定矩阵的预期用途是什么?然后你打算使用矩阵来计算差异丰度吗?在 this tutorial, the authors (Susan Holmes and Joey McMurdie) refer to this tutorial 中,用于使用更改后的几何平均公式进行差异丰度测试,例如

#BiocManager::install("phyloseq")
#BiocManager::install("DESeq2")
library(phyloseq)
library(DESeq2)
#> Loading required package: S4Vectors
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#> Loading required package: IRanges
#> Loading required package: GenomicRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats


# Load example data
otufile = system.file("extdata", "GP_otu_table_rand_short.txt.gz", package = "phyloseq")
mapfile = system.file("extdata", "master_map.txt", package = "phyloseq")
trefile = system.file("extdata", "GP_tree_rand_short.newick.gz", package = "phyloseq")
qiimex = import_qiime(otufile, mapfile, trefile, showProgress = FALSE)
#> Processing map file...
#> Processing otu/tax file...
#> Reading file into memory prior to parsing...
#> Detecting first header line...
#> Header is on line 2  
#> Converting input file to a table...
#> Defining OTU table... 
#> Parsing taxonomy table...
#> Processing phylogenetic tree...
#>  /Library/Frameworks/R.framework/Versions/4.1/Resources/library/phyloseq/extdata/GP_tree_rand_short.newick.gz ...
#> Warning in import_qiime(otufile, mapfile, trefile, showProgress = FALSE):
#> treefilename failed import. It will not be included.
qiimex
#> phyloseq-class experiment-level object
#> otu_table()   OTU Table:         [ 500 taxa and 26 samples ]
#> sample_data() Sample Data:       [ 26 samples by 7 sample variables ]
#> tax_table()   Taxonomy Table:    [ 500 taxa by 7 taxonomic ranks ]

# Select samples
qiimex@sam_data$SampleType
#>  [1] "Freshwater (creek)" "Freshwater (creek)" "Freshwater (creek)"
#>  [4] "Soil"               "Soil"               "Mock"              
#>  [7] "Mock"               "Mock"               "Skin"              
#> [10] "Freshwater"         "Feces"              "Skin"              
#> [13] "Tongue"             "Feces"              "Skin"              
#> [16] "Tongue"             "Ocean"              "Ocean"             
#> [19] "Ocean"              "Freshwater"         "Soil"              
#> [22] "Sediment (estuary)" "Sediment (estuary)" "Sediment (estuary)"
#> [25] "Feces"              "Feces"
qiimex_subset <- subset_samples(qiimex, SampleType == "Soil" | SampleType == "Mock")

# Convert to DESeq2 object
DESeq2_Object <- phyloseq_to_deseq2(qiimex_subset, ~ SampleType)
#> converting counts to integer mode
#> Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
#> design formula are characters, converting to factors

# calculate geometric means prior to estimate size factors
gm_mean = function(x, na.rm=TRUE){
  exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
geoMeans = apply(counts(DESeq2_Object), 1, gm_mean)
DESeq2_dds = estimateSizeFactors(DESeq2_Object, geoMeans = geoMeans)
DESeq2_dds = DESeq(DESeq2_dds, fitType="local")
#> using pre-existing size factors
#> estimating dispersions
#> gene-wise dispersion estimates
#> mean-dispersion relationship
#> final dispersion estimates
#> fitting model and testing

# Explore results
res = results(DESeq2_dds)
res = res[order(res$padj, na.last=NA), ]
alpha = 0.01
sigtab = res[(res$padj < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(qiimex_subset)[rownames(sigtab), ], "matrix"))
head(sigtab)
#>         baseMean log2FoldChange    lfcSE      stat       pvalue         padj
#> 184403 3316.1856     -11.604876 1.446460 -8.022948 1.032372e-15 1.827298e-13
#> 193343  820.0846     -10.147175 1.317754 -7.700354 1.356898e-14 1.200855e-12
#> 10113   316.3938      -9.216756 1.313276 -7.018143 2.248365e-12 1.326535e-10
#> 234421  238.0599     -11.167192 1.657220 -6.738510 1.600195e-11 7.080861e-10
#> 526081  259.6722      -9.346145 1.471139 -6.352999 2.111570e-10 7.474959e-09
#> 313166 1322.7565      -9.911298 1.593579 -6.219519 4.986800e-10 1.471106e-08
#>         Kingdom         Phylum               Class             Order
#> 184403 Bacteria     Firmicutes          Clostridia     Clostridiales
#> 193343 Bacteria     Firmicutes          Clostridia     Clostridiales
#> 10113  Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales
#> 234421 Bacteria     Firmicutes          Clostridia     Clostridiales
#> 526081 Bacteria     Firmicutes          Clostridia     Clostridiales
#> 313166 Bacteria  Bacteroidetes         Bacteroidia     Bacteroidales
#>                    Family        Genus            Species
#> 184403    Lachnospiraceae         <NA>               <NA>
#> 193343    Lachnospiraceae    Roseburia Eubacteriumrectale
#> 10113  Enterobacteriaceae         <NA>               <NA>
#> 234421    Lachnospiraceae Ruminococcus               <NA>
#> 526081    Lachnospiraceae Oribacterium               <NA>
#> 313166     Bacteroidaceae  Bacteroides               <NA>
posigtab = sigtab[sigtab[, "log2FoldChange"] > 0, ]
posigtab = posigtab[, c("baseMean", "log2FoldChange", "lfcSE", "padj", "Phylum", "Class", "Family", "Genus")]
posigtab
#>          baseMean log2FoldChange    lfcSE         padj          Phylum
#> 303295 201.335216       6.506674 1.204792 1.305775e-06  Proteobacteria
#> 5552   196.834531       6.309543 1.178371 1.518862e-06  Proteobacteria
#> 236550  69.552022       8.770931 1.815173 1.840288e-05  Proteobacteria
#> 113626 844.163976       7.207704 1.503844 2.078799e-05   Acidobacteria
#> 97627  171.891263       6.169004 1.307597 2.812963e-05  Proteobacteria
#> 89763   52.875334       8.375284 1.795633 3.426524e-05  Actinobacteria
#> 306684 149.608143       5.964643 1.346820 9.322811e-05   Bacteroidetes
#> 218985  50.742401       8.315832 1.882803 9.334941e-05     Chloroflexi
#> 102382 335.108981       6.747414 1.555112 1.267530e-04  Proteobacteria
#> 206632 332.777997       7.486585 1.747926 1.553274e-04 Verrucomicrobia
#> 573607 178.082806       6.729167 1.614098 2.360541e-04   Acidobacteria
#> 593006 146.954357       7.392336 1.781983 2.469451e-04  Proteobacteria
#> 279590 151.624630       6.576054 1.637287 3.873396e-04   Acidobacteria
#> 89337   56.919233       6.692225 1.663102 3.873396e-04  Actinobacteria
#> 218035  43.876643       7.133246 1.793490 4.373432e-04  Actinobacteria
#> 368027  18.296878       6.844054 1.791678 7.384186e-04  Actinobacteria
#> 572242  62.103316       6.140545 1.616489 7.802034e-04  Planctomycetes
#> 588604 211.108277       6.589139 1.738856 7.862979e-04  Proteobacteria
#> 212596 183.109699       6.931159 1.846832 8.501324e-04   Acidobacteria
#> 237963 113.390704       7.549096 2.013483 8.501324e-04  Proteobacteria
#> 548077  23.508540       7.205060 1.921985 8.501324e-04  Actinobacteria
#> 222914 348.580255       7.393221 2.040394 1.319435e-03 Verrucomicrobia
#> 265094 519.413992       5.885380 1.636179 1.389556e-03  Actinobacteria
#> 341901  26.562097       7.381503 2.068411 1.512002e-03  Actinobacteria
#> 104310  11.372452       6.157420 1.746257 1.715256e-03  Actinobacteria
#> 561842  51.066519       5.459834 1.549686 1.715256e-03  Proteobacteria
#> 248299  16.577945       6.700539 1.906851 1.736686e-03  Actinobacteria
#> 166835 868.093822       6.189342 1.766318 1.762950e-03  Proteobacteria
#> 369734  32.758200       6.711075 1.919352 1.775080e-03  Proteobacteria
#> 295422  15.101303       6.565460 1.913842 2.221501e-03  Proteobacteria
#> 137099  25.876135       6.367122 1.867180 2.254420e-03 Verrucomicrobia
#> 254706  13.039735       6.358471 1.862435 2.254420e-03   Acidobacteria
#> 204462 137.941401       6.613294 1.963268 2.572456e-03   Acidobacteria
#> 160337  18.708823       5.893400 1.783191 3.172144e-03  Actinobacteria
#> 144065  23.284353       7.191156 2.184408 3.260211e-03  Proteobacteria
#> 113959 894.081156       6.136411 1.870146 3.325954e-03   Acidobacteria
#> 223948  10.185674       6.003017 1.833057 3.341251e-03   Acidobacteria
#> 160135   8.669323       5.764444 1.764057 3.366660e-03  Actinobacteria
#> 244840   8.505259       5.736708 1.759948 3.404914e-03  Planctomycetes
#> 240591 179.115203       5.662464 1.749499 3.628632e-03  Proteobacteria
#> 547752  22.387044       6.156040 1.918045 3.921972e-03  Proteobacteria
#> 163176  17.574738       5.803532 1.823396 4.230077e-03  Proteobacteria
#> 321885  24.312098       6.270669 2.013227 5.172611e-03  Planctomycetes
#> 329327  16.600143       5.717837 1.839840 5.212980e-03   Fibrobacteres
#> 262115  25.392319       7.316622 2.362343 5.319945e-03             SC4
#> 512616  15.661404       5.632695 1.854032 6.385192e-03  Proteobacteria
#> 261663  25.664154       4.953787 1.643615 6.812189e-03     Chloroflexi
#> 257151  16.884984       4.909367 1.658737 8.015407e-03   Bacteroidetes
#> 238929  25.265076       4.509092 1.544313 8.856346e-03 Verrucomicrobia
#> 322045   6.504135       5.355636 1.834044 8.856346e-03  Proteobacteria
#>                      Class                      Family                Genus
#> 303295 Deltaproteobacteria               Haliangiaceae                 <NA>
#> 5552   Alphaproteobacteria            Caulobacteraceae     Phenylobacterium
#> 236550  Betaproteobacteria            Oxalobacteraceae             Massilia
#> 113626  Chloracidobacteria                        <NA>                 <NA>
#> 97627   Betaproteobacteria              Comamonadaceae          Hylemonella
#> 89763       Actinobacteria                        <NA>                 <NA>
#> 306684     Sphingobacteria                        <NA>                 <NA>
#> 218985              SOGA31                        <NA>                 <NA>
#> 102382  Betaproteobacteria            Oxalobacteraceae             Massilia
#> 206632      Spartobacteria          Spartobacteriaceae                 MC18
#> 573607  Chloracidobacteria                        <NA>                 <NA>
#> 593006 Gammaproteobacteria             Sinobacteraceae                 <NA>
#> 279590       Acidobacteria                        <NA>                 <NA>
#> 89337       Actinobacteria          Pseudonocardiaceae    Actinoalloteichus
#> 218035      Actinobacteria           Microbacteriaceae                 <NA>
#> 368027      Actinobacteria                        <NA>                 <NA>
#> 572242        Planctomycea                 Gemmataceae                 <NA>
#> 588604 Alphaproteobacteria           Rhodospirillaceae                 <NA>
#> 212596        Solibacteres             Solibacteraceae CandidatusSolibacter
#> 237963 Deltaproteobacteria                        <NA>                 <NA>
#> 548077      Actinobacteria                        <NA>                 <NA>
#> 222914    Verrucomicrobiae Verrucomicrobiasubdivision3                 <NA>
#> 265094      Actinobacteria          Micromonosporaceae       Micromonospora
#> 341901      Actinobacteria             Nocardioidaceae                 <NA>
#> 104310      Actinobacteria                 Frankiaceae                 <NA>
#> 561842 Alphaproteobacteria                        <NA>                 <NA>
#> 248299      Actinobacteria                        <NA>                 <NA>
#> 166835  Betaproteobacteria            Burkholderiaceae         Burkholderia
#> 369734 Alphaproteobacteria           Rhodospirillaceae                 <NA>
#> 295422  Betaproteobacteria              Alcaligenaceae           Sutterella
#> 137099    Verrucomicrobiae                        <NA>                 <NA>
#> 254706  Chloracidobacteria                        <NA>                 <NA>
#> 204462        Solibacteres             Solibacteraceae CandidatusSolibacter
#> 160337      Actinobacteria                 Frankiaceae                 <NA>
#> 144065 Alphaproteobacteria            Acetobacteraceae                 <NA>
#> 113959                <NA>                        <NA>                 <NA>
#> 223948  Chloracidobacteria                        <NA>                 <NA>
#> 160135      Actinobacteria           Streptomycetaceae         Streptomyces
#> 244840        Planctomycea               Pirellulaceae            Pirellula
#> 240591 Alphaproteobacteria          Phyllobacteriaceae                 <NA>
#> 547752  Betaproteobacteria            Burkholderiaceae         Burkholderia
#> 163176  Betaproteobacteria              Rhodocyclaceae    Methyloversatilis
#> 321885        Planctomycea                 Gemmataceae                 <NA>
#> 329327       Fibrobacteres            Fibrobacteraceae          Fibrobacter
#> 262115                <NA>                        <NA>                 <NA>
#> 512616 Gammaproteobacteria             Sinobacteraceae                 <NA>
#> 261663             Bljii12                     Dolo_23                 <NA>
#> 257151     Sphingobacteria                        <NA>                 <NA>
#> 238929    Verrucomicrobiae                        <NA>                 <NA>
#> 322045  Betaproteobacteria            Oxalobacteraceae             Massilia

# Plot the results
library(ggplot2)
theme_set(theme_bw())
sigtabgen = subset(sigtab, !is.na(Genus))
# Phylum order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels=names(x))
# Genus order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Genus, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Genus = factor(as.character(sigtabgen$Genus), levels=names(x))
ggplot(sigtabgen, aes(y=Genus, x=log2FoldChange, color=Phylum)) + 
  geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
  geom_point(size=6) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))

reprex package (v2.0.1)

于 2021-10-08 创建

该教程发布于 2021 年 5 月,所以它是最新的,他们基本上跳过了 'intermediate step' 您的问题相关的内容,但是如果您需要其他一些方差稳定矩阵一个潜在的解决方案是使用 Varistran 应用 Anscombe 的方差稳定变换(这也提供了一些不错的绘图功能),例如

#devtools::install_github("MonashBioinformaticsPlatform/varistran")
#BiocManager::install("edgeR")
library(varistran)
#> Loading required package: grid

counts <- DESeq2_Object@assays@data$counts
design <- model.matrix(~ qiimex_subset@sam_data$SampleType)

y <- vst(counts, design=design)
#> Dispersion estimated as 0.5672283

# "y" contains the transformed counts
y
#>               CC1        CL3      Even1      Even2      Even3        SV1
#> 338272 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927  0.7855066
#> 289855 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927  0.7855066
#> 287759  0.2967411 -0.3192927 -0.3192927 -0.3192927  1.5903105  4.0624033
#> 334839 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 144065  3.2103181  6.8071310 -0.3192927 -0.3192927 -0.3192927  0.7855066
#> 18643  -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 218985  6.1359268  3.1727994 -0.3192927 -0.3192927 -0.3192927  8.2069561
#> 238997 -0.3192927  0.5037921 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 245893 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 131765 -0.3192927 -0.3192927 -0.3192927 -0.3192927  1.5903105  0.7855066
#> 536982 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 351629  5.0182785  4.5937241  5.4609218  4.0571176  2.8729479  5.3692361
#> 321069 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 512157 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> ...
#> attr(,"lib.size")
#>       CC1       CL3     Even1     Even2     Even3       SV1 
#> 84395.682 57904.079  1560.595  4272.039 15659.007 38299.983 
#> attr(,"true.lib.size")
#>   CC1   CL3 Even1 Even2 Even3   SV1 
#> 31125 37942 10184  7169  6597 34353 
#> attr(,"lib.size.method")
#> [1] "TMM normalization"
#> attr(,"cpm")
#> [1] FALSE
#> attr(,"method")
#> [1] "anscombe.nb"
#> attr(,"dispersion")
#> [1] 0.5672283
plot_stability(y, counts, design=design)
#> Warning: Transformation introduced infinite values in continuous x-axis
#> Warning: Transformation introduced infinite values in continuous x-axis

plot_biplot(y)

plot_heatmap(y, n=50, cluster_samples = TRUE)
#> Registered S3 method overwritten by 'seriation':
#>   method         from 
#>   reorder.hclust vegan

reprex package (v2.0.1)

于 2021-10-08 创建