Compute variance explained under different K

Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2018e.
1.0/zoneinfo/Asia/Taipei'

Last updated: 2018-08-12

Code version: f664d4c

Background and summary

We compare fitting under differet k and computed for every feature, sample variance in abundance that is explained by sample cluster membership.

Results showed that k=2 gives the largest variance explained (median of sample variances is higher than the other k).

Analysis

library(ggplot2)
library(metagenomeSeq)

Load data

MRobj <- readRDS("../data/nasal_filtered_normed_batchcorrected.rds")
counts <- MRcounts(MRobj,norm=FALSE,log=FALSE)

Load previously computed results.

clust_2 <- readRDS("../output/gom-k2-best-seed-67.rds")
clust_3 <- readRDS("../output/gom-k3-best-seed-67.rds")
clust_4 <- readRDS("../output/gom-k4-best-seed-59.rds")
clust_5 <- readRDS("../output/gom-k5-best-seed-307.rds")
fits <- c(clust_2, clust_3, clust_4, clust_5)
names(fits) <- c("clust_2", "clust_3", "clust_4", "clust_5")

Extract sampel cluster membership.

membership <- lapply(fits, function(xx) {
     apply(xx$omega, 1, which.max)
  })
names(membership) <- c("clust_2", "clust_3", "clust_4", "clust_5")

Number of samples per cluster

table(membership$clust_2)


  1   2 
167  30

table(membership$clust_3)


  1   2   3 
116  67  14

table(membership$clust_4)


  1   2   3   4 
114  67  12   4

table(membership$clust_5)


 1  2  3  4  5 
97 69 20  7  4

Compute variance explained.

varprop_species <- lapply(1:length(membership), function(index) {
  which_sample <- which(colnames(counts) %in% names(membership[[index]]))
  counts_which <- counts[, which_sample]
  var <- sapply(1:nrow(counts_which), function(index2) {
            res <- lm(log2(counts_which[index2,]+2) ~ membership[[index]])
            sumOfsquares <- anova(res)[[2]]
            sumOfsquares[1]/sum(sumOfsquares) })
  return(var)
})
names(varprop_species) <- names(membership)
saveRDS(varprop_species, file = "../data/varprop.rds")

Plot variance explained

varprop <- readRDS("../data/varprop.rds")
varprop_long <- data.frame(
  prop = do.call(c, varprop),
  cluster = factor(rep(2:5, each = nrow(varprop))) )
varprop_long$cluster <- as.numeric(as.character(varprop_long$cluster))

K = 2 to 5

ggplot(varprop_long, aes(x = factor(cluster), y = prop,
                            col = factor(cluster)), size = .2) +
  geom_violin(trim = FALSE, size=.5) +
  geom_boxplot(width=0.1, size = .6) +
  labs(x = "Number of clusters", y = "Proportion variance explained")

Compare variances explained

Sig difference between K=2 vs K=4,5, but no sig. difference between K=2 and K=3.

wilcox.test(varprop$clust_2, varprop$clust_3)


    Wilcoxon rank sum test with continuity correction

data:  varprop$clust_2 and varprop$clust_3
W = 185720, p-value = 6.513e-10
alternative hypothesis: true location shift is not equal to 0

wilcox.test(varprop$clust_2, varprop$clust_4)


    Wilcoxon rank sum test with continuity correction

data:  varprop$clust_2 and varprop$clust_4
W = 183850, p-value = 5.653e-09
alternative hypothesis: true location shift is not equal to 0

wilcox.test(varprop$clust_2, varprop$clust_5)


    Wilcoxon rank sum test with continuity correction

data:  varprop$clust_2 and varprop$clust_5
W = 166400, p-value = 0.01104
alternative hypothesis: true location shift is not equal to 0

Consider within-sample diversity difference for when K=3.

library(data.table)
shannon_values <- fread("~/Dropbox/current-projects/nasalMicrobiomeManuscript/draft-results/shannon_values.tsv")

all.equal(shannon_values$V1, rownames(fits$clust_3$omega))

[1] TRUE

shannon_values$GOM <- sapply(1:nrow(fits$clust_3$omega), function(i) {
  which.max(fits$clust_3$omega[i,]) })

boxplot(shannon_values$V2~ shannon_values$GOM)

levels(shannon_values$GOM) <- c("Group 1", "Group 2", "Group 3")
library(ggplot2)
ggplot(shannon_values, aes(x=GOM, y = V2, fill = factor(GOM))) +
  geom_violin(col = "gray50") +
  scale_fill_manual(values = c(brewer.pal(12,"Set3")[1],
                                 brewer.pal(12,"Set3")[2],
                                 brewer.pal(12,"Set3")[3])) +
  geom_boxplot(width=.25) + ylab("Shannon diversity") +
  scale_x_discrete(name = "Cluster membership",
                   labels = c("Group 1", "Group 2", "Group 3")) +
  guides(fill=guide_legend(title="")) +
  ggtitle("Alpha diversity of K=3")

Consider within-sample diversity difference for when K=4.

library(data.table)
shannon_values <- fread("~/Dropbox/current-projects/nasalMicrobiomeManuscript/draft-results/shannon_values.tsv")

all.equal(shannon_values$V1, rownames(fits$clust_4$omega))

[1] TRUE

shannon_values$GOM <- sapply(1:nrow(fits$clust_4$omega), function(i) {
  which.max(fits$clust_4$omega[i,]) })

levels(shannon_values$GOM) <- c("Group 1", "Group 2", "Group 3", "Group 4")
library(ggplot2)
ggplot(shannon_values, aes(x=GOM, y = V2, fill = factor(GOM))) +
  geom_violin(col = "gray50") +
  scale_fill_manual(values = c(brewer.pal(12,"Set3")[1],
                                 brewer.pal(12,"Set3")[2],
                                 brewer.pal(12,"Set3")[3],
                                 brewer.pal(12,"Set3")[4])) +
  geom_boxplot(width=.25) + ylab("Shannon diversity") +
  scale_x_discrete(name = "Cluster membership",
                   labels = c("Group 1", "Group 2", "Group 3", "Group 4")) +
  guides(fill=guide_legend(title="")) +
  ggtitle("Alpha diversity of K=4")

Consider within-sample diversity difference for when K=5.

library(data.table)
shannon_values <- fread("~/Dropbox/current-projects/nasalMicrobiomeManuscript/draft-results/shannon_values.tsv")

all.equal(shannon_values$V1, rownames(fits$clust_5$omega))

[1] TRUE

shannon_values$GOM <- sapply(1:nrow(fits$clust_5$omega), function(i) {
  which.max(fits$clust_5$omega[i,]) })

boxplot(shannon_values$V2~ shannon_values$GOM)

levels(shannon_values$GOM) <- c("Group 1", "Group 2", "Group 3", "Group 4", "Group 5")
library(ggplot2)
ggplot(shannon_values, aes(x=GOM, y = V2, fill = factor(GOM))) +
  geom_violin(col = "gray50") +
  scale_fill_manual(values = c(brewer.pal(12,"Set3")[1],
                                 brewer.pal(12,"Set3")[2],
                                 brewer.pal(12,"Set3")[3],
                                 brewer.pal(12,"Set3")[4],
                                 brewer.pal(12,"Set3")[5])) +
  geom_boxplot(width=.25) + ylab("Shannon diversity") +
  scale_x_discrete(name = "Cluster membership",
                   labels = c("Group 1", "Group 2", "Group 3", "Group 4", "Group 5")) +
  guides(fill=guide_legend(title="")) +
  ggtitle("Alpha diversity of K=5")

Consider when K=2, the taxa that has high prob of variance explained

rownames(counts)[order(varprop$clust_2, decreasing = F)[1:10]]

 [1] "k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Hydrogenophilales;f__Hydrogenophilaceae;g__Thiobacillus;s__"
 [2] "k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Veillonella;s__"                    
 [3] "k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Nocardioidaceae;g__Friedmanniella;s__"       
 [4] "k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Geodermatophilaceae;g__Modestobacter;s__"    
 [5] "k__Bacteria;p__Bacteroidetes;c__[Saprospirae];o__[Saprospirales];f__Chitinophagaceae;g__Sediminibacterium;s__"     
 [6] "k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Cellulomonadaceae;g__Cellulomonas;s__"       
 [7] "s__subflava"                                                                                                       
 [8] "k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Geobacillus;s__"                              
 [9] "k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Klebsiella;s__" 
[10] "k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;g__Comamonas;s__"

library(data.table)
de_results <- fread("~/Dropbox/current-projects/nasalMicrobiomeManuscript/draft-results/tbl.tsv")

all.equal(de_results$V1[match(rownames(counts), de_results$V1)], rownames(counts))

[1] TRUE

de_results <- de_results[match(rownames(counts), de_results$V1),]

plot(varprop$clust_2,-log10(de_results$adjPvalue),
     xlab = "Proportion of variance explained of K=2",
     ylab="-log10(adjusted P-value)",
     pch=16, cex=.7,
     col=(de_results$adjPvalue<.01)+1)
abline(h=2,lty=2,col="black")

de_results$varprop_clust2 <- varprop$clust_2

Append the variance proportion explained to the FDR results.

de_results <- de_results[order(de_results$adjPvalues),]
write.table(de_results,
            "~/Dropbox/current-projects/nasalMicrobiomeManuscript/draft-results/tbl_updated.tsv",
            quote=F, col.names = NA, sep="\t")

#tmp <- fread("~/Dropbox/current-projects/nasalMicrobiomeManuscript/draft-results/tbl_updated.tsv")

Session information

R version 3.4.1 (2017-06-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] data.table_1.11.0    metagenomeSeq_1.21.1 RColorBrewer_1.1-2  
 [4] glmnet_2.0-16        foreach_1.4.4        Matrix_1.2-14       
 [7] limma_3.34.9         Biobase_2.38.0       BiocGenerics_0.24.0 
[10] ggplot2_2.2.1       

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.16       pillar_1.2.2       compiler_3.4.1    
 [4] git2r_0.21.0       plyr_1.8.4         bitops_1.0-6      
 [7] iterators_1.0.9    tools_3.4.1        digest_0.6.15     
[10] evaluate_0.10.1    tibble_1.4.2       gtable_0.2.0      
[13] lattice_0.20-35    rlang_0.2.0        yaml_2.1.19       
[16] stringr_1.3.0      knitr_1.20         caTools_1.17.1    
[19] gtools_3.5.0       rprojroot_1.3-2    grid_3.4.1        
[22] rmarkdown_1.9      gdata_2.18.0       magrittr_1.5      
[25] backports_1.1.2    scales_0.5.0       gplots_3.0.1      
[28] codetools_0.2-15   htmltools_0.3.6    matrixStats_0.53.1
[31] colorspace_1.3-2   labeling_0.3       KernSmooth_2.23-15
[34] stringi_1.2.2      lazyeval_0.2.1     munsell_0.4.3

This R Markdown site was created with workflowr