Applying GoM to HMP data

Summary

Objectives
- Applying GoM to HMP data.
- Compare K=2-5 GoM profile and Taxa profile with that in our data.
Data curated by JNP. (data/anterior_nares.rds received on 2019/04/09)
Results
- HMP anterio nares data 146 samples and 5,130 OTUs. The data is summarized at the species level.
- PCA analysis shows expression variation (first PC) is associated with sequencing run center. We thus fitted a linear model on the sample read counts and removed sample differences due to sequencing run center.
- Cluster analysis (GoM) K=2-5.

Preprocessing

Filtering

library(metagenomeSeq)
library(CountClust)

library(metagenomeSeq)

MRobj <- readRDS("data/anterior_nares.rds")

# filtering ------
MRobj_count <- MRcounts(MRobj, norm=F, log=F)

# keep features present (>0 read) in at least 1% of the samples
keep_rowmeans <- rowSums(MRobj_count > 0) >= ncol(MRobj_count)*.01

# keep features > 5 read in at least one sample
summary(c(MRobj_count))
keep_reads <- rowSums(MRobj_count > 5) > 0

keep_features <- keep_rowmeans | keep_reads

# filtering data
MRobj_filtered <- MRobj[keep_features,]

# filter sample
keep_sample <- colMeans(MRcounts(MRobj_filtered)>0) > .01
MRobj_filtered <- MRobj_filtered[,keep_sample]

dim(MRobj_filtered)

Batch correction: Correcr for run center effect

table(pData(MRobj_filtered)$RUNCENTER)

pca_res <- prcomp(t(MRcounts(MRobj_filtered, log = T)), scale=T)

head((pca_res$sdev^2)/sum(pca_res$sdev^2))

plot(pca_res$x[,1],pca_res$x[,2],
     col=as.integer(pData(MRobj_filtered)$visitno))
plot(pca_res$x[,1],pca_res$x[,2],
     col=as.integer(pData(MRobj_filtered)$RUNCENTER))

for (i in 1:2) {
  print(summary(lm(pca_res$x[,i] ~ as.factor(pData(MRobj_filtered)$visitno))))
}
for (i in 1:2) {
  print(summary(lm(pca_res$x[,i] ~ as.factor(pData(MRobj_filtered)$RUNCENTER))))
}
for (i in 1:2) {
  print(summary(lm(pca_res$x[,i] ~ as.factor(pData(MRobj_filtered)$sex))))
}

Output filtered, normalized, batch-corrected counts.

obj <- MRobj_filtered
obj@expSummary$expSummary$normFactors <- normFactors(MRobj_filtered)

counts <- t(MRcounts(obj,norm=FALSE,log=FALSE))
batchcorrect_counts <- BatchCorrectedCounts(counts,
                                         batch_lab = factor(pData(MRobj_filtered)$RUNCENTER),
                                         use_parallel=FALSE)
counts <- t(batchcorrect_counts)

assayData(obj)[["counts"]] = counts


saveRDS(obj, file="output/hmp.Rmd/hmp_updated_processed.rds")

GoM clustering

Try 100 random seeds.

Compare counts after library size normalization only and after cumulative normalization + library size normalization.

obj <- readRDS(file="output/hmp.Rmd/hmp_updated_processed.rds")

# library size normalization via CountClust ---------------------------
library(CountClust)
# fit GoM
#MRobj_species <- aggTax(obj,lvl="species")
counts <- MRcounts(obj,norm=FALSE,log=FALSE)
clust_2 <- FitGoM(t(counts), K=c(2), tol=100, num_trials = 100)
clust_3 <- FitGoM(t(counts), K=c(3), tol=100, num_trials = 100)
clust_4 <- FitGoM(t(counts), K=c(4), tol=100, num_trials = 100)
clust_5 <- FitGoM(t(counts), K=c(5), tol=100, num_trials = 100)

fits <- list(clust_2$fit, clust_3$fit, clust_4$fit, clust_5$fit)
membership <- lapply(fits, function(xx) {
     apply(xx$omega, 1, which.max)
  })
names(membership) <- paste0("clust_", c(2:5))

saveRDS(fits, file = "output/hmp.Rmd/hmp_updated_clusters.rds")
saveRDS(membership, file = "output/hmp.Rmd/hmp_updated_clusters_membership.rds")



# cumNorm plus library size normalization ----------------------------
library(CountClust)
# fit GoM
#MRobj_species <- aggTax(obj,lvl="species")
obj <- readRDS(file="output/hmp.Rmd/hmp_updated_processed.rds")
counts <- MRcounts(obj,norm=T,log=FALSE)
clust_2 <- FitGoM(t(counts), K=c(2), tol=100, num_trials = 100)
clust_3 <- FitGoM(t(counts), K=c(3), tol=100, num_trials = 100)
clust_4 <- FitGoM(t(counts), K=c(4), tol=100, num_trials = 100)
clust_5 <- FitGoM(t(counts), K=c(5), tol=100, num_trials = 100)

fits <- list(clust_2$fit, clust_3$fit, clust_4$fit, clust_5$fit)
membership <- lapply(fits, function(xx) {
     apply(xx$omega, 1, which.max)
  })
names(membership) <- paste0("clust_", c(2:5))

saveRDS(fits, file = "output/hmp.Rmd/hmp_updated_clusters_norm2.rds")
saveRDS(membership, file = "output/hmp.Rmd/hmp_updated_clusters_membership_norm2.rds")

Compare results between the two normalization.

membership <- readRDS(file = "output/hmp.Rmd/hmp_updated_clusters_membership.rds")
membership_norm2 <- readRDS(file = "output/hmp.Rmd/hmp_updated_clusters_membership_norm2.rds")

table(membership$clust_2, membership_norm2$clust_2)

GoM results

fits <- readRDS("../output/hmp.Rmd/hmp_updated_clusters_norm2.rds")
membership <- readRDS("../output/hmp.Rmd/hmp_updated_clusters_membership_norm2.rds")

pdf("output/hmp.Rmd/hmp_updated_clusts_structure_norm2.pdf")
for (i in 1:length(fits)) {
  annotation <- data.frame(
  sample_id = paste0("X", c(1:NROW(fits[[i]]$omega))),
  tissue_label = factor(membership[[i]],
                        levels = as.character(1:(i+1)) ) ) 
  rownames(fits[[i]]$omega) <- annotation$sample_id
  print(StructureGGplot(omega = fits[[i]]$omega,
                  annotation = annotation,
                  palette = RColorBrewer::brewer.pal(i+1, "Accent"),
                  yaxis_label = "Group",
                  order_sample = TRUE,
                  axis_tick = list(axis_ticks_length = .1,
                                   axis_ticks_lwd_y = .1,
                                   axis_ticks_lwd_x = .1,
                                   axis_label_size = 7,
                                   axis_label_face = "bold")) )
}
dev.off()

Taxonomical profile

library(reshape2)

obj <- readRDS(file="output/hmp.Rmd/hmp_updated_processed.rds")
membership <- readRDS("output/hmp.Rmd/hmp_updated_clusters_membership_norm2.rds")

# code from code/figs-paulson-04212018.R
pdf("output/hmp.Rmd/hmp_clusts_taxa.pdf")

pp <- plotBar(obj,lvl='Genus',cl=membership$clust_2,n=20)

# aggregated barplot
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
cols <- getPalette(21)
hmp_fig1 <- plotBar.aggregate(obj[,order(colnames(obj))],lvl='Genus',n=20,
                  plot.factor_levels = levels(pp$dd$variable))
hmp_fig1$p + scale_fill_manual(values = cols) + xlab('Samples') +
  guides(fill=guide_legend(ncol=1)) +
  theme(legend.position = "none")
hmp_fig1$p + scale_fill_manual(values = cols) + xlab('Samples') +
  guides(fill=guide_legend(ncol=1)) 
write.csv(hmp_fig1$dd,
            file= "output/hmp.Rmd/hmp_prop_agg.csv",
            row.names=F)

# barplot by cluster membership
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
cols <- getPalette(21)
hmp_fig2 <- plotBar(obj,lvl='Genus',cl=membership$clust_2,n=20)
hmp_fig2$p + scale_fill_manual(values = getPalette(21)) +
  guides(fill=guide_legend(ncol=1)) + 
  theme(legend.position = "none")
write.csv(hmp_fig2$dd,
            file= "output/hmp.Rmd/hmp_prop_agg_clust2.csv",
            row.names=F)


dev.off()

Session information

sessionInfo()

## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] CountClust_1.6.2     ggplot2_3.1.0        metagenomeSeq_1.21.1
##  [4] RColorBrewer_1.1-2   glmnet_2.0-16        foreach_1.4.4       
##  [7] Matrix_1.2-15        limma_3.34.9         Biobase_2.38.0      
## [10] BiocGenerics_0.24.0 
## 
## loaded via a namespace (and not attached):
##  [1] maptpx_1.9-5       Rcpp_1.0.0         ape_5.2           
##  [4] lattice_0.20-38    gtools_3.8.1       assertthat_0.2.0  
##  [7] rprojroot_1.3-2    digest_0.6.18      slam_0.1-43       
## [10] R6_2.3.0           plyr_1.8.4         backports_1.1.2   
## [13] stats4_3.4.1       evaluate_0.12      pillar_1.3.0      
## [16] gplots_3.0.1       rlang_0.3.0.1      lazyeval_0.2.1    
## [19] gdata_2.18.0       vegan_2.5-3        rmarkdown_1.10    
## [22] Rtsne_0.15         stringr_1.3.1      munsell_0.5.0     
## [25] compiler_3.4.1     pkgconfig_2.0.2    SQUAREM_2017.10-1 
## [28] mgcv_1.8-25        htmltools_0.3.6    nnet_7.3-12       
## [31] tidyselect_0.2.5   tibble_1.4.2       codetools_0.2-15  
## [34] matrixStats_0.54.0 mapplots_1.5.1     permute_0.9-4     
## [37] crayon_1.3.4       dplyr_0.7.8        withr_2.1.2       
## [40] MASS_7.3-51.1      bitops_1.0-6       grid_3.4.1        
## [43] nlme_3.1-137       gtable_0.2.0       magrittr_1.5      
## [46] scales_1.0.0       KernSmooth_2.23-15 stringi_1.2.4     
## [49] reshape2_1.4.3     flexmix_2.3-14     bindrcpp_0.2.2    
## [52] boot_1.3-20        cowplot_0.9.3      iterators_1.0.10  
## [55] tools_3.4.1        picante_1.7        glue_1.3.0        
## [58] purrr_0.2.5        yaml_2.2.0         colorspace_1.3-2  
## [61] cluster_2.0.7-1    caTools_1.17.1.1   knitr_1.20        
## [64] bindr_0.1.1        modeltools_0.2-22

This R Markdown site was created with workflowr

Applying GoM to HMP data

Joyce Hsiao

Summary

Preprocessing

GoM clustering

Session information