Data curated by JNP. (data/anterior_nares.rds received on 2019/04/09)
Filtering
library(metagenomeSeq)
library(CountClust)
library(metagenomeSeq)
MRobj <- readRDS("data/anterior_nares.rds")
# filtering ------
MRobj_count <- MRcounts(MRobj, norm=F, log=F)
# keep features present (>0 read) in at least 1% of the samples
keep_rowmeans <- rowSums(MRobj_count > 0) >= ncol(MRobj_count)*.01
# keep features > 5 read in at least one sample
summary(c(MRobj_count))
keep_reads <- rowSums(MRobj_count > 5) > 0
keep_features <- keep_rowmeans | keep_reads
# filtering data
MRobj_filtered <- MRobj[keep_features,]
# filter sample
keep_sample <- colMeans(MRcounts(MRobj_filtered)>0) > .01
MRobj_filtered <- MRobj_filtered[,keep_sample]
dim(MRobj_filtered)
Batch correction: Correcr for run center effect
table(pData(MRobj_filtered)$RUNCENTER)
pca_res <- prcomp(t(MRcounts(MRobj_filtered, log = T)), scale=T)
head((pca_res$sdev^2)/sum(pca_res$sdev^2))
plot(pca_res$x[,1],pca_res$x[,2],
col=as.integer(pData(MRobj_filtered)$visitno))
plot(pca_res$x[,1],pca_res$x[,2],
col=as.integer(pData(MRobj_filtered)$RUNCENTER))
for (i in 1:2) {
print(summary(lm(pca_res$x[,i] ~ as.factor(pData(MRobj_filtered)$visitno))))
}
for (i in 1:2) {
print(summary(lm(pca_res$x[,i] ~ as.factor(pData(MRobj_filtered)$RUNCENTER))))
}
for (i in 1:2) {
print(summary(lm(pca_res$x[,i] ~ as.factor(pData(MRobj_filtered)$sex))))
}
Output filtered, normalized, batch-corrected counts.
obj <- MRobj_filtered
obj@expSummary$expSummary$normFactors <- normFactors(MRobj_filtered)
counts <- t(MRcounts(obj,norm=FALSE,log=FALSE))
batchcorrect_counts <- BatchCorrectedCounts(counts,
batch_lab = factor(pData(MRobj_filtered)$RUNCENTER),
use_parallel=FALSE)
counts <- t(batchcorrect_counts)
assayData(obj)[["counts"]] = counts
saveRDS(obj, file="output/hmp.Rmd/hmp_updated_processed.rds")
Try 100 random seeds.
Compare counts after library size normalization only and after cumulative normalization + library size normalization.
obj <- readRDS(file="output/hmp.Rmd/hmp_updated_processed.rds")
# library size normalization via CountClust ---------------------------
library(CountClust)
# fit GoM
#MRobj_species <- aggTax(obj,lvl="species")
counts <- MRcounts(obj,norm=FALSE,log=FALSE)
clust_2 <- FitGoM(t(counts), K=c(2), tol=100, num_trials = 100)
clust_3 <- FitGoM(t(counts), K=c(3), tol=100, num_trials = 100)
clust_4 <- FitGoM(t(counts), K=c(4), tol=100, num_trials = 100)
clust_5 <- FitGoM(t(counts), K=c(5), tol=100, num_trials = 100)
fits <- list(clust_2$fit, clust_3$fit, clust_4$fit, clust_5$fit)
membership <- lapply(fits, function(xx) {
apply(xx$omega, 1, which.max)
})
names(membership) <- paste0("clust_", c(2:5))
saveRDS(fits, file = "output/hmp.Rmd/hmp_updated_clusters.rds")
saveRDS(membership, file = "output/hmp.Rmd/hmp_updated_clusters_membership.rds")
# cumNorm plus library size normalization ----------------------------
library(CountClust)
# fit GoM
#MRobj_species <- aggTax(obj,lvl="species")
obj <- readRDS(file="output/hmp.Rmd/hmp_updated_processed.rds")
counts <- MRcounts(obj,norm=T,log=FALSE)
clust_2 <- FitGoM(t(counts), K=c(2), tol=100, num_trials = 100)
clust_3 <- FitGoM(t(counts), K=c(3), tol=100, num_trials = 100)
clust_4 <- FitGoM(t(counts), K=c(4), tol=100, num_trials = 100)
clust_5 <- FitGoM(t(counts), K=c(5), tol=100, num_trials = 100)
fits <- list(clust_2$fit, clust_3$fit, clust_4$fit, clust_5$fit)
membership <- lapply(fits, function(xx) {
apply(xx$omega, 1, which.max)
})
names(membership) <- paste0("clust_", c(2:5))
saveRDS(fits, file = "output/hmp.Rmd/hmp_updated_clusters_norm2.rds")
saveRDS(membership, file = "output/hmp.Rmd/hmp_updated_clusters_membership_norm2.rds")
Compare results between the two normalization.
membership <- readRDS(file = "output/hmp.Rmd/hmp_updated_clusters_membership.rds")
membership_norm2 <- readRDS(file = "output/hmp.Rmd/hmp_updated_clusters_membership_norm2.rds")
table(membership$clust_2, membership_norm2$clust_2)
GoM results
fits <- readRDS("../output/hmp.Rmd/hmp_updated_clusters_norm2.rds")
membership <- readRDS("../output/hmp.Rmd/hmp_updated_clusters_membership_norm2.rds")
pdf("output/hmp.Rmd/hmp_updated_clusts_structure_norm2.pdf")
for (i in 1:length(fits)) {
annotation <- data.frame(
sample_id = paste0("X", c(1:NROW(fits[[i]]$omega))),
tissue_label = factor(membership[[i]],
levels = as.character(1:(i+1)) ) )
rownames(fits[[i]]$omega) <- annotation$sample_id
print(StructureGGplot(omega = fits[[i]]$omega,
annotation = annotation,
palette = RColorBrewer::brewer.pal(i+1, "Accent"),
yaxis_label = "Group",
order_sample = TRUE,
axis_tick = list(axis_ticks_length = .1,
axis_ticks_lwd_y = .1,
axis_ticks_lwd_x = .1,
axis_label_size = 7,
axis_label_face = "bold")) )
}
dev.off()
Taxonomical profile
library(reshape2)
obj <- readRDS(file="output/hmp.Rmd/hmp_updated_processed.rds")
membership <- readRDS("output/hmp.Rmd/hmp_updated_clusters_membership_norm2.rds")
# code from code/figs-paulson-04212018.R
pdf("output/hmp.Rmd/hmp_clusts_taxa.pdf")
pp <- plotBar(obj,lvl='Genus',cl=membership$clust_2,n=20)
# aggregated barplot
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
cols <- getPalette(21)
hmp_fig1 <- plotBar.aggregate(obj[,order(colnames(obj))],lvl='Genus',n=20,
plot.factor_levels = levels(pp$dd$variable))
hmp_fig1$p + scale_fill_manual(values = cols) + xlab('Samples') +
guides(fill=guide_legend(ncol=1)) +
theme(legend.position = "none")
hmp_fig1$p + scale_fill_manual(values = cols) + xlab('Samples') +
guides(fill=guide_legend(ncol=1))
write.csv(hmp_fig1$dd,
file= "output/hmp.Rmd/hmp_prop_agg.csv",
row.names=F)
# barplot by cluster membership
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
cols <- getPalette(21)
hmp_fig2 <- plotBar(obj,lvl='Genus',cl=membership$clust_2,n=20)
hmp_fig2$p + scale_fill_manual(values = getPalette(21)) +
guides(fill=guide_legend(ncol=1)) +
theme(legend.position = "none")
write.csv(hmp_fig2$dd,
file= "output/hmp.Rmd/hmp_prop_agg_clust2.csv",
row.names=F)
dev.off()
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] CountClust_1.6.2 ggplot2_3.1.0 metagenomeSeq_1.21.1
## [4] RColorBrewer_1.1-2 glmnet_2.0-16 foreach_1.4.4
## [7] Matrix_1.2-15 limma_3.34.9 Biobase_2.38.0
## [10] BiocGenerics_0.24.0
##
## loaded via a namespace (and not attached):
## [1] maptpx_1.9-5 Rcpp_1.0.0 ape_5.2
## [4] lattice_0.20-38 gtools_3.8.1 assertthat_0.2.0
## [7] rprojroot_1.3-2 digest_0.6.18 slam_0.1-43
## [10] R6_2.3.0 plyr_1.8.4 backports_1.1.2
## [13] stats4_3.4.1 evaluate_0.12 pillar_1.3.0
## [16] gplots_3.0.1 rlang_0.3.0.1 lazyeval_0.2.1
## [19] gdata_2.18.0 vegan_2.5-3 rmarkdown_1.10
## [22] Rtsne_0.15 stringr_1.3.1 munsell_0.5.0
## [25] compiler_3.4.1 pkgconfig_2.0.2 SQUAREM_2017.10-1
## [28] mgcv_1.8-25 htmltools_0.3.6 nnet_7.3-12
## [31] tidyselect_0.2.5 tibble_1.4.2 codetools_0.2-15
## [34] matrixStats_0.54.0 mapplots_1.5.1 permute_0.9-4
## [37] crayon_1.3.4 dplyr_0.7.8 withr_2.1.2
## [40] MASS_7.3-51.1 bitops_1.0-6 grid_3.4.1
## [43] nlme_3.1-137 gtable_0.2.0 magrittr_1.5
## [46] scales_1.0.0 KernSmooth_2.23-15 stringi_1.2.4
## [49] reshape2_1.4.3 flexmix_2.3-14 bindrcpp_0.2.2
## [52] boot_1.3-20 cowplot_0.9.3 iterators_1.0.10
## [55] tools_3.4.1 picante_1.7 glue_1.3.0
## [58] purrr_0.2.5 yaml_2.2.0 colorspace_1.3-2
## [61] cluster_2.0.7-1 caTools_1.17.1.1 knitr_1.20
## [64] bindr_0.1.1 modeltools_0.2-22
This R Markdown site was created with workflowr