Applying GoM to HMP data

Summary

Apply hierarchical clustering to cluster samples
Try different distance metrics
- Euclidean on logged and normed counts
- Center log ratio transform
- Bray-Curtis on sample proportions

Euclidean on logged and normed counts

On logged and normed counts. Complete and average distance give the same results.

library(metagenomeSeq)
MRobj <- readRDS("../data/nasal_filtered_normed_batchcorrected.rds")
counts_logged <- MRcounts(MRobj,norm=T,log=T)
colnames(counts_logged) <- colnames(MRobj)

dist_mat <- dist(t(counts_logged), method="euclidean")
hclust_avg <- hclust(dist_mat, method = 'complete')
cut_avg_1 <- cutree(hclust_avg, k = 2)
table(cut_avg_1)

## cut_avg_1
##   1   2 
## 190   7

plot(hclust_avg)
rect.hclust(hclust_avg , k = 2, border = c(2,4))

#saveRDS(cut_avg_1, file = "output/cluster-alternative-methods.Rmd/cut_res_1.rds")

Total tax count and cluster membership

plot(cut_avg_1, colSums(MRcounts(MRobj,norm=T,log=F)))

t.test(colSums(MRcounts(MRobj,norm=T,log=F)) ~ cut_avg_1)

## 
##  Welch Two Sample t-test
## 
## data:  colSums(MRcounts(MRobj, norm = T, log = F)) by cut_avg_1
## t = 1.1585, df = 25.818, p-value = 0.2573
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -21945.83  78589.22
## sample estimates:
## mean in group 1 mean in group 2 
##        142283.4        113961.7

Center log ratio transform

On normed counts.

library(metagenomeSeq)
MRobj <- readRDS("../data/nasal_filtered_normed_batchcorrected.rds")
counts <- MRcounts(MRobj,norm=T,log=F)
colnames(counts) <- colnames(MRobj)

library(compositions)
counts_clr <- t(counts)

# use cutree to determine sample cluster membership
dist_mat <- dist(counts_clr, method="euclidean")
hclust_avg <- hclust(dist_mat, method = 'complete')
cut_avg_2 <- cutree(hclust_avg, k = 2)
table(cut_avg_2)

## cut_avg_2
##   1   2 
## 196   1

plot(hclust_avg)
rect.hclust(hclust_avg , k = 2, border = c(2,4))

#saveRDS(cut_avg_2, file = "output/cluster-alternative-methods.Rmd/cut_res_2.rds")

Bray-Curtis on sample proportions

library(metagenomeSeq)
MRobj <- readRDS("../data/nasal_filtered_normed_batchcorrected.rds")
counts <- MRcounts(MRobj,norm=T,log=F)
colnames(counts) <- colnames(MRobj)

library(vegan)
mat_prop <- t(t(counts)/colSums(counts))
beta_bray <- vegdist(t(mat_prop), method = "bray", upper=T)

# use cutree to determine sample cluster membership
#dist_mat <- dist(t(counts_clr), method="euclidean")
hclust_avg <- hclust(beta_bray, method = 'complete')
cut_avg_3 <- cutree(hclust_avg, k = 2)
table(cut_avg_3)

## cut_avg_3
##   1   2 
## 161  36

plot(hclust_avg)
rect.hclust(hclust_avg , k = 2, border = c(2,4))

#saveRDS(cut_avg_3, file = "output/cluster-alternative-methods.Rmd/cut_res_3.rds")

Output hierarchical clustereing results.

cut_avg_1 <- readRDS(file = "../output/cluster-alternative-methods.Rmd/cut_res_1.rds")
cut_avg_2 <- readRDS(file = "../output/cluster-alternative-methods.Rmd/cut_res_2.rds")
cut_avg_3 <- readRDS(file = "../output/cluster-alternative-methods.Rmd/cut_res_3.rds")

all.equal(names(cut_avg_1), names(cut_avg_2))

## [1] TRUE

all.equal(names(cut_avg_1), names(cut_avg_3))

## [1] TRUE

df <- data.frame(sample=names(cut_avg_1),
                 method1=cut_avg_1,
                 method2=cut_avg_2,
                 method3=cut_avg_3)

# write.csv(df,
#             file= "../output-manuscript/supp-clusterig.csv",
#             quote=F, row.names=F)

table(df$method1)

## 
##   1   2 
## 190   7

table(df$method2)

## 
##   1   2 
## 196   1

table(df$method3)

## 
##   1   2 
## 161  36

table(df$method1, df$method3)

##    
##       1   2
##   1 154  36
##   2   7   0

table(df$method1, df$method3)

##    
##       1   2
##   1 154  36
##   2   7   0

obj <- readRDS("../data/nasal_GOM.rds")
gom <- pData(obj)$GOM
names(gom) <- rownames(pData(obj))

table(gom, df$method1)

##    
## gom   1   2
##   1 167   0
##   2  23   7

table(gom, df$method2)

##    
## gom   1   2
##   1 167   0
##   2  29   1

table(gom, df$method3)

##    
## gom   1   2
##   1 132  35
##   2  29   1

Session information

sessionInfo()

## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] vegan_2.5-3          lattice_0.20-38      permute_0.9-4       
##  [4] compositions_1.40-2  bayesm_3.1-0.1       energy_1.7-5        
##  [7] robustbase_0.93-3    tensorA_0.36.1       metagenomeSeq_1.21.1
## [10] RColorBrewer_1.1-2   glmnet_2.0-16        foreach_1.4.4       
## [13] Matrix_1.2-15        limma_3.34.9         Biobase_2.38.0      
## [16] BiocGenerics_0.24.0 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.0         compiler_3.4.1     DEoptimR_1.0-8    
##  [4] bitops_1.0-6       iterators_1.0.10   tools_3.4.1       
##  [7] boot_1.3-20        digest_0.6.18      nlme_3.1-137      
## [10] evaluate_0.12      mgcv_1.8-25        yaml_2.2.0        
## [13] cluster_2.0.7-1    stringr_1.3.1      knitr_1.20        
## [16] gtools_3.8.1       caTools_1.17.1.1   rprojroot_1.3-2   
## [19] grid_3.4.1         rmarkdown_1.10     gdata_2.18.0      
## [22] magrittr_1.5       MASS_7.3-51.1      backports_1.1.2   
## [25] gplots_3.0.1       codetools_0.2-15   matrixStats_0.54.0
## [28] htmltools_0.3.6    KernSmooth_2.23-15 stringi_1.2.4

This R Markdown site was created with workflowr

Applying GoM to HMP data

Joyce Hsiao

Summary

Euclidean on logged and normed counts

Center log ratio transform

Bray-Curtis on sample proportions

Output hierarchical clustereing results.

Session information