# libraries for sparse CCA library(PMA) library(ade4) library(genefilter) library(gtools) library(ggplot2) library(ggrepel) library(textshape) library(DESeq2) # libraries for sparse CCA library(PMA) library(ade4) library(genefilter) library(gtools) library(ggplot2) library(ggrepel) library(textshape) library(DESeq2) # import ASV table (eg "table_IDs_sterivex") microbe <- column_to_rownames(table_3, 'ID') # import NMR data # convert bins to rownames NMR <- column_to_rownames(Preprocessed_NMR_three, 'NMRBin') # convert all to matrices microbe <- as.matrix(microbe) NMR <- as.matrix(NMR) #### Sparse CCA ############# # filter out taxa that are 0 over multiple samples X = microbe[rowSums(microbe == 0) <= 6, ] Y = NMR[rowSums(NMR == 0) <= 6, ] # transform ASV table # option 1: log transform X = log(1 + X, base=10) Y = log(1 + Y, base=10) # option 2: vst (microbes only) X <- varianceStabilizingTransformation(X, blind = TRUE, fitType = "mean") Y <- scale(Y) # order columns so they match X <- X[ , mixedsort(colnames(X))] Y <- Y[ , mixedsort(colnames(Y))] # look for association between the two matrices using ade4 package pca1 = dudi.pca(t(Y), scal = TRUE, scann = FALSE) pca2 = dudi.pca(t(X), scal = TRUE, scann = FALSE) rv1 = RV.rtest(pca1$tab, pca2$tab, 999) # sparse CCA ccaRes = CCA(t(X), t(Y), penaltyx = 0.2, penaltyz = 0.05) ccaRes # to plot PCA triplot # extract significant taxa and metabolites sig_taxa <- rownames(X[ccaRes$u != 0, ]) sig_metab <- rownames(Y[ccaRes$v != 0, ]) # combine these for column name list columns <- c(sig_taxa, sig_metab) combined <- cbind(t(X[ccaRes$u != 0, ]), t(Y[ccaRes$v != 0, ])) colnames(combined) <- columns pca_res <- dudi.pca(combined, scannf = F, nf = 3) site <- rownames(pca_res$li) feature_type <- grepl("[a-z]", colnames(combined)) feature_type <- ifelse(feature_type, "ASV", "Metabolite") sample_info <- data.frame(pca_res$li, site) feature_info <- data.frame(pca_res$c1, feature = feature_type) ggplot() + geom_point(data = sample_info, aes(x = Axis1, y = Axis2, col = factor(mixedsort(as.numeric(site)))), size = 3) + geom_point(data = feature_info,aes(x = 10 * CS1, y = 10 * CS2, fill = feature), size = 2, shape = 23) + scale_color_brewer(palette = "Spectral") + scale_fill_manual(values = c("#a6d854", "#e78ac3")) + coord_fixed(sqrt(pca_res$eig[2] / pca_res$eig[2])) + labs(x = sprintf("Axis1 [%s%% Variance]", 100 * round(pca_res$eig[1] / sum(pca_res$eig), 2)), y = sprintf("Axis2 [%s%% Variance]", 100 * round(pca_res$eig[2] / sum(pca_res$eig), 2)), fill="Feature Type", col="Site") + theme_classic() # for labels geom_label_repel(data = feature_info, aes(x = 10 * CS1, y = 10 * CS2, label = rownames(feature_info), fill = feature_type), size = 2, segment.size = 0.3, label.padding = unit(0.1, "lines"), label.size = 0) + # save significant features as CSVs write.csv(sig_taxa, file="sCCA_vst_nmrscale_3_sigtaxa.csv") write.csv(sig_metab, file="sCCA_vst_nmrscale_3_metabs.csv")