setwd("D:\\URF_viruses\\reply_peerj\\scripts_raw_data") ##This pipeline was assembled by Edgar E. Lara-Ramírez ##the adapted code sources are specified in the corresponding sections library("seqinr") library("Peptides") library("protr") library("adegenet") library("gplots") library("ComplexHeatmap") library("caret") library("randomForest") library("doParallel") library("FactoMineR") library("factoextra") library("cluster") library("plotly") library("Rtsne") library("umap") library("reshape2") library("ggplot2") require("gridExtra") library("gridGraphics") library("vcd") library("cowplot") library("RColorBrewer") suppressPackageStartupMessages(library(keras)) ##read alignment maftaln<- read.alignment("zika_aln.fasta", format ="fasta") ##extracting polymorphic aminoacids acpzika<-alignment2genind(maftaln) acpzikatab<-tab(acpzika) #extract pairwise amino acid frequencies coded in 0 and 1 sapply(acpzikatab, function(x) sum(is.na(x)))#countig NAs acpzikatab[is.na(acpzikatab)] <- 0 #replacing nas #setting colnames new_colnames <- make.names(gsub("\\.", "_", colnames(acpzikatab))) colnames(acpzikatab) <- new_colnames acpzikatab<-as.data.frame(acpzikatab) ###permutation # Function to permute the rows of each column permute_columns <- function(data) { permuted_data <- apply(data, 2, function(column) sample(column)) return(as.data.frame(permuted_data)) } # Create the synthetic dataset synthetic_data <- permute_columns(acpzikatab) # Verify the synthetic data head(synthetic_data) # Assign column names to synthetic data colnames(synthetic_data) <- colnames(acpzikatab) # Validate and compare synthetic_data with original_data summary(acpzikatab) summary(synthetic_data) # Add row labels to original_data and synthetic_data original_data <- cbind(acpzikatab, dataset_type = "original") synthetic_data <- cbind(synthetic_data, dataset_type = "synthetic") # Concatenate the datasets combined_data <- as.data.frame(rbind(original_data, synthetic_data)) combined_data$dataset_type=as.factor(combined_data$dataset_type) #Hyperparameter tunning #https://rpubs.com/phamdinhkhanh/389752 ##CUSTOM RF https://machinelearningmastery.com/tune-machine-learning-algorithms-in-r/ customRF <- list(type = "Classification", library = "randomForest", loop = NULL) customRF$parameters <- data.frame(parameter = c("mtry", "ntree"), class = rep("numeric", 2), label = c("mtry", "ntree")) customRF$grid <- function(x, y, len = NULL, search = "grid") {} customRF$fit <- function(x, y, wts, param, lev, last, weights, classProbs) { randomForest(x, y, mtry = param$mtry, ntree=param$ntree) } #Predict label customRF$predict <- function(modelFit, newdata, preProc = NULL, submodels = NULL) predict(modelFit, newdata) #Predict prob customRF$prob <- function(modelFit, newdata, preProc = NULL, submodels = NULL) predict(modelFit, newdata, type = "prob") customRF$sort <- function(x) x[order(x[,1]),] customRF$levels <- function(x) x$classes #create tunegrid tunegrid <- expand.grid(.mtry = c(10, 20, 40, 80, 160), .ntree = c(250, 500, 1000, 1500, 2000)) # Set up train control with 5-fold cross-validation ctrl <- trainControl(method='repeatedcv', number = 5, repeats=3, verboseIter = TRUE) metric<-"Accuracy" # Train the random forest model with tuning start.time <- Sys.time() set.seed(123) rf_gridsearch<- train(dataset_type ~ ., data = combined_data, method = customRF, metric=metric, tuneGrid = tunegrid, trControl = ctrl) print(rf_gridsearch) end_time <- Sys.time() execution_time <- end_time - start.time # Print the best model parameters print(rf_gridsearch$bestTune) pdf(file="Figure_S1_hyperparameters.pdf", width=8, height=7,paper="special") plot(rf_gridsearch) dev.off() ##constructing URF to validate by means of Out of bag modelRF_combined <- randomForest(dataset_type~., data=combined_data, mtry = 10, ntree = 1500, proximity = TRUE) #mtry 422/3 +1 ##Unsupervised random forest #this chunk of code was produced with help of the following web sources #https://gradientdescending.com/unsupervised-random-forest-example/ #https://stats.stackexchange.com/questions/72370/how-to-perform-unsupervised-random-forest-classification-using-breimans-code ## The tSNE, UMAP and AE algorithm uses random data, thus clustering patterns may vary from run to run ## see the following link for more details https://cran.r-project.org/web/packages/umap/vignettes/umap.html#stability-and-reproducibility start.time2 <- Sys.time() modelhost <- randomForest(acpzikatab, mtry = 10, ntree = 1500, proximity = TRUE) end_time2 <- Sys.time() execution_time2 <- end_time2 - start.time2 distaln.matrix <- as.dist(1-modelhost$proximity) cl= get_clust_tendency(as.matrix(distaln.matrix), n = 49, graph = T) protres <- read.table("Metadata_zika.txt", sep = "\t", quote="", header = TRUE, row.names = 1) length(protres) #Ensure row names are the same distaln2.matrix<-as.matrix(distaln.matrix) distaln2.df<-as.data.frame(distaln2.matrix) # Add row names as a column to both data frames distaln2.df$ID <- rownames(distaln2.df) protres$ID <- rownames(protres) distaln2.df<- merge(distaln2.df, protres, by="ID") rownames(distaln2.df) <- distaln2.df$ID distaln2.df$ID <- NULL # The Colorblind-friendly palettes style muted nine https://github.com/JLSteenwyk/ggpubfigs: cbPalette <- c("#332288", "#117733", "#CC6677", "#88CCEE", "#999933", "#882255", "#44AA99", "#DDCC77", "#AA4499") ##setting data for heatmap labeling ##setting data for heatmap labeling set.seed(123) segments2 <- c(distaln2.df$Region) segments3 <- c(distaln2.df$Host1) segment_labels2 <- matrix(segments2, ncol = 1) segment_labels3 <- matrix(segments3, ncol = 1) segment_colors2 <- c("Africa" = "#332288", "Asia" = "#117733", "Europe"= "#CC6677", "North_America" = "#88CCEE", "Oceania" = "#999933", "South_America" = "#882255" ) segment_colors3 <- c("Homo_sapiens" = "#44AA99", "Mosquito" = "#DDCC77", "Primate"= "#AA4499" ) colors2 <- segment_colors2[segment_labels2] colors3 <- segment_colors2[segment_labels3] combined_row_colors <- cbind(colors2, colors3) # Create row annotations row_ha <- rowAnnotation( Group1 = segment_labels2, Group2 = segment_labels3, col = list( Group1 = segment_colors2, Group2 = segment_colors3 ) ) # Plot the heatmap with row annotations pdf(file="Figure_2_heatmap_color.pdf", width=8, height=7,paper="special") Heatmap(as.matrix(distaln.matrix), name = "Proximity matrix", col =cbPalette, show_row_names = FALSE, # Omit row names show_column_names = FALSE, # Omit column names cluster_rows = TRUE, # Omit row dendrogram cluster_columns = TRUE, # Omit column dendrogram right_annotation = row_ha) dev.off() ##PCA first exploration # this code was adapted from the following web source #https://www.r-bloggers.com/2018/07/pca-vs-autoencoders-for-dimensionality-reduction/ start.time3 <- Sys.time() zika.pca <- prcomp(distaln2.matrix, scale = FALSE) end_time3 <- Sys.time() execution_time3 <- end_time3 - start.time3 pca_scores <- as.data.frame(zika.pca$x) # Add row names as a column to both data frames pca_scores$ID <- rownames(pca_scores) protres$ID <- rownames(protres) pca_scores2<-pca_scores[,c(1:3,294)] pca_scores2<- merge(pca_scores2, protres, by="ID") rownames(pca_scores2) <- pca_scores2$ID pca_scores2$ID <- NULL #identification of number of clusters f3a<- fviz_nbclust(pca_scores2[,1:2], FUNcluster=kmeans, k.max = 8) + labs(y = "Ave. silhouette width") + ggtitle("Optimal clusters for PCA") # Apply k-means clustering set.seed(123) # For reproducibility #km_pca <- kmeans(pca_scores2[,1:2], centers = 3, nstart = 25) km_pca<-eclust(pca_scores2[,1:2], "kmeans", hc_metric="euclidean",k=8) f3b<- fviz_cluster(km_pca, data=pca_scores2[,1:2], geom = "point", ellipse.type = "norm", palette = "jco", ggtheme = theme_minimal()) + labs(x = "Dim 1", y = "Dim 2") + ggtitle("PCA Cluster Plot") #table(pca_scores2$Region, km_pca$cluster) ##Cluster statistics for k-means clustering #library(fpc) #library(clValid) # Compute pairwise-distance matrices #dd <- dist(pca_scores2[,1:2], method ="euclidean") # Statistics for k-means clustering #km_stats <- cluster.stats(dd, km_pca$cluster) #cat("Dunn Index:", km_stats, "\n") # (k-means) within clusters sum of squares #External validation by labeling #External validation by labeling f4a<- ggplot(pca_scores2, aes(x = PC1, y = PC2, colour = Host1, shape=Host1)) + geom_point(size = 2) + theme(panel.background = element_blank()) + theme(legend.text = element_text(size = 9)) + ggtitle("PCA Host")+ labs(x = "Dim 1", y = "Dim 2") + scale_color_manual(values = cbPalette)+ scale_shape_manual(values = c(8, 3, 11)) f4b<- ggplot(pca_scores2, aes(x = PC1, y = PC2, colour = Region, shape=Region )) + geom_point(size = 2) + #geom_point(data = subset(pca_scores2, PC1 == 2 & PC2 == 3), # aes(x = PC1, y = PC2), size = 4, shape = 17, colour = "blue")+ theme(panel.background = element_blank()) + ggtitle("PCA Region")+ labs(x = "Dim 1", y = "Dim 2")+ scale_color_manual(values = cbPalette) + scale_shape_manual(values = c(17, 18, 24, 3, 11, 8)) #ggplotly(f4b) #for detailed exploration ##Creating and ploting tsne results set.seed(123) # for reproducibility start.time4 <- Sys.time() tsne <- Rtsne(distaln2.matrix, dims = 2, perplexity=50, PCA=FALSE, verbose=TRUE, max_iter = 1000, num_threads = 12) end_time4 <- Sys.time() execution_time4 <- end_time4 - start.time4 tsne_data <- as.data.frame(tsne$Y) rownames(tsne_data) <- rownames(distaln2.df) # Add row names as a column to both data frames tsne_data$ID <- rownames(tsne_data) tsne_data<- merge(tsne_data, protres, by="ID") rownames(tsne_data) <- tsne_data$ID tsne_data$ID <- NULL #identification of number of clusters f3c<- fviz_nbclust(tsne_data[,1:2], FUNcluster=kmeans, k.max = 8)+ labs(y = "Ave. silhouette width") + ggtitle("Optimal clusters for t-SNE") km_tsne<-eclust(tsne_data[,1:2], "kmeans", hc_metric="euclidean",k=4) f3d<- fviz_cluster(km_tsne, geom = "point", ellipse.type = "norm", palette = "jco", ggtheme = theme_minimal()) + labs(x = "Dim 1", y = "Dim 2") + ggtitle("t-SNE Cluster Plot") table(tsne_data$Region, km_tsne$cluster) ##External validation by labeling f4c<- ggplot(tsne_data, aes(x = V1, y = V2, colour = Host1, shape=Host1)) + geom_point(size = 2) + theme(panel.background = element_blank()) + theme(legend.text = element_text(size = 9)) + ggtitle("t-SNE Host")+ labs(x = "Dim 1", y = "Dim 2") + scale_color_manual(values = cbPalette)+ scale_shape_manual(values = c(8, 3, 11)) f4d<- ggplot(tsne_data, aes(x = V1, y = V2, colour = Region, shape=Region )) + geom_point(size = 2) + #geom_point(data = subset(pca_scores2, PC1 == 2 & PC2 == 3), # aes(x = PC1, y = PC2), size = 4, shape = 17, colour = "blue")+ theme(panel.background = element_blank()) + ggtitle("t-SNE Region")+ labs(x = "Dim 1", y = "Dim 2")+ scale_color_manual(values = cbPalette) + scale_shape_manual(values = c(17, 18, 24, 3, 11, 8)) #ggplotly(f4d) #ggplotly(f4d) ###UMAP dimensional reduction library(umap) start.time5<- start.time <- Sys.time() zika.umap <- umap(as.matrix(distaln2.matrix), input="dist") end_time5 <- Sys.time() execution_time5 <- end_time5 - start.time5 umap_data <- as.data.frame(zika.umap$layout) rownames(umap_data) <- rownames(umap_data) # Add row names as a column to both data frames umap_data$ID <- rownames(umap_data) umap_data<- merge(umap_data, protres, by="ID") rownames(umap_data) <- umap_data$ID umap_data$ID <- NULL #identification of number of clusters f3e<- fviz_nbclust(umap_data[,1:2], FUNcluster=kmeans, k.max = 8) + labs(y = "Ave. silhouette width") + ggtitle("Optimal clusters for UMAP") km_umap<-eclust(umap_data[,1:2], "kmeans", hc_metric="euclidean",k=6) f3f<- fviz_cluster(km_umap, geom = "point", ellipse.type = "norm", palette = "jco", ggtheme = theme_minimal()) + labs(x = "Dim 1", y = "Dim 2") + ggtitle("UMAP Cluster Plot") table(umap_data$Region, km_umap$cluster) #External validation f4e<- ggplot(umap_data, aes(x = V1, y = V2, colour = Host1, shape=Host1)) + geom_point(size = 2) + theme(panel.background = element_blank()) + theme(legend.text = element_text(size = 9)) + ggtitle("UMAP Host")+ labs(x = "Dim 1", y = "Dim 2") + scale_color_manual(values = cbPalette)+ scale_shape_manual(values = c(8, 3, 11)) f4f<- ggplot(umap_data, aes(x = V1, y = V2, colour = Region, shape=Region )) + geom_point(size = 2) + #geom_point(data = subset(pca_scores2, PC1 == 2 & PC2 == 3), # aes(x = PC1, y = PC2), size = 4, shape = 17, colour = "blue")+ theme(panel.background = element_blank()) + ggtitle("UMAP Region")+ labs(x = "Dim 1", y = "Dim 2")+ scale_color_manual(values = cbPalette) + scale_shape_manual(values = c(17, 18, 24, 3, 11, 8)) #ggplotly(f4f) # autoencoder in keras # this code was taken from the following web source #https://www.r-bloggers.com/2018/07/pca-vs-autoencoders-for-dimensionality-reduction/ # set training data set.seed(42) # for reproducibility x_train <- as.matrix(distaln2.matrix) #back to 293 x 293 proximity # set model model <- keras_model_sequential() model %>% layer_dense(units = 12, activation = "tanh", input_shape = ncol(x_train)) %>% layer_dense(units = 6, activation = "tanh", name = "bottleneck") %>% layer_dense(units = 12, activation = "tanh") %>% layer_dense(units = ncol(x_train)) # view model layers summary(model) # compile model model %>% compile( loss = "mean_squared_error", optimizer = "adam" ) start.time6 <- Sys.time() # fit model model %>% fit( x = x_train, y = x_train, epochs = 2000, verbose = 0 ) end_time6 <- Sys.time() execution_time6 <- end_time6 - start.time6 # evaluate the performance of the model mse.ae2 <- evaluate(model, x_train, x_train) mse.ae2 # extract the bottleneck layer intermediate_layer_model <- keras_model(inputs = model$input, outputs = get_layer(model, "bottleneck")$output) intermediate_output <- predict(intermediate_layer_model, x_train) ae_data <- as.data.frame(intermediate_output) rownames(ae_data) <- rownames(distaln2.df) # Add row names as a column to both data frames ae_data$ID <- rownames(ae_data) ae_data<- merge(ae_data, protres, by="ID") rownames(ae_data) <- ae_data$ID ae_data$ID <- NULL #cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # To use for fills, add #scale_fill_manual(values=cbPalette) # To use for line and point colors, add #scale_colour_manual(values=cbPalette) #identification of number of clusters f3g<- fviz_nbclust(ae_data[,1:2], FUNcluster=kmeans, k.max = 8) + labs(y = "Ave. silhouette width") + ggtitle("Optimal clusters for Autoencoders") km_ae<-eclust(ae_data[,1:2], "kmeans", hc_metric="euclidean",k=3) f3h<- fviz_cluster(km_ae, geom = "point", ellipse.type = "norm", palette = "jco", ggtheme = theme_minimal())+ labs(x = "Dim 1", y = "Dim 2") + ggtitle("Autoencoders Cluster Plot") #table(ae_data$Region, km_umap$cluster) #External validation f4g<- ggplot(ae_data, aes(x = V1, y = V2, colour = Host1, shape=Host1)) + geom_point(size = 2) + theme(panel.background = element_blank()) + theme(legend.text = element_text(size = 9)) + ggtitle("Autoencoders Host")+ labs(x = "Dim 1", y = "Dim 2") + scale_color_manual(values = cbPalette)+ scale_shape_manual(values = c(8, 3, 11)) f4h<- ggplot(ae_data, aes(x = V1, y = V2, colour = Region, shape=Region )) + geom_point(size = 2) + #geom_point(data = subset(pca_scores2, PC1 == 2 & PC2 == 3), # aes(x = PC1, y = PC2), size = 4, shape = 17, colour = "blue")+ theme(panel.background = element_blank()) + ggtitle("Autoencoders Region")+ labs(x = "Dim 1", y = "Dim 2")+ scale_color_manual(values = cbPalette) + scale_shape_manual(values = c(17, 18, 24, 3, 11, 8)) #ggplotly(f4h) ##Figure3 pdf(file="Figure_3_cluster_1500t_2.pdf", width=8, height=10,paper="special") plot_grid(f3a, f3b, f3c, f3d, f3e, f3f, f3g, f3h, labels = c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'), label_size = 12, ncol = 2) dev.off() ##figure4 pdf(file="Figure_4_label_1500t_2.pdf", width=8, height=10,paper="special") plot_grid(f4a, f4b, f4c, f4d, f4e, f4f, f4g, f4h, labels = c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'), label_size = 12, ncol = 2) dev.off() ##Dimension data organization for phylosignal # Rename columns library(dplyr) library(tibble) library(purrr) #colnames(pca_scores2) <- paste0(colnames(pca_scores2), "_df1") colnames(tsne_data) <- paste0(colnames(tsne_data), "_df2") colnames(umap_data) <- paste0(colnames(umap_data), "_df3") colnames(ae_data) <- paste0(colnames(ae_data), "_df4") pca_scores2 <- rownames_to_column(pca_scores2, "rowname") tsne_data <- rownames_to_column(tsne_data, "rowname") umap_data <- rownames_to_column(umap_data, "rowname") ae_data <- rownames_to_column(ae_data, "rowname") merged_df <- reduce(list(pca_scores2, tsne_data, umap_data, ae_data), full_join, by = "rowname") merged_df <- column_to_rownames(merged_df, "rowname") # Select specific columns selected_columns <- c("PC1", "PC2", "V1_df2", "V2_df2", "V1_df3","V2_df3", "V1_df4", "V2_df4") # Create new data frame with selected columns new_df <- merged_df[, selected_columns] # Check the result head(new_df) # Save objects save(data, pca_result, umap_result, kmeans_result, umap_df, file = "my_objects1500.RData") # Save specific results as CSV and RDS write.csv(pca_scores2, file = "pca_scores21500.csv") write.csv(tsne_data, file = "tsne_data1500.csv") write.csv(umap_data, file = "umap_data1500.csv") write.csv(ae_data, file = "ae_data1500.csv") write.csv(new_df, file = "new_df1500.csv") saveRDS(modelhost, file= "modelhost1500.rds") saveRDS(zika.pca, file = "pca_result1500.rds") saveRDS(tsne, file = "tsne_result1500.rds") saveRDS(zika.umap, file = "umap_result1500.rds") saveRDS(intermediate_output, file = "ae_result1500.rds") # Save session information saveRDS(sessionInfo(), file = "session_info1500.rds") library("adephylo") library("phylobase") library("picante") library("phylosignal") badTree <- ape::read.tree(file="zikv_iqtree.treefile") t1_4d<-phylo4(badTree, check.node.labels="drop") p4d <- phylo4d(badTree, new_df) #barplot.phylo4d(p4d, tree.type = "phylo", tree.ladderize = TRUE, show.tip = FALSE) #dotplot.phylo4d(p4d, tree.type = "phylo", tree.ladderize = TRUE, show.tip = FALSE) #gridplot.phylo4d(p4d, tree.type = "phylo", tree.ladderize = TRUE, show.tip = FALSE) phyloSignal(p4d = p4d, method = "Cmean") #Abouheif’s method phyloSignal(p4d = p4d, method = "Lambda") #Lambda method start.time7 <- Sys.time() PCA_Ax <- phyloCorrelogram(p4d[,1:2]) tSNE_Ax <- phyloCorrelogram(p4d[,3:4]) UMAP_Ax <- phyloCorrelogram(p4d[,5:6]) AE_Ax <- phyloCorrelogram(p4d[,7:8]) end_time7 <- Sys.time() execution_time7 <- end_time7 - start.time7 pdf(file="Figure_7_phylo_final_2.pdf", width=8, height=10,paper="special") par(mfrow=c(2,2), mar = c(1, 1, 1, 1), asp = 1.5) plot(PCA_Ax, main="PCA phylogenetic correlogram") legend("topleft", legend=c("A"), cex=0.8, box.lty=0) plot(tSNE_Ax, main="tSNE phylogenetic correlogram") legend("topleft", legend=c("B"), cex=0.8, box.lty=0) plot(UMAP_Ax, main="UMAP phylogenetic correlogram") legend("topleft", legend=c("C"), cex=0.8, box.lty=0) plot(AE_Ax, main= "Autoencoders phylogenetic correlogram") legend("topleft", legend=c("D"), cex=0.8, box.lty=0) dev.off()