setwd("D:\\URF_viruses\\zika_virus2023\\Dengue\\Den2\\analisis") #setwd("/home/edgar/Escritorio/zika_virus2023/Analisis/aln_datos") ##This pipeline was assembled by Edgar E. Lara-Ramírez ##the adapted code sources are specified in the corresponding sections library("seqinr") library("Peptides") library("protr") library("adegenet") library("gplots") library("ComplexHeatmap") library("caret") library("randomForest") library("doParallel") library("FactoMineR") library("factoextra") library("cluster") library("plotly") library("Rtsne") library("umap") library("reshape2") library("ggplot2") require("gridExtra") library("gridGraphics") library("vcd") library("cowplot") suppressPackageStartupMessages(library(keras)) ##read alignment maftaln<- read.alignment("DENV2_aln.fasta", format ="fasta") ##extracting polymorphic aminoacids acpzika<-alignment2genind(maftaln) acpzikatab<-tab(acpzika) #extract pairwise amino acid frequencies coded in 0 and 1 sapply(acpzikatab, function(x) sum(is.na(x)))#countig NAs acpzikatab[is.na(acpzikatab)] <- 0 #replacing nas #setting colnames new_colnames <- make.names(gsub("\\.", "_", colnames(acpzikatab))) colnames(acpzikatab) <- new_colnames acpzikatab<-as.data.frame(acpzikatab) ###permutation # Function to permute the rows of each column permute_columns <- function(data) { permuted_data <- apply(data, 2, function(column) sample(column)) return(as.data.frame(permuted_data)) } # Create the synthetic dataset synthetic_data <- permute_columns(acpzikatab) # Verify the synthetic data head(synthetic_data) # Assign column names to synthetic data colnames(synthetic_data) <- colnames(acpzikatab) # Validate and compare synthetic_data with original_data summary(acpzikatab) summary(synthetic_data) # Add row labels to original_data and synthetic_data original_data <- cbind(acpzikatab, dataset_type = "original") synthetic_data <- cbind(synthetic_data, dataset_type = "synthetic") # Concatenate the datasets combined_data <- as.data.frame(rbind(original_data, synthetic_data)) combined_data$dataset_type=as.factor(combined_data$dataset_type) #Hyperparameter tunning #https://rpubs.com/phamdinhkhanh/389752 ##CUSTOM RF https://machinelearningmastery.com/tune-machine-learning-algorithms-in-r/ customRF <- list(type = "Classification", library = "randomForest", loop = NULL) customRF$parameters <- data.frame(parameter = c("mtry", "ntree"), class = rep("numeric", 2), label = c("mtry", "ntree")) customRF$grid <- function(x, y, len = NULL, search = "grid") {} customRF$fit <- function(x, y, wts, param, lev, last, weights, classProbs) { randomForest(x, y, mtry = param$mtry, ntree=param$ntree) } #Predict label customRF$predict <- function(modelFit, newdata, preProc = NULL, submodels = NULL) predict(modelFit, newdata) #Predict prob customRF$prob <- function(modelFit, newdata, preProc = NULL, submodels = NULL) predict(modelFit, newdata, type = "prob") customRF$sort <- function(x) x[order(x[,1]),] customRF$levels <- function(x) x$classes mtry <- sqrt(ncol(combined_data)) #create tunegrid tunegrid <- expand.grid(.mtry = c(17, 37, 67, 107), .ntree = c(250, 500, 1000)) # Set up train control with 5-fold cross-validation ctrl <- trainControl(method='repeatedcv', number = 5, repeats=2, verboseIter = TRUE) # Train the random forest model with tuning start.time <- Sys.time() set.seed(123) rf_gridsearch<- train(dataset_type ~ ., data = combined_data, method = customRF, metric="Accuracy", tuneGrid = tunegrid, trControl = ctrl) print(rf_gridsearch) end_time <- Sys.time() execution_time <- end_time - start.time # Print the best model parameters print(rf_gridsearch$bestTune) pdf(file="Figure_S2-d2.pdf", width=8, height=7,paper="special") plot(rf_gridsearch) dev.off() ##constructing URF to validate by means of Out of bag with the best hyperparamenters modelRF_combined <- randomForest(dataset_type~., data=combined_data, mtry = 17, ntree = 500, proximity = TRUE) #mtry 422/3 +1 ##Unsupervised random forest #this chunk of code was produced with help of the following web sources #https://gradientdescending.com/unsupervised-random-forest-example/ #https://stats.stackexchange.com/questions/72370/how-to-perform-unsupervised-random-forest-classification-using-breimans-code ## The tSNE, UMAP and AE algorithm uses random data, thus clustering patterns may vary from run to run ## see the following link for more details https://cran.r-project.org/web/packages/umap/vignettes/umap.html#stability-and-reproducibility start.time2 <- Sys.time() modelhost <- randomForest(acpzikatab, mtry = 17, ntree = 500, proximity = TRUE) end_time2 <- Sys.time() execution_time2 <- end_time2 - start.time2 distaln.matrix <- as.dist(1-modelhost$proximity) cl= get_clust_tendency(as.matrix(distaln.matrix), n = 49, graph = T) protres <- read.table("Metadata_DENV2.txt", sep = "\t", quote="", header = TRUE, row.names = 1) length(protres) #Ensure row names are the same distaln2.matrix<-as.matrix(distaln.matrix) distaln2.df<-as.data.frame(distaln2.matrix) # Add row names as a column to both data frames distaln2.df$ID <- rownames(distaln2.df) protres$ID <- rownames(protres) distaln2.df<- merge(distaln2.df, protres, by="ID") rownames(distaln2.df) <- distaln2.df$ID distaln2.df$ID <- NULL # The palette muted nine https://github.com/JLSteenwyk/ggpubfigs: cbPalette <- c("#332288", "#117733", "#CC6677", "#88CCEE", "#999933", "#882255", "#44AA99", "#DDCC77", "#AA4499") ##setting data for heatmap labeling set.seed(123) segments2 <- c(distaln2.df$Region) segments3 <- c(distaln2.df$Host1) segment_labels2 <- matrix(segments2, ncol = 1) segment_labels3 <- matrix(segments3, ncol = 1) segment_colors2 <- c("Africa" = "#332288", "Asia" = "#117733", "Europe"= "#CC6677", "North_America" = "#88CCEE", "Oceania" = "#999933", "South_America" = "#882255" ) segment_colors3 <- c("Homo_sapiens" = "#44AA99", "Mosquito" = "#DDCC77", "Primate"= "#AA4499" ) colors2 <- segment_colors2[segment_labels2] colors3 <- segment_colors2[segment_labels3] combined_row_colors <- cbind(colors2, colors3) # Create row annotations row_ha <- rowAnnotation( Group1 = segment_labels2, Group2 = segment_labels3, col = list( Group1 = segment_colors2, Group2 = segment_colors3 ) ) # Plot the heatmap with row annotations pdf(file="Figure_S2_heatmap_d2.pdf", width=8, height=7,paper="special") Heatmap(as.matrix(distaln.matrix), name = "Proximity matrix", col =cbPalette, show_row_names = FALSE, # Omit row names show_column_names = FALSE, # Omit column names cluster_rows = TRUE, # Omit row dendrogram cluster_columns = TRUE, # Omit column dendrogram right_annotation = row_ha) dev.off() ##PCA first exploration # this code was adapted from the following web source #https://www.r-bloggers.com/2018/07/pca-vs-autoencoders-for-dimensionality-reduction/ start.time3 <- Sys.time() zika.pca <- prcomp(distaln2.matrix, scale = FALSE) end_time3 <- Sys.time() execution_time3 <- end_time3 - start.time3 pca_scores <- as.data.frame(zika.pca$x) # Add row names as a column to both data frames pca_scores$ID <- rownames(pca_scores) protres$ID <- rownames(protres) pca_scores2<-pca_scores[,c(1:3,1344)] pca_scores2<- merge(pca_scores2, protres, by="ID") rownames(pca_scores2) <- pca_scores2$ID pca_scores2$ID <- NULL #identification of number of clusters f3a<- fviz_nbclust(pca_scores2[,1:2], FUNcluster=kmeans, k.max = 8) + labs(y = "Ave. silhouette width") + ggtitle("Optimal clusters for PCA") # Apply k-means clustering set.seed(123) # For reproducibility #km_pca <- kmeans(pca_scores2[,1:2], centers = 3, nstart = 25) km_pca<-eclust(pca_scores2[,1:2], "kmeans", hc_metric="euclidean",k=5) f3b<- fviz_cluster(km_pca, data=pca_scores2[,1:2], geom = "point", ellipse.type = "norm", palette = "jco", ggtheme = theme_minimal()) + labs(x = "Dim 1", y = "Dim 2") + ggtitle("PCA Cluster Plot") table(pca_scores2$Region, km_pca$cluster) #External validation by labeling f4a<- ggplot(pca_scores2, aes(x = PC1, y = PC2, colour = Host1, shape=Host1)) + geom_point(size = 2) + theme(panel.background = element_blank()) + theme(legend.text = element_text(size = 9)) + ggtitle("PCA Host")+ labs(x = "Dim 1", y = "Dim 2") + scale_color_manual(values = cbPalette)+ scale_shape_manual(values = c(8, 3, 11)) f4b<- ggplot(pca_scores2, aes(x = PC1, y = PC2, colour = Region, shape=Region )) + geom_point(size = 2) + #geom_point(data = subset(pca_scores2, PC1 == 2 & PC2 == 3), # aes(x = PC1, y = PC2), size = 4, shape = 17, colour = "blue")+ theme(panel.background = element_blank()) + ggtitle("PCA Region")+ labs(x = "Dim 1", y = "Dim 2")+ scale_color_manual(values = cbPalette) + scale_shape_manual(values = c(17, 18, 24, 3, 11, 8)) ##Creating and ploting tsne results set.seed(123) # for reproducibility start.time4 <- Sys.time() tsne <- Rtsne(distaln2.matrix, dims = 2, perplexity=50, PCA=FALSE, verbose=TRUE, max_iter = 1000, num_threads = 12) end_time4 <- Sys.time() execution_time4 <- end_time4 - start.time4 tsne_data <- as.data.frame(tsne$Y) rownames(tsne_data) <- rownames(distaln2.df) # Add row names as a column to both data frames tsne_data$ID <- rownames(tsne_data) tsne_data<- merge(tsne_data, protres, by="ID") rownames(tsne_data) <- tsne_data$ID tsne_data$ID <- NULL #identification of number of clusters f3c<- fviz_nbclust(tsne_data[,1:2], FUNcluster=kmeans, k.max = 10)+ labs(y = "Ave. silhouette width") + ggtitle("Optimal clusters for t-SNE") km_tsne<-eclust(tsne_data[,1:2], "kmeans", hc_metric="euclidean",k=7) f3d<- fviz_cluster(km_tsne, geom = "point", ellipse.type = "norm", palette = "jco", ggtheme = theme_minimal()) + labs(x = "Dim 1", y = "Dim 2") + ggtitle("t-SNE Cluster Plot") table(tsne_data$Region, km_tsne$cluster) ##External validation by labeling f4c<- ggplot(tsne_data, aes(x = V1, y = V2, colour = Host1, shape=Host1)) + geom_point(size = 2) + theme(panel.background = element_blank()) + theme(legend.text = element_text(size = 9)) + ggtitle("t-SNE Host")+ labs(x = "Dim 1", y = "Dim 2") + scale_color_manual(values = cbPalette)+ scale_shape_manual(values = c(8, 3, 11)) f4d<- ggplot(tsne_data, aes(x = V1, y = V2, colour = Region, shape=Region )) + geom_point(size = 2) + #geom_point(data = subset(pca_scores2, PC1 == 2 & PC2 == 3), # aes(x = PC1, y = PC2), size = 4, shape = 17, colour = "blue")+ theme(panel.background = element_blank()) + ggtitle("t-SNE Region")+ labs(x = "Dim 1", y = "Dim 2")+ scale_color_manual(values = cbPalette) + scale_shape_manual(values = c(17, 18, 24, 3, 11, 8)) #ggplotly(f4d) ###UMAP dimensional reduction library(umap) start.time5 <- Sys.time() zika.umap <- umap(as.matrix(distaln2.matrix), n_components=3, input="dist") end_time5 <- Sys.time() execution_time5 <- end_time5 - start.time5 umap_data <- as.data.frame(zika.umap$layout) rownames(umap_data) <- rownames(umap_data) # Add row names as a column to both data frames umap_data$ID <- rownames(umap_data) umap_data<- merge(umap_data, protres, by="ID") rownames(umap_data) <- umap_data$ID umap_data$ID <- NULL #identification of number of clusters f3e<- fviz_nbclust(umap_data[,1:2], FUNcluster=kmeans, k.max = 10) + labs(y = "Ave. silhouette width") + ggtitle("Optimal clusters for UMAP") km_umap<-eclust(umap_data[,1:2], "kmeans", hc_metric="euclidean",k=6) f3f<- fviz_cluster(km_umap, geom = "point", ellipse.type = "norm", palette = "jco", ggtheme = theme_minimal()) + labs(x = "Dim 1", y = "Dim 2") + ggtitle("UMAP Cluster Plot") table(umap_data$Region, km_umap$cluster) #external validation f4e<- ggplot(umap_data, aes(x = V1, y = V2, colour = Host1, shape=Host1)) + geom_point(size = 2) + theme(panel.background = element_blank()) + theme(legend.text = element_text(size = 9)) + ggtitle("UMAP Host")+ labs(x = "Dim 1", y = "Dim 2") + scale_color_manual(values = cbPalette)+ scale_shape_manual(values = c(8, 3, 11)) f4f<- ggplot(umap_data, aes(x = V1, y = V2, colour = Region, shape=Region )) + geom_point(size = 2) + #geom_point(data = subset(pca_scores2, PC1 == 2 & PC2 == 3), # aes(x = PC1, y = PC2), size = 4, shape = 17, colour = "blue")+ theme(panel.background = element_blank()) + ggtitle("UMAP Region")+ labs(x = "Dim 1", y = "Dim 2")+ scale_color_manual(values = cbPalette) + scale_shape_manual(values = c(17, 18, 24, 3, 11, 8)) # autoencoder in keras # this code was taken from the following web source #https://www.r-bloggers.com/2018/07/pca-vs-autoencoders-for-dimensionality-reduction/ # set training data set.seed(42) # for reproducibility x_train <- as.matrix(distaln2.matrix) #back to 293 x 293 proximity # set model model <- keras_model_sequential() model %>% layer_dense(units = 12, activation = "tanh", input_shape = ncol(x_train)) %>% layer_dense(units = 6, activation = "tanh", name = "bottleneck") %>% layer_dense(units = 12, activation = "tanh") %>% layer_dense(units = ncol(x_train)) # view model layers summary(model) # compile model model %>% compile( loss = "mean_squared_error", optimizer = "adam" ) start.time6 <- Sys.time() # fit model model %>% fit( x = x_train, y = x_train, epochs = 2000, verbose = 0 ) end_time6 <- Sys.time() execution_time6 <- end_time6 - start.time6 # evaluate the performance of the model mse.ae2 <- evaluate(model, x_train, x_train) mse.ae2 # extract the bottleneck layer intermediate_layer_model <- keras_model(inputs = model$input, outputs = get_layer(model, "bottleneck")$output) intermediate_output <- predict(intermediate_layer_model, x_train) ae_data <- as.data.frame(intermediate_output) rownames(ae_data) <- rownames(distaln2.df) # Add row names as a column to both data frames ae_data$ID <- rownames(ae_data) ae_data<- merge(ae_data, protres, by="ID") rownames(ae_data) <- ae_data$ID ae_data$ID <- NULL #identification of number of clusters f3g<- fviz_nbclust(ae_data[,1:2], FUNcluster=kmeans, k.max = 8) + labs(y = "Ave. silhouette width") + ggtitle("Optimal clusters for Autoencoders") km_ae<-eclust(ae_data[,1:2], "kmeans", hc_metric="euclidean",k=3) f3h<- fviz_cluster(km_ae, geom = "point", ellipse.type = "norm", palette = "jco", ggtheme = theme_minimal())+ labs(x = "Dim 1", y = "Dim 2") + ggtitle("Autoencoders Cluster Plot") table(ae_data$Region, km_umap$cluster) #External validations f4g<- ggplot(ae_data, aes(x = V1, y = V2, colour = Host1, shape=Host1)) + geom_point(size = 2) + theme(panel.background = element_blank()) + theme(legend.text = element_text(size = 9)) + ggtitle("Autoencoders Host")+ labs(x = "Dim 1", y = "Dim 2") + scale_color_manual(values = cbPalette)+ scale_shape_manual(values = c(8, 3, 11)) f4h<- ggplot(ae_data, aes(x = V1, y = V2, colour = Region, shape=Region )) + geom_point(size = 2) + #geom_point(data = subset(pca_scores2, PC1 == 2 & PC2 == 3), # aes(x = PC1, y = PC2), size = 4, shape = 17, colour = "blue")+ theme(panel.background = element_blank()) + ggtitle("Autoencoders Region")+ labs(x = "Dim 1", y = "Dim 2")+ scale_color_manual(values = cbPalette) + scale_shape_manual(values = c(17, 18, 24, 3, 11, 8)) ##Figure3 pdf(file="Figure_S3_d2cluster.pdf", width=8, height=10,paper="special") plot_grid(f3a, f3b, f3c, f3d, f3e, f3f, f3g, f3h, labels = c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'), label_size = 12, ncol = 2) dev.off() ##figure4 pdf(file="Figure_5_d2.pdf", width=8, height=10,paper="special") plot_grid(f4b, f4d, f4f, f4h, labels = c('A', 'B', 'C', 'D'), label_size = 12, ncol = 2) dev.off() saveRDS(modelhost, file= "modelhost1500.rds") saveRDS(zika.pca, file = "pca_result1500.rds") saveRDS(tsne, file = "tsne_result1500.rds") saveRDS(zika.umap, file = "umap_result1500.rds") saveRDS(intermediate_output, file = "ae_result1500.rds")