setwd("D:\\URF_viruses\\zika_virus2023\\Dengue\\Den2\\analisis")
#setwd("/home/edgar/Escritorio/zika_virus2023/Analisis/aln_datos")

##This pipeline was assembled by Edgar E. Lara-Ramírez
##the adapted code sources are specified in the corresponding sections


library("seqinr")
library("Peptides")
library("protr")
library("adegenet")
library("gplots")
library("ComplexHeatmap")
library("caret")
library("randomForest")
library("doParallel")
library("FactoMineR")
library("factoextra")
library("cluster")
library("plotly")
library("Rtsne")
library("umap")
library("reshape2")
library("ggplot2")
require("gridExtra")
library("gridGraphics")
library("vcd")
library("cowplot")
suppressPackageStartupMessages(library(keras))
##read alignment
maftaln<- read.alignment("DENV2_aln.fasta", format ="fasta")
##extracting polymorphic aminoacids
acpzika<-alignment2genind(maftaln)
acpzikatab<-tab(acpzika) #extract pairwise amino acid frequencies coded in 0 and 1
sapply(acpzikatab, function(x) sum(is.na(x)))#countig NAs
acpzikatab[is.na(acpzikatab)] <- 0 #replacing nas

#setting colnames
new_colnames <- make.names(gsub("\\.", "_", colnames(acpzikatab)))
colnames(acpzikatab) <- new_colnames
acpzikatab<-as.data.frame(acpzikatab)

###permutation
# Function to permute the rows of each column
permute_columns <- function(data) {
  permuted_data <- apply(data, 2, function(column) sample(column))
  return(as.data.frame(permuted_data))
}

# Create the synthetic dataset
synthetic_data <- permute_columns(acpzikatab)

# Verify the synthetic data
head(synthetic_data)

# Assign column names to synthetic data
colnames(synthetic_data) <- colnames(acpzikatab)

# Validate and compare synthetic_data with original_data
summary(acpzikatab)
summary(synthetic_data)

# Add row labels to original_data and synthetic_data
original_data <- cbind(acpzikatab, dataset_type = "original")
synthetic_data <- cbind(synthetic_data, dataset_type = "synthetic")

# Concatenate the datasets
combined_data <- as.data.frame(rbind(original_data, synthetic_data))
combined_data$dataset_type=as.factor(combined_data$dataset_type)


#Hyperparameter tunning
#https://rpubs.com/phamdinhkhanh/389752
##CUSTOM RF https://machinelearningmastery.com/tune-machine-learning-algorithms-in-r/
customRF <- list(type = "Classification",
                 library = "randomForest",
                 loop = NULL)

customRF$parameters <- data.frame(parameter = c("mtry", "ntree"),
                                  class = rep("numeric", 2),
                                  label = c("mtry", "ntree"))

customRF$grid <- function(x, y, len = NULL, search = "grid") {}

customRF$fit <- function(x, y, wts, param, lev, last, weights, classProbs) {
  randomForest(x, y,
               mtry = param$mtry,
               ntree=param$ntree)
}

#Predict label
customRF$predict <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
  predict(modelFit, newdata)

#Predict prob
customRF$prob <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
  predict(modelFit, newdata, type = "prob")

customRF$sort <- function(x) x[order(x[,1]),]
customRF$levels <- function(x) x$classes

mtry <- sqrt(ncol(combined_data))

#create tunegrid 
tunegrid <- expand.grid(.mtry = c(17, 37, 67, 107), .ntree = c(250, 500, 1000))

# Set up train control with 5-fold cross-validation
ctrl <- trainControl(method='repeatedcv', number = 5, repeats=2, verboseIter = TRUE)

# Train the random forest model with tuning
start.time <- Sys.time()
set.seed(123)
rf_gridsearch<- train(dataset_type ~ ., data = combined_data, method = customRF,
                      metric="Accuracy", tuneGrid = tunegrid,
                      trControl = ctrl)

print(rf_gridsearch)
end_time <- Sys.time()
execution_time <- end_time - start.time

# Print the best model parameters
print(rf_gridsearch$bestTune)

pdf(file="Figure_S2-d2.pdf", width=8, height=7,paper="special")
plot(rf_gridsearch)
dev.off()

##constructing URF to validate by means of Out of bag with the best hyperparamenters
modelRF_combined <- randomForest(dataset_type~., data=combined_data, mtry = 17, ntree = 500, proximity = TRUE) #mtry 422/3 +1


##Unsupervised random forest
#this chunk of code was produced with help of the following web sources
#https://gradientdescending.com/unsupervised-random-forest-example/
#https://stats.stackexchange.com/questions/72370/how-to-perform-unsupervised-random-forest-classification-using-breimans-code
## The tSNE, UMAP and AE algorithm uses random data, thus clustering patterns may vary from run to run
## see the following link for more details https://cran.r-project.org/web/packages/umap/vignettes/umap.html#stability-and-reproducibility
start.time2 <- Sys.time()
modelhost <- randomForest(acpzikatab, mtry = 17, ntree = 500, proximity = TRUE)
end_time2 <- Sys.time()
execution_time2 <- end_time2 - start.time2

distaln.matrix <- as.dist(1-modelhost$proximity)

cl= get_clust_tendency(as.matrix(distaln.matrix), n = 49, graph = T)

protres <- read.table("Metadata_DENV2.txt", sep = "\t",  quote="", header = TRUE, row.names = 1)
length(protres)
#Ensure row names are the same
distaln2.matrix<-as.matrix(distaln.matrix)
distaln2.df<-as.data.frame(distaln2.matrix)
# Add row names as a column to both data frames
distaln2.df$ID <- rownames(distaln2.df)
protres$ID <- rownames(protres)
distaln2.df<- merge(distaln2.df, protres, by="ID")
rownames(distaln2.df) <- distaln2.df$ID
distaln2.df$ID <- NULL

# The palette muted nine https://github.com/JLSteenwyk/ggpubfigs:
cbPalette <- c("#332288", "#117733", "#CC6677", "#88CCEE", "#999933", "#882255", "#44AA99", "#DDCC77", "#AA4499")

##setting data for heatmap labeling

set.seed(123)
segments2 <- c(distaln2.df$Region)
segments3 <- c(distaln2.df$Host1)

segment_labels2 <- matrix(segments2, ncol = 1)
segment_labels3 <- matrix(segments3, ncol = 1)


segment_colors2 <- c("Africa" = "#332288", 
                     "Asia" = "#117733",
                     "Europe"= "#CC6677",
                     "North_America" = "#88CCEE",
                     "Oceania" = "#999933",
                     "South_America" = "#882255"
)

segment_colors3 <- c("Homo_sapiens" = "#44AA99", 
                     "Mosquito" = "#DDCC77",
                     "Primate"= "#AA4499"
)
colors2 <- segment_colors2[segment_labels2]
colors3 <- segment_colors2[segment_labels3]
combined_row_colors <- cbind(colors2, colors3)

# Create row annotations
row_ha <- rowAnnotation(
  Group1 = segment_labels2,
  Group2 = segment_labels3,
  col = list(
    Group1 = segment_colors2,
    Group2 = segment_colors3
  )
)

# Plot the heatmap with row annotations
pdf(file="Figure_S2_heatmap_d2.pdf", width=8, height=7,paper="special")
Heatmap(as.matrix(distaln.matrix),
        name = "Proximity matrix",
        col =cbPalette,
        show_row_names = FALSE,    # Omit row names
        show_column_names = FALSE, # Omit column names
        cluster_rows = TRUE,      # Omit row dendrogram
        cluster_columns = TRUE,    # Omit column dendrogram
        right_annotation = row_ha)
dev.off()


##PCA first exploration
# this code was adapted from the following web source
#https://www.r-bloggers.com/2018/07/pca-vs-autoencoders-for-dimensionality-reduction/

start.time3 <- Sys.time()

zika.pca <- prcomp(distaln2.matrix, scale = FALSE)

end_time3 <- Sys.time()
execution_time3 <- end_time3 - start.time3

pca_scores <- as.data.frame(zika.pca$x)

# Add row names as a column to both data frames
pca_scores$ID <- rownames(pca_scores)
protres$ID <- rownames(protres)
pca_scores2<-pca_scores[,c(1:3,1344)]

pca_scores2<- merge(pca_scores2, protres, by="ID")
rownames(pca_scores2) <- pca_scores2$ID
pca_scores2$ID <- NULL

#identification of number of clusters
f3a<- fviz_nbclust(pca_scores2[,1:2], FUNcluster=kmeans, k.max = 8) +
  labs(y = "Ave. silhouette width") +
  ggtitle("Optimal clusters for PCA")

# Apply k-means clustering
set.seed(123)  # For reproducibility
#km_pca <- kmeans(pca_scores2[,1:2], centers = 3, nstart = 25)
km_pca<-eclust(pca_scores2[,1:2], "kmeans", hc_metric="euclidean",k=5)
f3b<- fviz_cluster(km_pca, data=pca_scores2[,1:2], geom = "point", ellipse.type = "norm",
                   palette = "jco", ggtheme = theme_minimal()) +
  labs(x = "Dim 1", y = "Dim 2") +
  ggtitle("PCA Cluster Plot")
table(pca_scores2$Region, km_pca$cluster)


#External validation by labeling
f4a<- ggplot(pca_scores2, aes(x = PC1, y = PC2, colour = Host1, shape=Host1)) +
  geom_point(size = 2) + theme(panel.background = element_blank()) + 
  theme(legend.text = element_text(size = 9)) + ggtitle("PCA Host")+
  labs(x = "Dim 1", y = "Dim 2") + scale_color_manual(values = cbPalette)+
  scale_shape_manual(values = c(8, 3, 11))

f4b<- ggplot(pca_scores2, aes(x = PC1, y = PC2, colour = Region, shape=Region )) +
  geom_point(size = 2) + 
  #geom_point(data = subset(pca_scores2, PC1 == 2 & PC2 == 3), 
  #       aes(x = PC1, y = PC2), size = 4, shape = 17, colour = "blue")+
  theme(panel.background = element_blank()) + 
  ggtitle("PCA Region")+
  labs(x = "Dim 1", y = "Dim 2")+ scale_color_manual(values = cbPalette) + 
  scale_shape_manual(values = c(17, 18, 24, 3, 11, 8))


##Creating and ploting tsne results

set.seed(123) # for reproducibility
start.time4 <- Sys.time()

tsne <- Rtsne(distaln2.matrix, dims = 2, perplexity=50, PCA=FALSE, verbose=TRUE, max_iter = 1000, num_threads = 12)

end_time4 <- Sys.time()
execution_time4 <- end_time4 - start.time4

tsne_data <- as.data.frame(tsne$Y)
rownames(tsne_data) <- rownames(distaln2.df)

# Add row names as a column to both data frames
tsne_data$ID <- rownames(tsne_data)
tsne_data<- merge(tsne_data, protres, by="ID")
rownames(tsne_data) <- tsne_data$ID
tsne_data$ID <- NULL

#identification of number of clusters
f3c<- fviz_nbclust(tsne_data[,1:2], FUNcluster=kmeans, k.max = 10)+
  labs(y = "Ave. silhouette width") +
  ggtitle("Optimal clusters for t-SNE")

km_tsne<-eclust(tsne_data[,1:2], "kmeans", hc_metric="euclidean",k=7)
f3d<- fviz_cluster(km_tsne, geom = "point", ellipse.type = "norm",
                   palette = "jco", ggtheme = theme_minimal()) +
  labs(x = "Dim 1", y = "Dim 2") +
  ggtitle("t-SNE Cluster Plot")
table(tsne_data$Region, km_tsne$cluster)


##External validation by labeling
f4c<- ggplot(tsne_data, aes(x = V1, y = V2, colour = Host1, shape=Host1)) +
  geom_point(size = 2) + theme(panel.background = element_blank()) + 
  theme(legend.text = element_text(size = 9)) + ggtitle("t-SNE Host")+
  labs(x = "Dim 1", y = "Dim 2") + scale_color_manual(values = cbPalette)+
  scale_shape_manual(values = c(8, 3, 11))

f4d<- ggplot(tsne_data, aes(x = V1, y = V2, colour = Region, shape=Region )) +
  geom_point(size = 2) + 
  #geom_point(data = subset(pca_scores2, PC1 == 2 & PC2 == 3), 
  #       aes(x = PC1, y = PC2), size = 4, shape = 17, colour = "blue")+
  theme(panel.background = element_blank()) + 
  ggtitle("t-SNE Region")+
  labs(x = "Dim 1", y = "Dim 2")+ scale_color_manual(values = cbPalette) + 
  scale_shape_manual(values = c(17, 18, 24, 3, 11, 8))
#ggplotly(f4d)


###UMAP dimensional reduction
library(umap)
start.time5 <- Sys.time()

zika.umap <- umap(as.matrix(distaln2.matrix), n_components=3, input="dist")

end_time5 <- Sys.time()
execution_time5 <- end_time5 - start.time5

umap_data <- as.data.frame(zika.umap$layout)
rownames(umap_data) <- rownames(umap_data)

# Add row names as a column to both data frames
umap_data$ID <- rownames(umap_data)
umap_data<- merge(umap_data, protres, by="ID")
rownames(umap_data) <- umap_data$ID
umap_data$ID <- NULL

#identification of number of clusters
f3e<- fviz_nbclust(umap_data[,1:2], FUNcluster=kmeans, k.max = 10) + 
  labs(y = "Ave. silhouette width") +
  ggtitle("Optimal clusters for UMAP")

km_umap<-eclust(umap_data[,1:2], "kmeans", hc_metric="euclidean",k=6)
f3f<- fviz_cluster(km_umap, geom = "point", ellipse.type = "norm",
                   palette = "jco", ggtheme = theme_minimal()) +
  labs(x = "Dim 1", y = "Dim 2") +
  ggtitle("UMAP Cluster Plot")
table(umap_data$Region, km_umap$cluster)

#external validation
f4e<- ggplot(umap_data, aes(x = V1, y = V2, colour = Host1, shape=Host1)) +
  geom_point(size = 2) + theme(panel.background = element_blank()) + 
  theme(legend.text = element_text(size = 9)) + ggtitle("UMAP Host")+
  labs(x = "Dim 1", y = "Dim 2") + scale_color_manual(values = cbPalette)+
  scale_shape_manual(values = c(8, 3, 11))

f4f<- ggplot(umap_data, aes(x = V1, y = V2, colour = Region, shape=Region )) +
  geom_point(size = 2) + 
  #geom_point(data = subset(pca_scores2, PC1 == 2 & PC2 == 3), 
  #       aes(x = PC1, y = PC2), size = 4, shape = 17, colour = "blue")+
  theme(panel.background = element_blank()) + 
  ggtitle("UMAP Region")+
  labs(x = "Dim 1", y = "Dim 2")+ scale_color_manual(values = cbPalette) + 
  scale_shape_manual(values = c(17, 18, 24, 3, 11, 8))

# autoencoder in keras
# this code was taken from the following web source
#https://www.r-bloggers.com/2018/07/pca-vs-autoencoders-for-dimensionality-reduction/

# set training data
set.seed(42) # for reproducibility
x_train <- as.matrix(distaln2.matrix) #back to 293 x 293 proximity
# set model
model <- keras_model_sequential()
model %>%
  layer_dense(units = 12, activation = "tanh", input_shape = ncol(x_train)) %>%
  layer_dense(units = 6, activation = "tanh", name = "bottleneck") %>%
  layer_dense(units = 12, activation = "tanh") %>%
  layer_dense(units = ncol(x_train))
# view model layers
summary(model)

# compile model
model %>% compile(
  loss = "mean_squared_error", 
  optimizer = "adam"
)

start.time6 <- Sys.time()

# fit model
model %>% fit(
  x = x_train, 
  y = x_train, 
  epochs = 2000,
  verbose = 0
)

end_time6 <- Sys.time()
execution_time6 <- end_time6 - start.time6

# evaluate the performance of the model
mse.ae2 <- evaluate(model, x_train, x_train)
mse.ae2

# extract the bottleneck layer
intermediate_layer_model <- keras_model(inputs = model$input, outputs = get_layer(model, "bottleneck")$output)
intermediate_output <- predict(intermediate_layer_model, x_train)

ae_data <- as.data.frame(intermediate_output)
rownames(ae_data) <- rownames(distaln2.df)

# Add row names as a column to both data frames
ae_data$ID <- rownames(ae_data)
ae_data<- merge(ae_data, protres, by="ID")
rownames(ae_data) <- ae_data$ID
ae_data$ID <- NULL

#identification of number of clusters
f3g<- fviz_nbclust(ae_data[,1:2], FUNcluster=kmeans, k.max = 8) +
  labs(y = "Ave. silhouette width") +
  ggtitle("Optimal clusters for Autoencoders")

km_ae<-eclust(ae_data[,1:2], "kmeans", hc_metric="euclidean",k=3)
f3h<- fviz_cluster(km_ae, geom = "point", ellipse.type = "norm",
                   palette = "jco", ggtheme = theme_minimal())+
  labs(x = "Dim 1", y = "Dim 2") +
  ggtitle("Autoencoders Cluster Plot")
table(ae_data$Region, km_umap$cluster)

#External validations
f4g<- ggplot(ae_data, aes(x = V1, y = V2, colour = Host1, shape=Host1)) +
  geom_point(size = 2) + theme(panel.background = element_blank()) + 
  theme(legend.text = element_text(size = 9)) + ggtitle("Autoencoders Host")+
  labs(x = "Dim 1", y = "Dim 2") + scale_color_manual(values = cbPalette)+
  scale_shape_manual(values = c(8, 3, 11))

f4h<- ggplot(ae_data, aes(x = V1, y = V2, colour = Region, shape=Region )) +
  geom_point(size = 2) + 
  #geom_point(data = subset(pca_scores2, PC1 == 2 & PC2 == 3), 
  #       aes(x = PC1, y = PC2), size = 4, shape = 17, colour = "blue")+
  theme(panel.background = element_blank()) + 
  ggtitle("Autoencoders Region")+
  labs(x = "Dim 1", y = "Dim 2")+ scale_color_manual(values = cbPalette) + 
  scale_shape_manual(values = c(17, 18, 24, 3, 11, 8))

##Figure3
pdf(file="Figure_S3_d2cluster.pdf", width=8, height=10,paper="special")
plot_grid(f3a, f3b, f3c, f3d, f3e, f3f, f3g, f3h, labels = c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'), label_size = 12, ncol = 2)
dev.off()


##figure4
pdf(file="Figure_5_d2.pdf", width=8, height=10,paper="special")
plot_grid(f4b, f4d, f4f, f4h, labels = c('A', 'B', 'C', 'D'), label_size = 12, ncol = 2)
dev.off()

saveRDS(modelhost, file= "modelhost1500.rds")
saveRDS(zika.pca, file = "pca_result1500.rds")
saveRDS(tsne, file = "tsne_result1500.rds")
saveRDS(zika.umap, file = "umap_result1500.rds")
saveRDS(intermediate_output, file = "ae_result1500.rds")