--- title: "Syrrhophus Pairwise Distances, revised 27Nov" subtitle: "Previous versions had a bug!" author: David Cannatella date: "`r format(Sys.time(), '%B %e, %Y')`" output: bookdown::html_document2: toc: yes toc_float: yes self_contained: TRUE number_sections: TRUE toc_depth: 4 pandoc_args: --number-offset=15 html-math-method: katex keep_md: yes highlight: rstudio theme: cerulean --- ```{r setup, echo = FALSE, include = FALSE} # Get sequences from fasta file using ape # Calculate pairwise distances among all individuals # Group individuals into species and calculate mean of # pairwise distances among species. knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(comment = "#>") knitr::opts_chunk$set(echo = TRUE, warning = F, message = F) library(ape) library(bookdown) library(phytools) library(gdata) # upperTriangle() library(tidyverse) library(lubridate) library(ggplot2) library(magrittr) library(forcats) #options(warn = 2) #library(graph4lg) #library(pillar) ``` Any version of this script prior to 27Nov had a bug related to how to fill the diagonals of the matrix, in the Mean_pairwise() function. ```{r} # Function to compute the among-species mean pairwise distance # for an alignment of one gene Mean_pairwise <- function (sequence_file, name_map, myModel) { # debug code in next three lines #name_map <- "data/Syrrhophus_Pairwise_Name_Map_27Nov.txt" #sequence_file <- "data/West-Syrrhophus-16S-27Nov-unique-160tips.nex" #myModel <- "N" # Map sequence names to a species (group), # Read sequences and calculate p-distance matrix my_Name_Mapping <- read_tsv(name_map, col_names = T, col_types = "cc") #my_Sequences <- read.FASTA(sequence_file) my_Sequences <- read.nexus.data(sequence_file) my_Sequences <- nexus2DNAbin(my_Sequences) myModel <- if (myModel == "N") "N" else "raw" my_Dist_Matrix <- as.matrix(dist.dna(my_Sequences, model = myModel, pairwise.deletion = T)) # Make a df from matrix row names, column names, and cell values for # further processing row <- rownames(my_Dist_Matrix) col <- colnames(my_Dist_Matrix) my_Distance_df <- as_tibble(expand.grid(row, col, stringsAsFactors = F)) my_Distance_df <- mutate(my_Distance_df, distance = as.vector(my_Dist_Matrix )) # Add species group names to df for each row colnames(my_Distance_df) <- c("row", "col", "distance") my_Renamed_Distances <- left_join(my_Distance_df, my_Name_Mapping, by = c("row" = "original_name")) my_Renamed_Distances <- left_join(my_Renamed_Distances, my_Name_Mapping, by = c("col" = "original_name")) # DEBUG #unique(my_Renamed_Distances$new_name.x) #unique(my_Renamed_Distances$new_name.y) # Rearrange taxon names to follow consistent order of # pairwise comparisons for (i in 1:nrow(my_Renamed_Distances)) { if (my_Renamed_Distances$new_name.x[i] > my_Renamed_Distances$new_name.y[i]) { temp <- my_Renamed_Distances$new_name.x[i] my_Renamed_Distances$new_name.x[i] <- my_Renamed_Distances$new_name.y[i] my_Renamed_Distances$new_name.y[i] <- temp } } # DEBUG # unique(my_Renamed_Distances$new_name.x) # unique(my_Renamed_Distances$new_name.y) # Plot my_Renamed_Distances for each species my_Plot_distances <- my_Renamed_Distances %>% mutate(new_name = as.factor(paste(new_name.x, new_name.y, sep = "-"))) %>% mutate(new_name = fct_reorder(new_name, distance, .fun = "mean")) # Group distances by species-pairs and summarize stats my_Final_Distances_Stats <- my_Renamed_Distances %>% group_by(new_name.x, new_name.y) %>% summarise(mean = mean(distance), min = min(distance), max = max(distance), med = median(distance)) length(my_Final_Distances_Stats$mean) # DEBUG next two lines # right_now <- str_replace_all(as.character(now()), "[-: ]", "_") # write_tsv(my_Final_Distances_Stats, file = paste0("output/my_Final_Abs_Distances_Stats_", right_now, ".txt")) Species_Names_Alphabetical <- group_keys(my_Final_Distances_Stats)$new_name.x my_Final_Dist_Matrix <- matrix(nrow = length(Species_Names_Alphabetical), ncol = length(Species_Names_Alphabetical)) rownames(my_Final_Dist_Matrix) <- Species_Names_Alphabetical colnames(my_Final_Dist_Matrix) <- Species_Names_Alphabetical ## The previous bug was here. I mistakenly set diag = F, instead ## of filling the diagonal with values! It is fixed now. 27 Nov # Fill lower triangle lowerTriangle(my_Final_Dist_Matrix, diag = T, byrow = F) <- my_Final_Distances_Stats$mean # Fill upper to make a symmetric matrix upperTriangle(my_Final_Dist_Matrix, diag = T, byrow = T) <- my_Final_Distances_Stats$mean # Return the results return(list(my_Final_Dist_Matrix, my_Plot_distances, my_Final_Distances_Stats)) } ``` ```{r } Plot_Distances <- function (my_Plot_Distances, gene, Distance_Type) { # Plot distances against all species pairs, sorted my_combined_Plot <- ggplot(my_Plot_Distances, aes(x = new_name, y = distance, fct_reorder(new_name, distance, .fun = "mean"))) + geom_point(size = 0.7, stroke = 0.1, shape = 21, alpha = 0.5, fill = "gray50", aes(color = gene)) + #scale_color_manual(values = c("16S" = "black")) + scale_color_manual(values = c("black", "red")) + stat_summary( #fun.y = mean, fun = mean, geom = "point", mapping = aes(color = "mean"), size = 0.8 ) + theme(axis.text.x = element_text(size = 3, angle = 45, hjust = 1.0, vjust = 1.0), axis.title = element_text(size = 14), panel.background = element_rect(fill = "white", colour = "black"), panel.grid.major.y = element_line(linewidth = 0.2, color = "gray70"), panel.grid.minor.y = element_line(linewidth = 0.2, color = "gray70"), panel.grid.major.x = element_line(linewidth = 0.2, color = "gray70"), legend.background = element_rect(colour = NA), legend.margin = margin(0.1, 0.1, 0.1, 0.1, unit = "cm"), legend.key = element_rect(fill = "white", colour = NULL), legend.key.size = unit(1.0, "lines"), legend.text = element_text(size = rel(0.8)), legend.title = element_text(hjust = 0.5), legend.title.align = NULL, legend.position = c(0.1, 0.8), legend.justification = "center", legend.box.background = element_rect(colour = "black")) + guides(color = guide_legend(override.aes = list(size = 2, alpha = 1, shape = 19))) + scale_x_discrete() + scale_y_continuous(breaks = waiver(), minor_breaks = waiver()) + theme(plot.title = element_text(size = 20, hjust = 0.5), axis.title = element_text(size = 12)) + ylab(paste("Pairwise", Distance_Type, "distance")) + xlab("Pairwise comparisons between species") ggsave(paste("output/", gene, Distance_Type, "pairwise_distances.pdf"), my_combined_Plot, width = 36, height = 8) return() } ``` ```{r ComputeDistances} # Name of gene for labeling plot myGene <- "16S" # nexus alignment of sequences my_Sequences <- "data/West-Syrrhophus-16S-27Nov-unique-160tips.nex" # Mapping of sequence name to group name # It is OK if this file has more mappings than are needed. The script # ignores any that are not needed my_Name_Map <- "data/Syrrhophus_Pairwise_Name_Map_27Nov.txt" # Calculate distance matrix, and assign list items to objects # The third parameter: "raw" means P-Distance, "N" means Absolute differences my_Raw <- Mean_pairwise(my_Sequences, my_Name_Map, "raw") my_Raw_matrix <- unlist(my_Raw[[1]]) my_Raw_plot_distances <- my_Raw[[2]] %>% mutate (gene = myGene) my_Raw_Stats <- as_tibble(my_Raw[[3]]) # Calculate second distance matrix, and assign list items to objects # The third parameter: "raw" means P-Distance, "N" means Absolute differences my_Abs <- Mean_pairwise(my_Sequences, my_Name_Map, "N") my_Abs_matrix <- unlist(my_Abs[[1]]) my_Abs_plot_distances <- my_Abs[[2]] %>% mutate (gene = myGene) my_Abs_Stats <- as_tibble(my_Abs[[3]]) # Plot distances against all species pairs, sorted Plot_Distances(my_Abs_plot_distances, myGene, "Abs") Plot_Distances(my_Raw_plot_distances, myGene, "Raw") # Define empty matrix, assign row/col names, and fill with upper and lower # triangle of distances my_Combined_Matrix <- matrix(nrow = nrow(my_Abs_matrix), ncol = ncol(my_Abs_matrix), dimnames = list(rownames(my_Abs_matrix), colnames(my_Abs_matrix))) lowerTriangle(my_Combined_Matrix) <- lowerTriangle(my_Raw_matrix, byrow = F) upperTriangle(my_Combined_Matrix) <- upperTriangle(my_Abs_matrix, byrow = F) # DEBUG next three lines # dim(my_Raw_matrix) # dim(my_Abs_matrix) # dim(my_Combined_Matrix) # Set date and time as part of filename # and write pairwise distance matrix to file right_now <- str_replace_all(as.character(now()), "[-: ]", "_") write.table(signif(my_Combined_Matrix, digits = 3), file = paste0("output/Combined_dist_matrices_", right_now, ".txt"), row.names = T, col.names = T, quote = F, sep = "\t") # Write the summary stats (mean, min, etc.) to a file after rounding off numbers my_Raw_Stats <- my_Raw_Stats %>% mutate(across(tidyselect::where(is.numeric), signif, digits = 3)) write_tsv(my_Raw_Stats, file = paste0("output/Raw_Distances_Stats_", right_now, ".txt")) # Write the summary stats (mean, min, etc.) to a file my_Abs_Stats <- my_Abs_Stats %>% mutate(across(tidyselect::where(is.numeric), signif, digits = 3)) write_tsv(my_Abs_Stats, file = paste0("output/Abs_Distances_Stats_", right_now, ".txt")) # end ```