### PCA tidyverse style ### #install.packages("devtools") #install.packages("tidyverse", type = "binary") #install.packages("dlookr") #devtools::install_github("tidymodels/broom", type = "binary") #install.packages("knitr") #install.packages("ggfortify") #install.packages("cowplot", type = "binary") #install.packages("plotly", type = "binary") #install.packages("ggrepel", type = "binary") #install.packages("bit", type = "binary") #devtools::install_github("vqv/ggbiplot") library(tidyverse) library(dlookr) library(broom) library(knitr) library(ggfortify) library(cowplot) library(plotly) library(ggrepel) library(ggbiplot) # custom qualitative color palettes palette_colorblind <- c( "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#999999" ) palette_cb_ext <- c( "#ebac23", "#b80058", "#008cf9", "#006e00", "#00bbad", "#d163e6", "#b24502", "#ff9287", "#5954d6", "#00c6f8", "#878500", "#00a76c", "#979797", "#1e1e1e" ) par(mfrow=c(2,2)) # output color previews on 2 lines show_col(palette_colorblind) # colorblind-friendly palette show_col(palette_cb_ext) # extended colorblind-friendly palette ## Load complete data (no missing values) ## # Read csv file as tibble morph <- read_csv("data/humb_maurus.csv", col_types = cols(ID = "c", Species = "f")) head(morph) # Check for skewness with dlookr package # find names of skewed variables find_skewness(morph, index = FALSE) # compute the skewness #find_skewness(morph, value = TRUE) # log2 transform continuous variables log2.morph <- morph %>% transmute( ID = ID, Species = Species, l.HW = log2(HW), l.SVL = log2(SVL), l.IOD = log2(IOD), l.ED = log2(ED), l.IND = log2(IND), l.EN = log2(EN), l.FL = log2(FL), l.TD = log2(TD), l.THL = log2(THL), l.HAL = log2(HAL), l.FLL = log2(FLL), l.Fin3L = log2(Fin3L), l.Fin3DW = log2(Fin3DW), l.Fin4L = log2(Fin4L), l.Fin4DW = log2(Fin4DW) ) ## Center and scale data to unit variance and perform PCA ## pca_fit <- log2.morph %>% select(where(is.numeric)) %>% prcomp(center = TRUE, scale = TRUE) # Combine PC coordinates with original dataset to visualize samples colored by # categorical variable (Species) present in original data but removed for PCA. # Do this using the augment() function from the broom package, which uses the # fitted model and original data. Columns with fitted coordinates are # .fittedPC1, .fittedPC2, etc. # assign fitted model plus original data to new variable pca_fit2 pca_fit2 <- pca_fit %>% augment(log2.morph) ## Look at variance explained by each PC ## pca_fit %>% tidy(matrix = "eigenvalues") # Make barplot of PCs x <- pca_fit %>% tidy(matrix = "eigenvalues") %>% ggplot(aes(PC, percent)) + xlab ("PC") + ylab ("Percent Variance") + geom_col(alpha = 0.8) + scale_x_continuous(breaks = 1:9) + scale_y_continuous(labels = scales::percent_format(), expand = expansion(mult = c(0, 0.01))) + theme_minimal_hgrid(12) plot(x) # Plot biplot colored by species to look for separation between groups p <- ggplot(pca_fit2, aes( x = .fittedPC1, y = .fittedPC2, color = Species, text = paste("Specimen:", ID) )) + geom_point( aes(color = Species, fill = Species), shape = 21, size = 3, color = "black" ) + theme_half_open(12) + background_grid() plot(p) # use ggplotly to convert ggplot2 object to plotly object for interactive plot ggplotly(p) ## Extract and plot the rotation matrix ## # Extract rotation matrix pca_fit %>% tidy(matrix = "rotation") # Define arrow style for plotting arrow_style <- arrow( angle = 20, ends = "first", type = "closed", length = grid::unit(8, "pt") ) # Examine factor loadings from rotation matrix PCA_var <- pca_fit %>% # Extract variable coordinates tidy(matrix = "rotation") %>% # Format table form long to wide pivot_wider(names_from = "PC", names_prefix = "PC", values_from = "value") %>% # Rename column with variable names rename(Variable = column) %>% # 'Clean' variable names # Upper case on first letter mutate(Variable = stringr::str_to_title(Variable)) %>% # Change '_' for space mutate(Variable = stringr::str_replace_all(Variable, "_", " ")) head(PCA_var) # Plot rotation matrix; use geom_text_repel to avoid label overlap pca_fit %>% tidy(matrix = "rotation") %>% pivot_wider(names_from = "PC", names_prefix = "PC", values_from = "value") %>% ggplot(aes(PC1, PC2)) + geom_segment(xend = 0, yend = 0, arrow = arrow_style) + geom_text_repel( aes(label = column), hjust = 1, nudge_x = -0.02, color = "#904C2F" ) + xlim(-0.25, 0.5) + ylim(-0.5, 0.5) + coord_fixed() + # fix aspect ratio to 1:1 # labs(title = 'Rotation Matrix', x='PC1 (XX%)', y='PC2 (XX%)' + theme_minimal() ## Look at variance explained by each PC --------------------------------------- pca_fit %>% tidy(matrix = "eigenvalues") # Make scree plot x <- pca_fit %>% tidy(matrix = "eigenvalues") %>% ggplot(aes(PC, percent)) + xlab ("PC") + ylab ("Percent Variance") + geom_col(alpha = 0.8) + scale_x_continuous(breaks = 1:9) + scale_y_continuous(labels = scales::percent_format(), expand = expansion(mult = c(0, 0.01))) + theme_minimal_hgrid(12) plot(x) # Box plots of individual variables by species in PCAviz # Fin4DW for example y <- ggplot(morph, aes( x = Species, y = Fin4DW, fill = Species, text = paste("ID:", ID) )) + geom_violin() + scale_color_manual(values = c(maurus = "#04A23B", humboldti = "#D55E00")) + theme_bw(base_size = 16) ggplotly(y) # use ggplotly to convert ggplot2 object to plotly object devtools::session_info() ## adapted from CLaus Wilke: clauswilke.com/blog/2020/09/07/pca-tidyverse-style/