#=============================================================================== #Script for MS Stats using Skyline MS Stats report library(MSstats) #Script roughly follows: #Yaamini's MSStats script: https://github.com/RobertsLab/project-oyster-oa/blob/master/analyses/DNR_Skyline_20170524/2017-06-22-MSstats/2017-06-22-MSstats.R #MSstats manual (starting p44):http://msstats.org/wp-content/uploads/2017/01/MSstats_v3.7.3_manual.pdf #=============================================================================== ########==IMPORT AND PROCESS DATA=============================================== #read in .csv (locally. zip is in github) rawPeakAreas <- read.csv("data/20190403-MSstats-report.csv") head(rawPeakAreas) #rename columns names(rawPeakAreas) <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", "PrecursorMz", "FragmentIon", "ProductCharge", "ProductMz", "IsotopeLabelType", "Condition", "BioReplicate", "FileName", "Area", "StandardType", "Truncated", "DetectionQValue") head(rawPeakAreas) #convert raw Skyline output to MSstats format #rawPeakAreas has 719904 obs and 16 variables PeakAreas <- SkylinetoMSstatsFormat(rawPeakAreas) #PeakAreas has 188508 obs, and 15 variables #Output message: #** Peptides, that are used in more than one proteins, are removed. #** Intensities with great than 0.01 in DetectionQValue are replaced with zero. #** 10 features have all NAs or zero intensity values and are removed. #** 5852 features have 1 or 2 intensities across runs and are removed. #Warning message: # In SkylinetoMSstatsFormat(rawPeakAreas) : NAs introduced by coercion head(PeakAreas) #### Process data ============================================================ QuantData <- dataProcess(PeakAreas, normalization='equalizeMedians', summaryMethod="TMP", cutoffCensored="minFeature", censoredInt="0", MBimpute=TRUE, maxQuantileforCensored=0.999) #Output message: #** There are 9204 intensities which are zero or less than 1. These intensities are replaced with 1. #Error in dataProcess(PeakAreas, normalization = "equalizeMedians", summaryMethod = "TMP", : # ** MSstats suspects that there are fractionations and #potentially technical replicates too. Please add Fraction column in the input. #Add fraction column and have all cells listed as "1" from this google forum post: https://groups.google.com/forum/#!searchin/msstats/fractionations%7Csort:date/msstats/dVh6IEE0xTE/jiokWrNSCgAJ PeakAreas$Fraction <- "1" #Re-try the dataProcess function QuantData <- dataProcess(PeakAreas) #** Log2 intensities under cutoff = 9.6627 were considered as censored missing values. #** Log2 intensities = NA were considered as censored missing values. #** Use all features that the dataset origianally has. # Summary of Features : # count # of Protein 2808 # of Peptides/Protein 1-85 # of Transitions/Peptide 1-5 #** 58 Proteins have only single transition : Consider excluding this protein from the dataset. (CHOYP_BRAFLDRAFT_108193.2.2|m.37878, CHOYP_BRAFLDRAFT_118867.1.1|m.27944, CHOYP_BRAFLDRAFT_120058.1.1|m.9895, CHOYP_BRAFLDRAFT_230143.1.1|m.15015, CHOYP_BRAFLDRAFT_261932.1.1|m.23690, CHOYP_BRAFLDRAFT_275667.1.1|m.17353, CHOYP_BRAFLDRAFT_64737.1.1|m.40499, CHOYP_BRAFLDRAFT_68867.1.1|m.4095, CHOYP_BRAFLDRAFT_72357.1.1|m.59063, CHOYP_BRAFLDRAFT_93126.2.3|m.15451 ...) #Summary of Samples : # 23C 29C # of MS runs 8 8 # of Biological Replicates 2 2 # of Technical Replicates 4 4 #Summary of Missingness : # transitions are completely missing in one condition: 0 # run with 75% missing observations: 0 #After it's all done, check what the ProcessedData looks like head(QuantData$ProcessedData) #large List (5 elements, 101.4 Mb) #=============================================================================== # Create dataprocess plots dataProcessPlots(data = QuantData, type = "ProfilePlot", width=5, height=5, address = "analyses/") #There were 50 or more warnings (use warnings() to see the first 50) #Warning messages: (I am showing first 5) #1: Removed 195 rows containing missing values (geom_path). #2: Removed 780 rows containing missing values (geom_point). #3: Removed 186 rows containing missing values (geom_path). #4: Removed 744 rows containing missing values (geom_point). #5: Removed 99 rows containing missing values (geom_path). dataProcessPlots(data = QuantData, type = "QCPlot", width=5, height=5, address = "analyses/") #There were 50 or more warnings (use warnings() to see the first 50) #Warning messages: (I am showing first 5) #1: Removed 541076 rows containing non-finite values (stat_boxplot). #2: Removed 780 rows containing non-finite values (stat_boxplot). #3: Removed 744 rows containing non-finite values (stat_boxplot). #4: Removed 436 rows containing non-finite values (stat_boxplot). #5: Removed 252 rows containing non-finite values (stat_boxplot). dataProcessPlots(data = QuantData, type="Conditionplot", width=5, height=5, address = "analyses/") #There were 50 or more warnings (use warnings() to see the first 50) #Warning messages: (I am showing first 5) #1: Removed 1 rows containing missing values (geom_hline). #2: Removed 1 rows containing missing values (geom_hline). #3: Removed 1 rows containing missing values (geom_hline). #4: Removed 1 rows containing missing values (geom_hline). #5: Removed 1 rows containing missing values (geom_hline). # CREATE CONTRAST MATRIX=============================================== levels(QuantData$ProcessedData$GROUP_ORIGINAL) #Levels: "23C" and "29C" comparison <- matrix(c(-1, 1), nrow = 1) row.names(comparison) <- "23C-29C" print(comparison) #### GROUP COMPARISON testResultOneComparison <- groupComparison(contrast.matrix = comparison, data = QuantData) testResultOneComparison$ComparisonResult #View results proteinComparisonResults <- testResultOneComparison$ComparisonResult #Save as new dataframe write.csv(proteinComparisonResults, "analyses/20190403-2015Cgseed-protcomp.csv", col.names = c("Protein", "Label", "log2FC", "SE", "Tvalue", "DF", "pvalue"), row.names = F) #Write out data frame # GROUP COMPARISON PLOTS groupComparisonPlots(data = proteinComparisonResults, type = "VolcanoPlot") #Volcano plot, alpha = 0.05. #The volcano plot for 23C-29C does not show the protein names because none of them is significant. groupComparisonPlots(data = proteinComparisonResults, type = "ComparisonPlot") #Comparison plot, alpha = 0.05 #null device # 1 #There were 18 warnings (use warnings() to see them) #Warning messages: (I show first 5) #1: Removed 1 rows containing missing values (geom_hline). #2: Removed 1 rows containing missing values (geom_hline). #3: Removed 1 rows containing missing values (geom_hline). #4: Removed 1 rows containing missing values (geom_hline). #5: Removed 1 rows containing missing values (geom_hline).