library(limma); library(GEOquery); library(desiR) #------------------------------------------- # Read in data + clean up # download and load data gds <- getGEO("GDS1329", destdir=".") # convert to expressionSet d <- GDS2eSet(gds) # remove spike-ins d <- d[-grep("AFFX", rownames(d)), ] # remove genes with no gene ID rm.ind <- as.character(fData(d)[,"Gene ID"]) == "" d <- d[-rm.ind, ] # remove the apocrine group (not interesting for this example) d <- d[, pData(d)$disease.state != "apocrine tumor"] #------------------------------------------- # Differential expression # make design matrix design <- model.matrix(~ droplevels(pData(d)$disease.state)) colnames(design) <- c("intercept", "bas.v.lum") # fit model fit <- lmFit(d, design) fit2 <- eBayes(fit) # number of significant genes sum(p.adjust(fit2$p.value[,"bas.v.lum"], method="fdr") < 0.05) # 4830 # table of results (do not sort!!) top <- topTable(fit2, n = nrow(d), genelist = fData(d)[,"Gene symbol"], coef = "bas.v.lum", sort="none") #------------------------------------------- # Calculate further values for ranking # calculate SD for each probeset top$SD <- apply(exprs(d), 1, sd) # calculate probeset correlation with PCNA (only 1 PCNA probeset) pcna.ind <- grep("^PCNA$", fData(d)[,"Gene symbol"]) top$pcna <- apply(exprs(d), 1, function(x) cor(x, exprs(d)[pcna.ind, ])) # which probesets are associated with inflammation? inflam.genes <- grep("inflamm", fData(d)[,"GO:Process"], ignore.case = TRUE) top$inflam <- 1:nrow(d) %in% inflam.genes table(top$inflam) # 733 inflammation probe sets #------------------------------------------- # Apply desirability functions # plot values and add desirability functions # (tweak parameters until suitable values are chosen) par(mfrow=c(3,2), mar=c(5,1,2,1), las=1) hist(top$P.Value, breaks=35, col="grey", border="white", xlab="P-value", ylab="", yaxt="n", main="weight = 1") des.line(top$P.Value, "d.low", des.args=c(cut1=0.0001, cut2=0.1, scale=2)) mtext("A", side=3, line=0.5, adj=0, font=2) hist(top$logFC, breaks=35, col="grey", border="white", xlab="Log FC", ylab="", yaxt="n", main="weight = 1") des.line(top$logFC, "d.ends", des.args=c(cut1=log2(1/3), cut2=log2(1/1.5), cut3=log2(1.5), cut4=log2(3), scale=0.5)) mtext("B", side=3, line=0.5, adj=0, font=2) hist(top$AveExpr, breaks=35, col="grey", border="white", xlab="Mean expression", ylab="", yaxt="n", main="weight = 0.1") des.line(top$AveExpr, "d.high", des.args=c(cut1=5.75, cut2=7, scale=0.5)) mtext("C", side=3, line=0.5, adj=0, font=2) hist(top$SD, breaks=30, col="grey", border="white", xlab="SD of expression", ylab="", yaxt="n", main="weight = 0.1") des.line(top$SD, "d.high", des.args=c(cut1=0.1, cut2=0.3, scale=0.5)) mtext("D", side=3, line=0.5, adj=0, font=2) hist(abs(top$pcna), breaks=35, col="grey", border="white", xlab="Absolute correlation with PCNA", ylab="", yaxt="n", main="weight = 0.5") des.line(abs(top$pcna), "d.low", des.args=c(cut1=0.15, cut2=0.5, scale=1.5, des.min = 0.05)) mtext("E", side=3, line=0.5, adj=0, font=2) plot(table(top$inflam), type="h", xlim=c(0, 3), ylab="", yaxt="n", col="grey", lwd=5, xlab="Associated with inflammation?", main="weight = 0.5", xaxt="n") axis(1, tick=NA, at=c(1,2), labels=c("No","Yes")) par(new = TRUE); plot.window(xlim=c(0,3), ylim=c(0,1)) segments(c(0.95, 1.5, 1.5), c(0.2, 1, 0.2), c(1.5, 2.05, 1.5), c(0.2, 1, 1) ) mtext("F", side=3, line=0.5, adj=0, font=2) # calculate desirabilities (using same values as above) top$d.pval <- d.low(top$P.Value, cut1=0.0001, cut2=0.1, scale=2) top$d.fc <- d.ends(top$logFC, cut1=log2(1/3), cut2=log2(1/1.5), cut3=log2(1.5), cut4=log2(3), scale=0.5) top$d.avg <- d.high(top$AveExpr, cut1=5.75, cut2=7, scale=0.5) top$d.sd <- d.high(top$SD, cut1=0.1, cut2=0.3, scale=0.5) top$d.pcna <- d.low(abs(top$pcna), cut1=0.15, cut2=0.5, scale=1.5, des.min = 0.05) top$d.inflam <- ifelse(top$inflam, 1, 0.2) # calculate overall desirablity (choose weights... must be in same # order as desirabilities) top$D <- d.overall(top$d.avg, top$d.sd, top$d.pval, top$d.fc, top$d.pcna, top$inflam, weights=c(0.1, 0.1, 1, 1, 0.5, 0.5) ) # set 0.7 as an arbitrary cut off sum(top$D > 0.7) # 46 # volcano plot par(las=1, mar=c(4.2,4.2,1,1)) plot(-log10(P.Value) ~ logFC, data=top, col="darkgrey", cex=0.75, xlab=~Log[2]~Fold~change, ylab=~-log[10]~(P~value)) # overplot the probesets above 0.7 (arbitrary cutoff) points(-log10(P.Value) ~ logFC, pch=16, data=top[top$D > 0.7, ]) # horizontal reference line at p = 0.05 abline(h=-log10(0.05), lty=2) text(y=-log10(0.05), x=4, labels="p = 0.05", pos=3, font=2) # reorder the results table top2 <- top[order(top$D, decreasing=TRUE), ] View(head(top2, 200)) # reorder by p-value and hard filter for mean and SD expression top.p <- top2[order(top2$P.Value), ] top.p <- subset(top.p, AveExpr > 6 & SD > 0.25) # where are genes in a list ranked by p-value match(rownames(top2)[1:10], rownames(top.p)) # plot for Figure 1 # generate values to convert to desirabilities x <- seq(0,1, length.out=500) par(mfrow=c(3,2), mar=c(3,4.5,3,1), las=1) # "high is good" d1 <- d.high(x, cut1 = 0.2, cut2 = 0.8) plot(d1 ~ x, ylab="Desirability", ylim=c(0,1), type="l", main="High", xaxt="n") axis(1, at=0:1, labels=c("min","max")) # "low is good", with non-default scale and min.des arguments d2 <- d.low(x, cut1 = 0.2, cut2 = 0.8, scale = 0.5, des.min = 0.1) plot(d2 ~ x, ylab="Desirability", ylim=c(0,1), type="l", main="Low", xaxt="n") axis(1, at=0:1, labels=c("min","max")) # "extreme values are good" d3 <- d.ends(x, cut1 = 0.2, cut2 = 0.3, cut3 = 0.7, cut4 = 0.8) plot(d3 ~ x, ylab="Desirability", ylim=c(0,1), type="l", main="Ends", xaxt="n") axis(1, at=0:1, labels=c("min","max")) # "central or middle values are good" d4 <- d.central(x, cut1 = 0.1, cut2 = 0.2, cut3 = 0.6, cut4 = 0.8) plot(d4 ~ x, ylab="Desirability", ylim=c(0,1), type="l", main="Central", xaxt="n") axis(1, at=0:1, labels=c("min","max")) # "high is good" with a four parameter logistic function d5 <- d.4pl(x, hill = 5, inflec = 0.5) plot(d5 ~ x, ylab="Desirability", ylim=c(0,1), type="l", main="Logistic", xaxt="n") axis(1, at=0:1, labels=c("min","max")) # categorical: C is most desirable, followed by B barplot(c(0.1, 0.5, 1), ylim=c(0,1.1),ylab="Desirability", main="Categorical", names.arg=c("A","B","C")); box() sessionInfo()