library(limma); library(GEOquery); library(desiR)


#-------------------------------------------
# Read in data + clean up

# download and load data
gds <- getGEO("GDS1329", destdir=".")

# convert to expressionSet
d <- GDS2eSet(gds)

# remove spike-ins 
d <- d[-grep("AFFX", rownames(d)), ]

# remove genes with no gene ID
rm.ind <- as.character(fData(d)[,"Gene ID"]) == ""
d <- d[-rm.ind, ]

# remove the apocrine group (not interesting for this example)
d <- d[, pData(d)$disease.state != "apocrine tumor"]


#-------------------------------------------
# Differential expression

# make design matrix
design <- model.matrix(~ droplevels(pData(d)$disease.state))
colnames(design) <- c("intercept", "bas.v.lum")


# fit model
fit <- lmFit(d, design)
fit2 <- eBayes(fit)


# number of significant genes
sum(p.adjust(fit2$p.value[,"bas.v.lum"], method="fdr") < 0.05) # 4830


# table of results (do not sort!!)
top <- topTable(fit2, n = nrow(d),
                 genelist = fData(d)[,"Gene symbol"],
                 coef = "bas.v.lum", sort="none")


#-------------------------------------------
# Calculate further values for ranking

# calculate SD for each probeset
top$SD <- apply(exprs(d), 1, sd)

# calculate probeset correlation with PCNA (only 1 PCNA probeset)
pcna.ind <- grep("^PCNA$", fData(d)[,"Gene symbol"])
top$pcna <- apply(exprs(d), 1, function(x) cor(x, exprs(d)[pcna.ind, ]))

# which probesets are associated with inflammation?
inflam.genes <- grep("inflamm", fData(d)[,"GO:Process"],
                     ignore.case = TRUE)
top$inflam <- 1:nrow(d) %in% inflam.genes

table(top$inflam) # 733 inflammation probe sets


#-------------------------------------------
# Apply desirability functions

# plot values and add desirability functions
# (tweak parameters until suitable values are chosen)
par(mfrow=c(3,2),
    mar=c(5,1,2,1),
    las=1)
hist(top$P.Value, breaks=35, col="grey", border="white",
     xlab="P-value", ylab="", yaxt="n", main="weight = 1")
des.line(top$P.Value, "d.low", des.args=c(cut1=0.0001,
                                   cut2=0.1, scale=2))
mtext("A", side=3, line=0.5, adj=0, font=2)

hist(top$logFC, breaks=35, col="grey", border="white",
     xlab="Log FC", ylab="", yaxt="n", main="weight = 1")
des.line(top$logFC, "d.ends", des.args=c(cut1=log2(1/3),
         cut2=log2(1/1.5), cut3=log2(1.5), cut4=log2(3),
         scale=0.5))
mtext("B", side=3, line=0.5, adj=0, font=2)

hist(top$AveExpr, breaks=35, col="grey", border="white",
     xlab="Mean expression", ylab="", yaxt="n", main="weight = 0.1")
des.line(top$AveExpr, "d.high", des.args=c(cut1=5.75, cut2=7,
                                    scale=0.5))
mtext("C", side=3, line=0.5, adj=0, font=2)

hist(top$SD, breaks=30, col="grey", border="white",
     xlab="SD of expression", ylab="", yaxt="n", main="weight = 0.1")
des.line(top$SD, "d.high", des.args=c(cut1=0.1, cut2=0.3, scale=0.5))
mtext("D", side=3, line=0.5, adj=0, font=2)

hist(abs(top$pcna), breaks=35, col="grey", border="white",
     xlab="Absolute correlation with PCNA", ylab="", yaxt="n",
     main="weight = 0.5")
des.line(abs(top$pcna), "d.low",  des.args=c(cut1=0.15, cut2=0.5,
         scale=1.5, des.min = 0.05))
mtext("E", side=3, line=0.5, adj=0, font=2)

plot(table(top$inflam), type="h", xlim=c(0, 3),
     ylab="", yaxt="n", col="grey", lwd=5,
     xlab="Associated with inflammation?",
     main="weight = 0.5", xaxt="n")
axis(1, tick=NA, at=c(1,2), labels=c("No","Yes"))

par(new = TRUE); plot.window(xlim=c(0,3), ylim=c(0,1))
segments(c(0.95, 1.5, 1.5), c(0.2, 1, 0.2),
         c(1.5, 2.05, 1.5), c(0.2, 1, 1) )
mtext("F", side=3, line=0.5, adj=0, font=2)


# calculate desirabilities (using same values as above)
top$d.pval <- d.low(top$P.Value, cut1=0.0001, cut2=0.1, scale=2)

top$d.fc   <- d.ends(top$logFC, cut1=log2(1/3), cut2=log2(1/1.5),
                     cut3=log2(1.5), cut4=log2(3), scale=0.5)

top$d.avg  <- d.high(top$AveExpr, cut1=5.75, cut2=7, scale=0.5)

top$d.sd   <- d.high(top$SD, cut1=0.1, cut2=0.3, scale=0.5)

top$d.pcna <- d.low(abs(top$pcna), cut1=0.15, cut2=0.5,
                    scale=1.5, des.min = 0.05)

top$d.inflam <- ifelse(top$inflam, 1, 0.2)


# calculate overall desirablity (choose weights... must be in same
# order as desirabilities)
top$D <- d.overall(top$d.avg, top$d.sd, top$d.pval,
                   top$d.fc, top$d.pcna, top$inflam,
                   weights=c(0.1, 0.1, 1, 1, 0.5, 0.5)
                   )

# set 0.7 as an arbitrary cut off
sum(top$D > 0.7) # 46


# volcano plot
par(las=1,
    mar=c(4.2,4.2,1,1))
plot(-log10(P.Value) ~ logFC, data=top, col="darkgrey",
     cex=0.75, xlab=~Log[2]~Fold~change,
     ylab=~-log[10]~(P~value))

# overplot the probesets above 0.7 (arbitrary cutoff)
points(-log10(P.Value) ~ logFC, pch=16, 
       data=top[top$D > 0.7, ])

# horizontal reference line at p = 0.05
abline(h=-log10(0.05), lty=2)
text(y=-log10(0.05), x=4, labels="p = 0.05",
     pos=3, font=2)


# reorder the results table
top2 <- top[order(top$D, decreasing=TRUE), ]
View(head(top2, 200))


# reorder by p-value and hard filter for mean and SD expression
top.p <- top2[order(top2$P.Value), ]
top.p <- subset(top.p, AveExpr > 6 & SD > 0.25)


# where are genes in a list ranked by p-value
match(rownames(top2)[1:10], rownames(top.p))


# plot for Figure 1
# generate values to convert to desirabilities
x <- seq(0,1, length.out=500)

par(mfrow=c(3,2),
    mar=c(3,4.5,3,1),
    las=1)

# "high is good"
d1 <- d.high(x, cut1 = 0.2, cut2 = 0.8)
plot(d1 ~ x, ylab="Desirability", ylim=c(0,1), type="l",
     main="High", xaxt="n")
axis(1, at=0:1, labels=c("min","max"))

# "low is good", with non-default scale and min.des arguments
d2 <- d.low(x, cut1 = 0.2, cut2 = 0.8, scale = 0.5, des.min = 0.1)
plot(d2 ~ x, ylab="Desirability", ylim=c(0,1), type="l",
     main="Low", xaxt="n")
axis(1, at=0:1, labels=c("min","max"))

# "extreme values are good"
d3 <- d.ends(x, cut1 = 0.2, cut2 = 0.3, cut3 = 0.7, cut4 = 0.8)
plot(d3 ~ x, ylab="Desirability", ylim=c(0,1), type="l",
     main="Ends", xaxt="n")
axis(1, at=0:1, labels=c("min","max"))

# "central or middle values are good"
d4 <- d.central(x, cut1 = 0.1, cut2 = 0.2, cut3 = 0.6, cut4 = 0.8)
plot(d4 ~ x, ylab="Desirability", ylim=c(0,1), type="l",
     main="Central", xaxt="n")
axis(1, at=0:1, labels=c("min","max"))

# "high is good" with a four parameter logistic function
d5 <- d.4pl(x, hill = 5, inflec = 0.5)
plot(d5 ~ x, ylab="Desirability", ylim=c(0,1), type="l",
     main="Logistic", xaxt="n")
axis(1, at=0:1, labels=c("min","max"))

# categorical: C is most desirable, followed by B
barplot(c(0.1, 0.5, 1), ylim=c(0,1.1),ylab="Desirability",
        main="Categorical", names.arg=c("A","B","C")); box()


sessionInfo()