# Written by Michael MacAskill, New Zealand Brain Research Institute, 2012-13
# 
# Based on a PubMed access script by KRISTOFFER MAGNUSSON, described at:
# http://rpsychologist.com/an-r-script-to-automatically-look-at-pubmed-citation-counts-by-year-of-publication/
#
#    Check for updates and read more about the original script at: 
#      http://rpsychologist.com  
#
#    Please read PubMed E-utility Usage Guidelines here: 
#      http://www.ncbi.nlm.nih.gov/books/NBK25497/
#
# This script runs in the free R statistical environment available from 
# http://www.r-project.org
#
# The analysis described in the accompanying paper was run under R version 
# 2.15.2 (2012-10-26)
#
# A number of third party libraries are also used, as given in the 'library' 
# statements in this file and the accompanying source file: 
# MRM_PubMed_functions R03.R
# 
# A more friendly IDE for R is available from http://www.rstudio.com
# 
# -- version R02 of this file was used to create the originally submitted analysis
# -- version R03 was used following the initial reviews to carry out additional journal-specific analyses
#
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
# NB both of these files should have the ".txt" suffix changed to ".R". The ".txt" suffix was
# used purely because of filetype limitations in the PeerJ submission system
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


# Session setup. The order of these statements matters: the workspace is
# cleared first, and the working directory must be set before the companion
# file is sourced, because that file is located by a relative path.
rm(list = ls()) # remove every object that ls() lists in the workspace
library(reshape2) # needed for the 'melt' and 'dcast' functions used below to re-arrange data

# The elapsed time is printed further down, once the PubMed queries have run.
startTime <- Sys.time() # keep track of how long this takes

### You have to set the working directory to where YOUR scripts are located: ###
# Replace the path below with your own. The file sourced next, and the output
# files written later by write.table, are all resolved relative to it.
setwd('~/Eye movement/Publications/2013 Eponym apostrophes/New analysis')

# Run the other source file, which loads libraries and defines functions for
# accessing PubMed (largely courtesy of KRISTOFFER MAGNUSSON).
# PubMedTrend and PubTotalHits, called below, are not defined in this file and
# are expected to come from it; presumably it also loads the plotting
# libraries used in the example plots -- confirm against that file.
source('MRM_PubMed_functions R03.R')


# Search terms sent to PubMed, as a named character vector. Each name is a
# two-letter condition code, a space, then 'with' or 'none' to say whether the
# search string uses the possessive (apostrophe) form of the eponym. Later
# code splits the names by character position, so keep that exact layout.
query <- c(
  # Parkinson
  "PD with" = "Parkinson's disease[TITLE]",
  "PD none" = "Parkinson disease[TITLE]",
  # Alzheimer
  "AD with" = "(Alzheimer's disease[TITLE] OR Alzheimer's dementia[TITLE])",
  "AD none" = "(Alzheimer disease[TITLE] OR Alzheimer dementia[TITLE])",
  # Huntington
  "HD with" = "(Huntington's disease[TITLE] OR Huntington's chorea[TITLE])",
  "HD none" = "(Huntington disease[TITLE] OR Huntington chorea[TITLE])",
  # Down
  "DS with" = "Down's syndrome[TITLE]",
  "DS none" = "Down syndrome[TITLE]",
  # Wilson
  "WD with" = "Wilson's disease[TITLE]",
  "WD none" = "Wilson disease[TITLE]",
  # Gaucher
  "GD with" = "Gaucher's disease[TITLE]",
  "GD none" = "Gaucher disease[TITLE]"
)

# In a secondary analysis, we examined only PD, in selected journals. e.g:
# query <- query[1:2] # PD only
# for (i in 1:length(query)) {
# 	query[i] <- paste(query[i],"AND Archives of Neurology[TA]")}

# Query PubMed through PubMedTrend (supplied by the other source file) for
# every search term, passing yrStart = 1960 and yrMax = 2012. The returned
# data frame is kept in 'df':
df <- PubMedTrend(query, yrStart = 1960, yrMax = 2012)

# The column that identifies each search comes back with the cryptic name
# '.id'. Call it 'Term' instead, and store it as a factor:
names(df) <- replace(names(df), names(df) == '.id', 'Term')
df[['Term']] <- factor(df[['Term']])

# Fetch the total number of hits for each entry in 'query'. The argument to
# PubTotalHits selects what is returned alongside the counts: 'query' for the
# whole search string, 'names' for just the short label, or no argument for
# both. e.g:
#df.hits <- PubTotalHits()
#df.hits2 <- PubTotalHits('query')
df.hits <- PubTotalHits('names')

# A label such as 'PD with' holds two pieces of information: characters 1-2
# are the condition ('PD') and characters 4-7 give the apostrophe form
# ('with' or 'none'). Put each into its own column:
df.hits$Condition <- substring(as.character(df.hits$search_name),
                               first = 1, last = 2)
df.hits$Apostrophe <- substring(as.character(df.hits$search_name),
                                first = 4, last = 7)

# Reshape long, then wide, to get one row per condition with a column of
# counts for each apostrophe form, and add their sum for reporting in the
# results:
df.hits.melt <- melt(df.hits)
df.hits.cast <- dcast(df.hits.melt, Condition ~ Apostrophe)
df.hits.cast[['Total']] <- df.hits.cast[['none']] + df.hits.cast[['with']]

# Report the time elapsed since 'startTime' was recorded:
print(difftime(Sys.time(), startTime))

# Save the per-year counts as returned (long format) to a tab-delimited file:
write.table(x = df,
            file = "Apostrophe counts.txt",
            row.names = FALSE,
            sep = "\t")

# DataGraph wants one row per year and one column per search term, so go
# long-to-wide before exporting a second tab-delimited file:
df.melt <- melt(df, id.vars = c("Term", "Year"))
df.cast <- dcast(df.melt, Year ~ Term)

write.table(x = df.cast,
            file = "File for DataGraph.txt",
            row.names = FALSE,
            sep = "\t")

#######################
#### EXAMPLE PLOTS ####
#######################

# Note: These plots don't work properly if you only have 1 disorder in the query. 

# As was done for the total hits, break labels such as 'PD with' into a
# condition code (characters 1-2, e.g. 'PD') and an apostrophe form
# (characters 4-7, 'with' or 'none'). The plots below group on these:
df[['Condition']] <- substring(as.character(df$Term), first = 1, last = 2)
df[['Apostrophe']] <- substring(as.character(df$Term), first = 4, last = 7)

# Melt with all four descriptive columns as identifiers, then sum over both
# apostrophe forms to get one total per condition per year. The aggregated
# column is the third one in the result; give it a meaningful name:
df.melt2 <- melt(df, id.vars = c('Condition', 'Year', 'Term', 'Apostrophe'))
df.totalByYear <- dcast(df.melt2, Condition + Year ~ ., fun.aggregate = sum)
colnames(df.totalByYear)[3] <- 'Total'

# Plot the total number of papers for each condition by year, one line per
# condition.
# NOTE(review): ggplot2 2.0.0 renamed 'show_guide' to 'show.legend'. The older
# name is kept to match the ggplot2 version this analysis was run with; switch
# the argument name if running against a newer ggplot2.
plot.1 <- ggplot(df.totalByYear,
                 aes(Year, Total, group = Condition, color = Condition)) +
  geom_line(show_guide = FALSE) + # FALSE spelled out: 'F' can be reassigned
  xlab('Year') +
  ylab('Number of publications')

# Add direct labels to the lines using the 'last.bumpup' method. The print()
# is explicit so that the figure is also drawn when this file is run with
# source(), where the value of a top-level expression is not auto-printed:
print(direct.label(plot.1, 'last.bumpup'))

# Plot number of papers with & without apostrophes for each condition by
# year, one line per search term.
# NOTE(review): ggplot2 2.0.0 renamed 'show_guide' to 'show.legend'. The older
# name is kept to match the ggplot2 version this analysis was run with; switch
# the argument name if running against a newer ggplot2.
plot.2 <- ggplot(df, aes(Year, Count, group = Term, color = Term)) +
  geom_line(show_guide = FALSE) + # FALSE spelled out: 'F' can be reassigned
  xlab('Year') +
  ylab('Number of publications') +
  # the x axis runs past the last queried year, presumably to leave room for
  # the labels drawn at the end of each line
  xlim(1960, 2020) +
  # direct labels not based on colour
  geom_dl(aes(label = Term), method = 'last.bumpup', show_guide = FALSE)

# Explicit print() so that the figure is also drawn when this file is run
# with source(), where the value of a top-level expression is not auto-printed:
print(plot.2)

# Calculate, for each condition and year, the percentage of publications that
# use the apostrophe form. The cast sums the counts into one 'with' and one
# 'none' column per condition/year. A year in which both are zero gives
# 0/0 = NaN rather than a percentage.
df.proportion <- dcast(df.melt2, Condition + Year ~ Apostrophe, sum)
df.proportion$Percentage <-
  df.proportion$with / (df.proportion$with + df.proportion$none) * 100

# Graph the (unsmoothed) yearly percentages, one line per condition.
# NOTE(review): ggplot2 2.0.0 renamed 'show_guide' to 'show.legend'. The older
# name is kept to match the ggplot2 version this analysis was run with; switch
# the argument name if running against a newer ggplot2.
plot.3 <- ggplot(df.proportion,
                 aes(Year, Percentage, group = Condition, color = Condition)) +
  geom_line(show_guide = TRUE) + # TRUE spelled out: 'T' can be reassigned
  xlab('Year') +
  ylab('Percentage of publications with apostrophe') +
  ylim(0, 100)

# Add direct labels to the lines using the 'last.bumpup' method. The print()
# is explicit so that the figure is also drawn when this file is run with
# source(), where the value of a top-level expression is not auto-printed:
print(direct.label(plot.3, 'last.bumpup'))

# Plot loess curves (as in the figure in the paper, but these seem to have a different
# smoothing width, not sure how to make it exactly comparable to DataGraph's):
# plot.4 <- ggplot(df.proportion, aes(Year, Percentage, group=Condition, color=Condition)) + 
# 	geom_point() + 
# 	geom_smooth(method='loess', se=FALSE, span= 0.5, degree = 2) +
# 	xlab('Year') +
# 	ylab('Percentage of publications with apostrophe') + 
# 	ylim(0,100)
# direct.label(plot.4, 'last.bumpup')