# Adapted by Michael MacAskill from the original script by KRISTOFFER MAGNUSSON
# http://rpsychologist.com/an-r-script-to-automatically-look-at-pubmed-citation-counts-by-year-of-publication/
#
# Please read the PubMed E-utility Usage Guidelines here: http://www.ncbi.nlm.nih.gov/books/NBK25497/
#
# Check for updates and read more about this script at: http://rpsychologist.com
#
# Do not run this file directly. Instead, run the accompanying script file:
# MRM_Run_PubMed_analysis R03.R
# which calls the functions in this file as required.
#
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
# NB both of these files should have the ".txt" suffix changed to ".R". The ".txt" suffix was
# used purely because of filetype limitations in the PeerJ submission system.
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

library('RCurl')
library('XML')
library('plyr')
library('ggplot2')
library('directlabels')

########################
# Download PubMed Data #
########################

PubMedTrend <- function(query, yrStart = 1950, yrMax = 2009) {

  ### Start main search function ###
  getCount <- function(query.term) {
    # convert spaces to '+'
    query.gsub <- gsub(' ', '+', query.term)
    # convert some characters to browser-friendly (percent-encoded) text (better to be safe than sorry)
    # query.gsub <- gsub("'", '%22', query.gsub)  # XXXXX Doing this seemed to cause problems as at 15 March 2013 XXXXXX
    query.gsub <- gsub('\\[', '%5B', query.gsub)
    query.gsub <- gsub('\\]', '%5D', query.gsub)
    # required for nesting Boolean logic:
    # query.gsub <- gsub('\\(', '%28', query.gsub)
    # query.gsub <- gsub('\\)', '%29', query.gsub)

    # add a progress bar
    pb <- txtProgressBar(min = yrStart, max = yrMax, style = 3)
    # create an empty data frame to accumulate the yearly counts
    df <- data.frame(NULL)
    cat('Searching for: ', query.term, '\n')

    # Start retrieval loop: one esearch request per year of publication
    for (i in yrStart:yrMax) {
      # tell the progress bar how it's going
      setTxtProgressBar(pb, i)
      # restrict the query to publication date [ppdat] = year i
      query.parsed <- paste(query.gsub, '+AND+', i, '%5Bppdat%5D', sep = '')
      # Get XML with the number of hits for query.parsed
      pub.esearch <- getURL(paste('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&rettype=count&term=',
                                  query.parsed, sep = ''))
      # Parse the XML
      pub.esearch <- xmlTreeParse(pub.esearch, asText = TRUE)
      # Get the number of hits from the XML
      pub.count <- as.numeric(xmlValue(pub.esearch[['doc']][['eSearchResult']][['Count']]))
      # Don't add anything if the count is 0
      # if (pub.count != 0) {
      #   df <- rbind(df, data.frame('Year' = i, 'Count' = pub.count))
      #   print(df)
      # }
      df <- rbind(df, data.frame('Year' = i, 'Count' = pub.count))
      # Wait 0.4 s between requests, to stay under the PubMed limit of 3 queries per second:
      Sys.sleep(0.4)
    }
    # close the progress bar
    close(pb)
    return(df)
  }

  # Run getCount() for each of the query terms
  df <- ldply(query, getCount)

  # do some tidying of the result:
  names(df)[names(df) == '.id'] <- 'Term'
  df$Term <- factor(df$Term)
  return(df)
}

#######################
### Show total hits ###
#######################

PubTotalHits <- function(args = FALSE) {

  # Get the column total for query 'x'
  GetCount <- function(x) {
    df <- data.frame('search_name' = x,
                     'total_hits' = colSums(df[df$Term == x, ][3]))
  }

  # Index all query names
  query.index <- unique(df$Term)
  # Use GetCount() for every term in 'query.index' and return as a data frame
  df <- ldply(query.index, GetCount)

  # If the argument is 'query', add the full query string instead of the query name.
  # If no argument is specified, both the name and the query are shown.
  if (args == 'query' || args == FALSE) {
    # remove names
    names(query) <- NULL
    # add the queries to df
    df <- cbind(df, 'query' = query)
    # reorder columns
    df <- df[, c(1, 3, 2)]
    # remove 'names' if we only want the queries
    if (args == 'query') df <- df[-1]
  }
  return(df)
}
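
# ------------------------------------------------------------------------------------------------
# Example usage (a minimal, commented-out sketch; the actual driver script,
# MRM_Run_PubMed_analysis R03.R, is not reproduced here, and the search terms below are
# hypothetical placeholders). Note that PubTotalHits() reads 'df' and 'query' from the global
# environment, so the result of PubMedTrend() must be assigned to a variable named 'df' before
# PubTotalHits() is called.
#
# query <- c('saccade'     = 'saccade*[tiab]',
#            'antisaccade' = 'antisaccade*[tiab]')
#
# df <- PubMedTrend(query, yrStart = 1975, yrMax = 2012)   # yearly hit counts per search term
# PubTotalHits()                                           # total hits per term, with the full query shown
#
# ggplot(df, aes(x = Year, y = Count, colour = Term)) + geom_line()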