# Adapted by Michael MacAskill from the original script by KRISTOFFER MAGNUSSON
# http://rpsychologist.com/an-r-script-to-automatically-look-at-pubmed-citation-counts-by-year-of-publication/
#
# Please read the PubMed E-utility Usage Guidelines here: http://www.ncbi.nlm.nih.gov/books/NBK25497/
#
# Check for updates and read more about this script at: http://rpsychologist.com
#
# Do not run this file directly. Instead, run the accompanying script file:
# MRM_Run_PubMed_analysis R03.R
# which calls the functions in this file as required.
#
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
# NB both of these files should have the ".txt" suffix changed to ".R". The ".txt" suffix was
# used purely because of filetype limitations in the PeerJ submission system.
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

library('RCurl')
library('XML')
library('plyr')
library('ggplot2')
library('directlabels')

########################
# Download PubMed Data #
########################

PubMedTrend <- function(query, yrStart = 1950, yrMax = 2009) {

  ### Start main search function ###
  getCount <- function(query.term) {
    # convert spaces to '+'
    query.gsub <- gsub(' ', '+', query.term)
    # convert some characters to browser-friendly (percent-encoded) text (better to be safe than sorry)
    # query.gsub <- gsub("'", '%22', query.gsub)  # XXXXX Doing this seemed to cause problems as at 15 March 2013 XXXXXX
    query.gsub <- gsub('\\[', '%5B', query.gsub)
    query.gsub <- gsub('\\]', '%5D', query.gsub)
    # required for nesting Boolean logic:
    # query.gsub <- gsub('\\(', '%28', query.gsub)
    # query.gsub <- gsub('\\)', '%29', query.gsub)

    # add a progress bar
    pb <- txtProgressBar(min = yrStart, max = yrMax, style = 3)
    # create an empty data frame to accumulate the yearly counts
    df <- data.frame(NULL)
    cat('Searching for: ', query.term, '\n')

    # Start retrieval loop: one esearch request per year of publication
    for (i in yrStart:yrMax) {
      # tell the progress bar how it's going
      setTxtProgressBar(pb, i)
      # restrict the query to publication date [ppdat] = year i
      query.parsed <- paste(query.gsub, '+AND+', i, '%5Bppdat%5D', sep = '')
      # Get XML with the number of hits for query.parsed
      pub.esearch <- getURL(paste('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&rettype=count&term=',
                                  query.parsed, sep = ''))
      # Parse the XML
      pub.esearch <- xmlTreeParse(pub.esearch, asText = TRUE)
      # Get the number of hits from the XML
      pub.count <- as.numeric(xmlValue(pub.esearch[['doc']][['eSearchResult']][['Count']]))
      # Don't add anything if the count is 0
      # if (pub.count != 0) {
      #   df <- rbind(df, data.frame('Year' = i, 'Count' = pub.count))
      #   print(df)
      # }
      df <- rbind(df, data.frame('Year' = i, 'Count' = pub.count))
      # Wait 0.4 s between requests, to stay under the PubMed limit of 3 queries per second:
      Sys.sleep(0.4)
    }
    # close the progress bar
    close(pb)
    return(df)
  }

  # Run getCount() for each of the query terms
  df <- ldply(query, getCount)

  # do some tidying of the result:
  names(df)[names(df) == '.id'] <- 'Term'
  df$Term <- factor(df$Term)
  return(df)
}

#######################
### Show total hits ###
#######################

PubTotalHits <- function(args = FALSE) {

  # Get the column total for query 'x'
  GetCount <- function(x) {
    df <- data.frame('search_name' = x,
                     'total_hits' = colSums(df[df$Term == x, ][3]))
  }

  # Index all query names
  query.index <- unique(df$Term)
  # Use GetCount() for every term in 'query.index' and return as a data frame
  df <- ldply(query.index, GetCount)

  # If the argument is 'query', add the full query string instead of the query name.
  # If no argument is specified, both the name and the query are shown.
  if (args == 'query' || args == FALSE) {
    # remove names
    names(query) <- NULL
    # add the queries to df
    df <- cbind(df, 'query' = query)
    # reorder columns
    df <- df[, c(1, 3, 2)]
    # remove 'names' if we only want the queries
    if (args == 'query') df <- df[-1]
  }
  return(df)
}
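
# ------------------------------------------------------------------------------------------------
# Example usage (a minimal, commented-out sketch; the actual driver script,
# MRM_Run_PubMed_analysis R03.R, is not reproduced here, and the search terms below are
# hypothetical placeholders). Note that PubTotalHits() reads 'df' and 'query' from the global
# environment, so the result of PubMedTrend() must be assigned to a variable named 'df' before
# PubTotalHits() is called.
#
# query <- c('saccade'     = 'saccade*[tiab]',
#            'antisaccade' = 'antisaccade*[tiab]')
#
# df <- PubMedTrend(query, yrStart = 1975, yrMax = 2012)   # yearly hit counts per search term
# PubTotalHits()                                           # total hits per term, with the full query shown
#
# ggplot(df, aes(x = Year, y = Count, colour = Term)) + geom_line()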