# Written by Michael MacAskill, New Zealand Brain Research Institute, 2012-13
#
# Based on a PubMed access script by KRISTOFFER MAGNUSSON, described at:
# http://rpsychologist.com/an-r-script-to-automatically-look-at-pubmed-citation-counts-by-year-of-publication/
#
# Check for updates and read more about the original script at:
# http://rpsychologist.com
#
# Please read PubMed E-utility Usage Guidelines here:
# http://www.ncbi.nlm.nih.gov/books/NBK25497/
#
# This script runs in the free R statistical environment available from
# http://www.r-project.org
#
# The analysis described in the accompanying paper was run under R version
# 2.15.2 (2012-10-26)
#
# A number of third party libraries are also used, as given in the 'library'
# statements in this file and the accompanying source file:
# MRM_PubMed_functions R03.R
#
# A more friendly IDE for R is available from http://www.rstudio.com
#
# -- version R02 of this file was used to create the originally submitted analysis
# -- version R03 was used following the initial reviews to carry out additional
#    journal-specific analyses
#
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
# NB both of these files should have the ".txt" suffix changed to ".R".
# The ".txt" suffix was used purely because of filetype limitations in the
# PeerJ submission system
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

# Purpose of this script: query PubMed for paper titles naming six eponymous
# disorders with and without the possessive apostrophe, tabulate yearly counts,
# write them to tab-delimited files, and draw some example plots.

# NOTE(review): wiping the workspace and hard-coding setwd() are kept because
# the script was written to be run stand-alone, but both are fragile: prefer
# running in a fresh R session started from the script's directory.
rm(list = ls()) # remove all objects from workspace

library(reshape2) # needed for the 'melt' and 'dcast' functions to re-arrange data

startTime <- Sys.time() # keep track of how long this takes

### You have to set the working directory to where YOUR scripts are located: ###
setwd('~/Eye movement/Publications/2013 Eponym apostrophes/New analysis')

# Run the other source file, which loads libraries and defines functions for
# accessing PubMed (largely courtesy of KRISTOFFER MAGNUSSON).
# PubMedTrend() and PubTotalHits() used below are not defined in this file, so
# they presumably come from here, as do the plotting functions (ggplot,
# geom_dl, direct.label) -- confirm against that file.
source('MRM_PubMed_functions R03.R')

# Construct the search queries we will use. Each name is a two-letter condition
# code, a space, then 'with' or 'none' (apostrophe present / absent); the
# substr() calls further down rely on exactly this layout.
query <- c(
  'PD with' = "Parkinson's disease[TITLE]",
  'PD none' = "Parkinson disease[TITLE]",
  'AD with' = "(Alzheimer's disease[TITLE] OR Alzheimer's dementia[TITLE])",
  'AD none' = "(Alzheimer disease[TITLE] OR Alzheimer dementia[TITLE])",
  'HD with' = "(Huntington's disease[TITLE] OR Huntington's chorea[TITLE])",
  'HD none' = "(Huntington disease[TITLE] OR Huntington chorea[TITLE])",
  'DS with' = "Down's syndrome[TITLE]",
  'DS none' = "Down syndrome[TITLE]",
  'WD with' = "Wilson's disease[TITLE]",
  'WD none' = "Wilson disease[TITLE]",
  'GD with' = "Gaucher's disease[TITLE]",
  'GD none' = "Gaucher disease[TITLE]"
)

# In a secondary analysis, we examined only PD, in selected journals. e.g:
# query <- query[1:2] # PD only
# query[] <- paste(query, "AND Archives of Neurology[TA]") # keeps the names

# Run the function (defined in the other file) to access PubMed. Put the result
# in a dataframe called df:
df <- PubMedTrend(query, yrStart = 1960, yrMax = 2012)

# For convenience, rename one of the returned variables to be less cryptic:
names(df)[names(df) == '.id'] <- 'Term'
df$Term <- factor(df$Term)

# Run this to get the total hits for each query in 'query'.
# Specify if you want to get the whole 'query' or just the 'names';
# enter no argument if you want both 'query' and 'name'. e.g:
# df.hits <- PubTotalHits()
# df.hits2 <- PubTotalHits('query')
# NOTE(review): PubTotalHits() is not passed the search results, so it
# presumably reads state left behind by PubMedTrend() -- confirm in the
# functions file before reordering these calls.
df.hits <- PubTotalHits('names')

# Split terms like 'PD with' into separate variables (e.g. Condition = 'PD',
# Apostrophe = 'with'): characters 1-2 are the condition code, 4-7 the
# 'with'/'none' flag.
df.hits$Condition <- substr(as.character(df.hits$search_name), 1, 2)
df.hits$Apostrophe <- substr(as.character(df.hits$search_name), 4, 7)

# Rearrange the aggregated data into a convenient form for reporting
# in the results (one row per condition, columns 'none' and 'with'):
df.hits.melt <- melt(df.hits)
df.hits.cast <- dcast(df.hits.melt, Condition ~ Apostrophe, value.var = 'value')
df.hits.cast$Total <- df.hits.cast$none + df.hits.cast$with

# How long did all this take?
print(Sys.time() - startTime)

# Store the raw counts:
write.table(df, file = 'Apostrophe counts.txt', sep = '\t', row.names = FALSE)

# Reshape the data into a form suitable for DataGraph to plot
# (one row per year, one column per search term):
df.melt <- melt(df, id.vars = c('Term', 'Year'))
df.cast <- dcast(df.melt, Year ~ Term, value.var = 'value')
write.table(df.cast, file = 'File for DataGraph.txt', sep = '\t',
            row.names = FALSE)

#######################
#### EXAMPLE PLOTS ####
#######################

# Note: These plots don't work properly if you only have 1 disorder in the query.

# Again split terms like 'PD with' into separate variables (e.g. Condition = 'PD',
# Apostrophe = 'with'). Will be useful for plotting:
df$Condition <- substr(as.character(df$Term), 1, 2)
df$Apostrophe <- substr(as.character(df$Term), 4, 7)

# Reshape the data for plotting:
df.melt2 <- melt(df, id.vars = c('Condition', 'Year', 'Term', 'Apostrophe'))
df.totalByYear <- dcast(df.melt2, Condition + Year ~ .,
                        fun.aggregate = sum, value.var = 'value')
# The 'Condition + Year ~ .' formula leaves a single aggregated column in
# position 3; give it a meaningful name.
names(df.totalByYear)[3] <- 'Total'

# Plot the total number of papers for each condition by year:
plot.1 <- ggplot(df.totalByYear,
                 aes(Year, Total, group = Condition, color = Condition)) +
  geom_line(show_guide = FALSE) +
  xlab('Year') +
  ylab('Number of publications')
direct.label(plot.1, 'last.bumpup')

# Plot number of papers with & without apostrophes for each condition by year.
# The x axis runs past the last data year (2012), which leaves space to the
# right of the lines where the direct labels are drawn.
plot.2 <- ggplot(df, aes(Year, Count, group = Term, color = Term)) +
  geom_line(show_guide = FALSE) +
  xlab('Year') +
  ylab('Number of publications') +
  xlim(1960, 2020) +
  geom_dl(aes(label = Term), method = 'last.bumpup',
          show_guide = FALSE) # direct labels not based on colour
plot.2

# Calculate the proportions with apostrophes by year:
df.proportion <- dcast(df.melt2, Condition + Year ~ Apostrophe,
                       fun.aggregate = sum, value.var = 'value')
# Years with no publications at all give 0/0 = NaN here.
df.proportion$Percentage <- df.proportion$with /
  (df.proportion$with + df.proportion$none) * 100

# and graph the raw numbers:
plot.3 <- ggplot(df.proportion,
                 aes(Year, Percentage, group = Condition, color = Condition)) +
  geom_line(show_guide = TRUE) +
  xlab('Year') +
  ylab('Percentage of publications with apostrophe') +
  ylim(0, 100)
direct.label(plot.3, 'last.bumpup')

# Plot loess curves (as in the figure in the paper, but these seem to have a
# different smoothing width, not sure how to make it exactly comparable to
# DataGraph's):
# plot.4 <- ggplot(df.proportion,
#                  aes(Year, Percentage, group = Condition, color = Condition)) +
#   geom_point() +
#   geom_smooth(method = 'loess', se = FALSE, span = 0.5, degree = 2) +
#   xlab('Year') +
#   ylab('Percentage of publications with apostrophe') +
#   ylim(0, 100)
# direct.label(plot.4, 'last.bumpup')