library(bibliometrix) library(dplyr) library(purrr) library(stringr) library(grr) library(tidyr) library(ggplot2) library(cowplot) library(pscl) library(MASS) library(car) library(multcomp) library(xtable) library(emmeans) load("") #>---------------------------------------------------------- ##DATA_IMPORT---- # Initialise dataframe processed bib files dta <- data.frame() setwd("") file_list <- list.files() for (file in file_list){ # if the merged dataset does exist, append to it if (exists("dta")){ lines <-readLines(file) dta2 <- convert2df(lines, dbsource = "isi", format = "bibtex") dta <- rbind(dta, dta2) rm(dta2) } } bibdf <- dta # Normalise total citations variable bibdf$TCN <- 0 bibdf$TCN <- ifelse(bibdf$PY == 2010, bibdf$TC/(349+(6*365))*365, bibdf$TCN) bibdf$TCN <- ifelse(bibdf$PY == 2011, bibdf$TC/(349+(5*365))*365, bibdf$TCN) bibdf$TCN <- ifelse(bibdf$PY == 2012, bibdf$TC/(349+(4*365))*365, bibdf$TCN) bibdf$TCN <- ifelse(bibdf$PY == 2013, bibdf$TC/(349+(3*365))*365, bibdf$TCN) bibdf$TCN <- ifelse(bibdf$PY == 2014, bibdf$TC/(349+(2*365))*365, bibdf$TCN) # Sort on total normalised cites bibdf <- bibdf[order(bibdf$TCN, decreasing = T),] # Split most and least cited at median value for total normalised cites bibdf$cite_split <- ifelse(bibdf$TCN > median(bibdf$TCN), 1, 0) bibdf$cite_split <- as.factor(bibdf$cite_split) # Papers with no affiliation data which(is.na(bibdf$C1)) # Remove papers with no affiliation data bibdf <- bibdf[!(is.na(bibdf$C1)), ] # Search for equal co-authorship affiliations grep('contributed equally', bibdf$C1, ignore.case = T) grep('first authors', bibdf$C1, ignore.case = T) grep('last authors', bibdf$C1, ignore.case = T) grep('equal contributions', bibdf$C1, ignore.case = T) # Papers with no 'REPRINT' info that cause the script to stop. reprint <- which(!grepl('reprint', bibdf$C1, ignore.case = T)) # Remove papers with no 'REPRINT' info that cause the script to stop. bibdf <- bibdf[-c(reprint), ] #>---------------------------------------------------------- ##CREATE_DF---- # Initialise output df append <- data.frame(matrix(ncol = 11, nrow = 1)) colnames(append) <- c("Study", "UT" , "Journal", "Year", "Tot_Cites", "Tot_Cites_Norm", "Cite_Split", "Num_Affil", "Num_Auth", "Affil_No", "Freq") append$UT <- as.character(append$UT) append$Journal <- factor(append$Journal) append$Cite_Split <- factor(append$Cite_Split) append$Affil_No <- factor(append$Affil_No) # Loop through studies len <- c(1:length(bibdf$AU)) for(j in seq_along(len)){ # Create Authors data auth_names <- strsplit(bibdf$AU[j], ";") # Split author names (at semi-colon) auth_names <- unlist(auth_names) auth_names <- unique(auth_names) # Only include unique occurrences of authors name # Create Affiliations data split_af <- str_replace_all(bibdf$C1[j], fixed(" "), "") # Remove affiliation whitespace split_af <- strsplit(split_af, ";") # Split by affiliations (at semi-colon) split_af <- split_af[[1]][-1] # Remove redundant first row (reprint author). This address is repeated later affil_length <- length(split_af) split_af <- gsub("([^,]+,[^,]+),", "\\1;", split_af) # Replace every second comma with a semi-colon. This will identify complete names - last, first as one element split_af <- strsplit(split_af, ";") # Split at semi-colon # Delete duplicate names attached to each affiliation - there should only be unique names associated with each affiliation for(i in 1:length(split_af)){ split_af[[i]] <- unique(split_af[[i]]) } split_af <- unlist(split_af) # Delete elements with no comma - these only contain affiliation data z <- !grepl(',', split_af, ignore.case = T) # Logical identifying which elements contain no comma zvec <- which(z==T) # Vector of elements if(!length(zvec)==F){ # Need to skip this step if all elements contain a comma, otherwise zvec is NULL and the script stops split_af <- split_af[-zvec] # Delete these elements from split_af (zvec contains values) } split_af <- strsplit(split_af, ",") # Now split at comma to create last and first name columns mat <- matrix(unlist(split_af), ncol=2, byrow=TRUE) # Create matrix with name info (will also include redundant institution details) df <- as.data.frame(mat) colnames(df) <- c("Last", "First") df$Init <- substr(df$First,1,1) # Extract first name initial - will use this in the name matching df$Last <- paste(df$Last,df$Init) # Attach initial to last name auth_affil <- df$Last # This will be matched with auth_names match <- matches(auth_names, auth_affil) # Determine how many occurrences of each authors name from the affiliations data affil <- table(match$x) # Table from matched occurrences - affil_tab <- table(affil) # Summarise further to determine number of affiliations no_affil <- as.data.frame(affil_tab) colnames(no_affil) <- c("Affil_No", "Freq") no_affil <- cbind("Study" = j, "UT" = bibdf$UT[j], "Journal" = bibdf$SO[j], "Year" = bibdf$PY[j], "Tot_Cites" = bibdf$TC[j], "Tot_Cites_Norm" = bibdf$TCN[j], "Cite_Split" = bibdf$cite_split[j], "Num_Affil" = affil_length, "Num_Auth" = length(auth_names) , no_affil) append <- rbind(no_affil, append) # append data to existing } # Reorder levels of Affil_No append$Affil_No <- factor(append$Affil_No, levels = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "12")) # Convert to wide format append_wide <- spread(append, Affil_No, Freq) # Rename Affil cols names(append_wide)[c(10:20)] <- c("Affil_1", "Affil_2", "Affil_3", "Affil_4", "Affil_5", "Affil_6", "Affil_7", "Affil_8", "Affil_9", "Affil_10", "Affil_12") # Delete NA row (if Study = NA) del <- which(is.na(append_wide$Study)) append_wide <- append_wide[-del,] # Replace NA's with 0 append_wide[is.na(append_wide)] <- 0 # Delete NA Column append_wide <- append_wide[,-21] # Sort on Study Num append_wide <- append_wide[order(append_wide$Study),] # Create variable indicating max number of affiliations for study temp <- aggregate(as.numeric(Affil_No) ~ Study, data = append, max) append_wide$Affil_Max <- temp[,2] append_wide$Affil_Max <- factor(append_wide$Affil_Max) levels(append_wide$Affil_Max) # There is no level 10 for Affil_Max, because even though there was one study with 10 affiliations, this was the same study that had 12, so 10 wasn't counted. Also, level 11 is actually 12 (12 - as factor - converted to 11 - as numeric). # Quantiles of author numbers - Split at 4, 6, and 10 which represent the 25%, 50% and 75% quantiles for the number of authors on each paper. This categorises author numbers for the modelling quantile(append_wide$Num_Auth) append_wide$Cat_Auth <- 0 append_wide$Cat_Auth[append_wide$Num_Auth < 4] <- 1 append_wide$Cat_Auth[append_wide$Num_Auth > 3 & append_wide$Num_Auth < 6] <- 2 append_wide$Cat_Auth[append_wide$Num_Auth > 5 & append_wide$Num_Auth < 10] <- 3 append_wide$Cat_Auth[append_wide$Num_Auth > 9] <- 4 append_wide$Cat_Auth <- factor(append_wide$Cat_Auth) # # Add Publication Year # append_wide <- cbind(append_wide, "Year" = bibdf$PY) # # Normalise total citations variable # append_wide$Tot_Cites_Norm <- ifelse(append_wide$Year == 2010, append_wide$Tot_Cites/7, append_wide$Tot_Cites_Norm) # append_wide$Tot_Cites_Norm <- ifelse(append_wide$Year == 2011, append_wide$Tot_Cites/6, append_wide$Tot_Cites_Norm) # append_wide$Tot_Cites_Norm <- ifelse(append_wide$Year == 2012, append_wide$Tot_Cites/5, append_wide$Tot_Cites_Norm) # append_wide$Tot_Cites_Norm <- ifelse(append_wide$Year == 2013, append_wide$Tot_Cites/4, append_wide$Tot_Cites_Norm) # append_wide$Tot_Cites_Norm <- ifelse(append_wide$Year == 2014, append_wide$Tot_Cites/3, append_wide$Tot_Cites_Norm) # Subset and create DF to have no more than 6 affiliations. append_wide_sub <- subset(append_wide, as.numeric(Affil_Max) < 7) append_wide_sub <- append_wide_sub[,-16:-20] append_wide_sub$Affil_Max <- factor(append_wide_sub$Affil_Max) #>---------------------------------------------------------- ##SUMMARIES---- # Number of articles length(append_wide$Study) # Total citations over 5 years sum(append_wide$Tot_Cites) # Total author appearances over 5 years - These are not unique sum(append_wide$Num_Auth) # Total affiliations over 5 years sum(append_wide$Num_Affil) # Maximum number of citations on a paper max(append_wide$Tot_Cites) # Maximum normalised citations over 5 years max(append_wide$Tot_Cites_Norm) # Minimum number of citations on a paper min(append_wide$Tot_Cites) # Average number of citations per paper mean(append_wide$Tot_Cites) # Average normalized number of citations per paper mean(append_wide$Tot_Cites_Norm) # Median number of citations per paper median(append_wide$Tot_Cites) # Median normalized number of citations per paper median(append_wide$Tot_Cites_Norm) # Maximum number of authors on a paper max(append_wide$Num_Auth) # Minimum number of authors on a paper min(append_wide$Num_Auth) # Average number of authors per paper mean(append_wide$Num_Auth) # Median number of authors per paper median(append_wide$Num_Auth) # Maximum number of affiliations on a paper max(append_wide$Num_Affil) # Minimum number of affiliations on a paper min(append_wide$Num_Affil) # Average number of affiliations per paper mean(append_wide$Num_Affil) # Median number of affiliations per paper median(append_wide$Num_Affil) # Relative fractions (RFs) of authors for each affiliation append_wide$RF_1 <- append_wide$Affil_1/append_wide$Num_Auth append_wide$RF_2 <- append_wide$Affil_2/append_wide$Num_Auth append_wide$RF_3 <- append_wide$Affil_3/append_wide$Num_Auth append_wide$RF_4 <- append_wide$Affil_4/append_wide$Num_Auth append_wide$RF_5 <- append_wide$Affil_5/append_wide$Num_Auth append_wide$RF_6 <- append_wide$Affil_6/append_wide$Num_Auth RF_means <- map(append_wide[21:26], mean) RF_means <- unlist(RF_means) RF_av_percent <- RF_means*100 # Table giving number of papers in each category (Max_Affil * Cat_Auth) source("") xtabf <- crosstab(append_wide_sub, row.vars = "Cat_Auth", col.vars = "Affil_Max", type = "f") # Frequency xtabj <- crosstab(append_wide_sub, row.vars = "Cat_Auth", col.vars = "Affil_Max", type = "j") # Percent xtable(xtabf$crosstab, digits=0) xtable(xtabj$crosstab) # Diff number of authors - most vs least cited t.test(Num_Auth ~ Cite_Split, append_wide) # Diff number of affiliations - most vs least cited t.test(Num_Affil ~ Cite_Split, append_wide) # Diff in number of authors for Affil=1 t.test(Affil_1 ~ Cite_Split, append_wide) # Diff in number of authors for Affil=2 t.test(Affil_2 ~ Cite_Split, append_wide) # Diff in number of authors for Affil=3 t.test(Affil_3 ~ Cite_Split, append_wide) # Diff in number of authors for Affil=4 t.test(Affil_4 ~ Cite_Split, append_wide) # Diff in number of authors for Affil=5 t.test(Affil_5 ~ Cite_Split, append_wide) # Diff in number of authors for Affil=6 t.test(Affil_6 ~ Cite_Split, append_wide) # Number of papers in most-cited group length(append_wide$Num_Affil[append_wide$Cite_Split==1]) # Number of papers in least-cited group length(append_wide$Num_Affil[append_wide$Cite_Split==0]) # Compare ** number of papers** across each category of affiliation - not mutually exclusive i.e. a paper may have more than one affiliation category and will therefore be counted more than once. Also, Max Affil = 1 does not add up to 27612, because not all papers had at least one Max Affil. There may have been some papers that had only two Max Affils or more. tabfreq <- table(append$Affil_No, append$Cite_Split) xtable(tabfreq) (most_vec <- tabfreq[,2]) # Frequency of affiliations in most cited half of all papers (total_vec <- tabfreq[,1] + tabfreq[,2]) # Total frequency of affiliations across all papers prop.test(most_vec, total_vec) # Proportions test (equivalent to Chi-Square) prop.trend.test(most_vec, total_vec) # Proportions trend test # Compare ** number of authors** across each category of affiliation (Table 1) lowcite <- subset(append_wide, Cite_Split == 0) highcite <- subset(append_wide, Cite_Split == 1) lowcite_sum <- map(lowcite[,10:20], sum) lowcite_sum <- unlist(lowcite_sum) sum(lowcite_sum) highcite_sum <- map(highcite[,10:20], sum) highcite_sum <- unlist(highcite_sum) sum(highcite_sum) row_tot <- lowcite_sum + highcite_sum row_perc <- row_tot/sum(row_tot) col_tot <- c(sum(lowcite_sum), sum(highcite_sum), sum(row_tot), 1) col_perc <- col_tot/sum(row_tot) citedf <- data.frame(lowcite_sum, highcite_sum, row_tot, row_perc) citedf <- rbind(citedf[1:10,], c(0, 0, 0, 0) , citedf[-(1:10),]) #Insert row for Affil 11 citedf <- rbind(citedf, col_tot, col_perc) citedf$row_perc[14] <- NA rownames(citedf)[13] <- "col_tot" rownames(citedf)[14] <- "col_perc" xtable(citedf, digits=0) prop.test(citedf$highcite_sum[1:8], citedf$tot[1:8]) # Proportions test (equivalent to Chi-Square) prop.trend.test(citedf$highcite_sum[1:6], citedf$tot[1:6]) # Proportions trend test #>---------------------------------------------------------- ##MODELLING---- # # LM # summary(mod1a <- lm(Tot_Cites ~ Cat_Auth + Affil_Max, data=append_wide_sub)) # # Log-Linear # summary(mod1b <- lm(log(Tot_Cites+1) ~ Cat_Auth + Affil_Max, data=append_wide_sub)) # mod1b$coefficients*100 # # NB # summary(mod1c <- glm.nb(Tot_Cites ~ Cat_Auth + Affil_Max, data=append_wide_sub)) # exp(mod1c$coef) # # Poisson # summary(mod2 <- glm(Tot_Cites ~ Cat_Auth + Affil_Max, family="poisson", data=append_wide_sub)) # exp(mod2$coef) # # Compare # pchisq(2 * (logLik(mod1c) - logLik(mod2)), df = 1, lower.tail = FALSE) # Chi-square high indicating NB better fit. # # Compare conditional means and variances of Tot_Cites within each level of Affil_Max. Variances exceed means suggesting overdispersion. More evidence that NB better model to fit. # with(append_wide_sub, tapply(Tot_Cites, Affil_Max, function(x) {sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))})) # # # Logistic # summary(mod1d <- glm(Cite_Split ~ Cat_Auth + Affil_Max, data=append_wide_sub, family = "binomial")) # exp(mod1d$coef) # LM - Interaction append_wide_sub$Journal <- factor(append_wide_sub$Journal,levels(append_wide_sub$Journal)[c(2,1,4,3)]) summary(mod1e <- lm(Tot_Cites_Norm ~ Affil_Max + Cat_Auth + Affil_Max:Cat_Auth + Year + Journal, data=append_wide_sub)) summary(mod1f <- lm(Tot_Cites_Norm ~ Affil_Max + Cat_Auth + Year + Journal, data=append_wide_sub)) # Global test for dropping interaction anova(mod1e, mod1f) # What is the reference grid set at ref_grid(mod1e) # Marginal Means (mod1e_em <- emmeans(mod1e, ~ Affil_Max + Cat_Auth)) # emmip(mod1e, Affil_Max ~ Cat_Auth) # Test contrasts (LF <- contrast(mod1e_em, list(au1af2 = c(-1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), au1af3 = c(-1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), au1af4 = c(-1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), au2af2 = c(0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), au2af3 = c(0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), au2af4 = c(0, 0, 0, 0, 0, 0, -1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), au3af2 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), au3af3 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), au3af4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), au4af2 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 0, 0), au4af3 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 0, 0), au4af4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 1, 0, 0), af1au2 = c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), af1au3 = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), af1au4 = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), af2au2 = c(0, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), af2au3 = c(0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), af2au4 = c(0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0), af3au2 = c(0, 0, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), af3au3 = c(0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), af3au4 = c(0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), af4au2 = c(0, 0, 0, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), af4au3 = c(0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), af4au4 = c(0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0) ))) confint(LF) test(LF) # Print Supplementary Table library(broom) library(flextable) library(officer) modtab <- tidy(mod1e) modci <- data.frame(confint(mod1e)) modtab2 <- cbind(modtab[, 1:3], modci, modtab[, 4:5]) (ft <- regulartable(modtab2) %>% set_formatter_type(fmt_double = "%0.2f")) doc <- read_docx() %>% body_add_flextable(value = ft) print(doc, target = "") %>% invisible() # Global test for dropping interaction L <- matrix(c(0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) # Anova - should be same as contrast anova(mod1a, mod1e) # Test Contrasts # Auth Num = 1, Max Affil = 2 L <- matrix(c(0,1,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Auth Num = 1, Max Affil = 3 L <- matrix(c(0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Auth Num = 1, Max Affil = 4 L <- matrix(c(0,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Auth Num = 2, Max Affil = 2 L <- matrix(c(0,1,0,0,0,0, 0,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Auth Num = 2, Max Affil = 3 L <- matrix(c(0,0,1,0,0,0, 0,0,0,0,1,0, 0,0,0,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Auth Num = 2, Max Affil = 4 L <- matrix(c(0,0,0,1,0,0, 0,0,0,0,0,1, 0,0,0,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Auth Num = 3, Max Affil = 2 L <- matrix(c(0,1,0,0,0,0, 0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Auth Num = 3, Max Affil = 3 L <- matrix(c(0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,1,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Auth Num = 3, Max Affil = 4 L <- matrix(c(0,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,1,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Auth Num = 4, Max Affil = 2 L <- matrix(c(0,1,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,1,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Auth Num = 4, Max Affil = 3 L <- matrix(c(0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,1,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Auth Num = 4, Max Affil = 4 L <- matrix(c(0,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,1,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Max Affil = 1, Auth Num = 2 L <- matrix(c(0,0,0,0,0,0, 1,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Max Affil = 1, Auth Num = 3 L <- matrix(c(0,0,0,0,0,0, 0,1,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Max Affil = 1, Auth Num = 4 L <- matrix(c(0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Max Affil = 2, Auth Num = 2 L <- matrix(c(0,0,0,0,0,0, 1,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Max Affil = 2, Auth Num = 3 L <- matrix(c(0,0,0,0,0,0, 0,1,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Max Affil = 2, Auth Num = 4 L <- matrix(c(0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0, 0,1,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Max Affil = 3, Auth Num = 2 L <- matrix(c(0,0,0,0,0,0, 1,0,0,0,1,0, 0,0,0,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Max Affil = 3, Auth Num = 3 L <- matrix(c(0,0,0,0,0,0, 0,1,0,0,0,0, 0,0,0,1,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Max Affil = 3, Auth Num = 4 L <- matrix(c(0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,1,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Max Affil = 4, Auth Num = 2 L <- matrix(c(0,0,0,0,0,0, 1,0,0,0,0,1, 0,0,0,0,0,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Max Affil = 4, Auth Num = 3 L <- matrix(c(0,0,0,0,0,0, 0,1,0,0,0,0, 0,0,0,0,1,0, 0,0,0,0,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Max Affil = 4, Auth Num = 4 L <- matrix(c(0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,1,0,0),byrow=TRUE,ncol=24) (summary(glht(mod1e,L),test=Chisqtest())) (contrast <- confint(glht(mod1e,L),test=Chisqtest())) # Predictions nd <- cov_pat6[,1:2] # cbind predictions and SE's for 3 models nd <- cbind(nd, Mean_LM = predict(mod1a, newdata = nd, type = "response"), SE_LM = predict(mod1a, newdata = nd, type="response", se.fit = T)$se.fit, Mean_LogLM = predict(mod1b, newdata = nd, type = "response"), SE_LogLM = predict(mod1b, newdata = nd, type="response", se.fit = T)$se.fit, Mean_NB = predict(mod1c, newdata = nd, type = "response"), SE_NB = predict(mod1c, newdata = nd, type="response", se.fit = T)$se.fit) mean_cites <- aggregate(Tot_Cites ~ Cat_Auth + Affil_Max, data=append_wide_sub, mean) # Back Transform menas and SE's for loglinear mod nd2 <- cbind(nd, Mean_LogLM_Back = exp(nd[,5] + 0.5*summary(mod1b)$sigma^2), SE_LogLM_Back = exp(nd[,5])*nd[,6], Mean_Cites = mean_cites[,3]) # DF of means and CI's nd_ci <- cbind(nd2[,1:2], "Mean_Cites" = nd2[,11], "LM_lower" = nd2[,3]-(1.96*nd2[,4]), "LM_mean" = nd2[,3], "LM_upper" = nd2[,3]+(1.96*nd2[,4]), "LogLM_lower" = nd2[,9]-(1.96*nd2[,10]), "LogLM_mean" = nd2[,9], "LogLM_upper" = nd2[,9]+(1.96*nd2[,10]), "NB_lower" = nd2[,7]-(1.96*nd2[,8]), "NB_mean" = nd2[,7], "NB_upper" = nd2[,7]+(1.96*nd2[,8])) # Convert to long for plotting nd_ci_long <- data.frame(cbind("Cat_Auth" = rep(nd_ci$Cat_Auth,4), "Affil_Max" = rep(nd_ci$Affil_Max,4), "Model" = rep(c(1:4), each=24), "Mean" = c(nd_ci$Mean_Cites, nd_ci$LM_mean, nd_ci$LogLM_mean, nd_ci$NB_mean), "Lower" = c(rep(NA, 24), nd_ci$LM_lower, nd_ci$LogLM_lower, nd_ci$NB_lower), "Upper" = c(rep(NA, 24), nd_ci$LM_upper, nd_ci$LogLM_upper, nd_ci$NB_upper))) # Format Model Output # LM cia <- as.data.frame(confint(mod1a)) mod1a_out <- cbind(mod1a$coefficients, coef(summary(mod1a))[,2], cia[,1], cia[,2], coef(summary(mod1a))[,3], coef(summary(mod1a))[,4]) mod1a_out <- data.frame(mod1a_out) colnames(mod1a_out) <- c("beta", "S.E.", "2.5%", "97.5%","t", "p-value") xtable(mod1a_out, digits=3) # Log-LM cib <- as.data.frame(confint(mod1b)) mod1b_out <- cbind(mod1b$coefficients, coef(summary(mod1b))[,2], cib[,1], cib[,2], coef(summary(mod1b))[,3], coef(summary(mod1b))[,4]) mod1b_out <- data.frame(mod1b_out) colnames(mod1b_out) <- c("beta", "S.E.", "2.5%", "97.5%","t", "p-value") xtable(mod1b_out, digits=3) # NB cic <- as.data.frame(exp(confint(mod1c))) mod1c_out <- cbind(mod1c$coefficients, exp(mod1c$coefficients), coef(summary(mod1c))[,2], cic[,1], cic[,2], coef(summary(mod1c))[,3], coef(summary(mod1c))[,4]) mod1c_out <- data.frame(mod1c_out) colnames(mod1c_out) <- c("beta", "Rate", "S.E.", "2.5%", "97.5%","Z", "p-value") xtable(mod1c_out, digits=3) # Check fit - p value interpretation (should be > 0.05 for good fit) 1 - pchisq(mod1c$deviance, mod1c$df.residual) # Check fit - resid dev interpretation (should not be above this value for a good fit) qchisq(0.95, df.residual(mod1c)) deviance(mod1c) #>---------------------------------------------------------- ##PLOTS---- # HISTOGRAMS # Hist Citations hist_1 <- ggplot(append_wide, aes(Tot_Cites)) + geom_histogram(binwidth = 1) + scale_x_continuous("Citations/Paper",breaks=seq(0,500,50), limits=c(0, 500)) + labs(y = "Frequency") # Hist Authors hist_2 <- ggplot(append_wide, aes(Num_Auth)) + geom_histogram(binwidth = 1) + scale_x_continuous("Authors/Paper",breaks=seq(0,50,5), limits=c(0, 50)) + labs(y = "Frequency") # Hist Affiliations hist_3 <- ggplot(append_wide, aes(Num_Affil)) + geom_histogram(binwidth = 1) + scale_x_continuous("Affiliations/Paper",breaks=seq(0,50,5), limits=c(0, 50)) + labs(y = "Frequency") plot_grid(hist_1, hist_2, hist_3, align='h', ncol = 1, labels=c('A', 'B', 'C')) med.n <- function(x){ return(c(y = median(x)+2, label = round(median(x),0))) # experiment with the multiplier to find the perfect position } mean.n <- function(x){ return(c(y = -2, label = round(mean(x),0))) # experiment with the multiplier to find the perfect position } # BOXPLOTS ggplot(append_wide_sub, aes(Affil_Max, Tot_Cites_Norm)) + geom_boxplot(outlier.size = 0.2) + coord_cartesian(ylim=c(0, 100)) + facet_grid(~Cat_Auth) + theme(plot.subtitle = element_text(vjust = 1), plot.caption = element_text(vjust = 1), axis.line = element_line(colour = "gray50", size = 0.5, linetype = "solid"), panel.background = element_rect(fill = NA)) + labs(x = "Maximum Affiliation", y = "Normalised Citation Count") + stat_summary(fun.data = mean.n, geom = "text") + stat_summary(fun.data = med.n, geom = "text") # CORRELATIONS cor_df <- append_wide_sub[, c(4,6,7,14,8:13)] cor_df$Affil_Max <- as.numeric(cor_df$Affil_Max) cor_df$Cat_Auth <- as.numeric(cor_df$Cat_Auth) names(cor_df)=c("Citations", "Affiliations","Authors","Max. Affiliation","Affiliation = 1","Affiliation = 2","Affiliation = 3","Affiliation = 4","Affiliation = 5","Affiliation = 6") cor_mat <- cor(cor_df, use="complete.obs") source("") rquery.cormat(cor_df) # BALLOON PLOT # Creating covariate pattern from ungrouped data test <- append_wide[,c(4,19,20)] temp <- aggregate(Tot_Cites ~ Cat_Auth + Affil_Max, data = test, sum) # Create zero values for the missing covariate patterns temp <- rbind(temp, c(1,8,0)) temp <- rbind(temp, c(2,8,0)) temp <- rbind(temp, c(1,9,0)) temp <- rbind(temp, c(2,9,0)) temp <- rbind(temp, c(1,11,0)) temp <- rbind(temp, c(2,11,0)) temp <- rbind(temp, c(3,11,0)) # Only first 6 affiliations. cov_pat6 <- temp[1:24,] # Add offset - number of papers per category cov_pat6$offs <- c(3142, 2715, 2898, 1387, 1371, 2207, 3845, 3374, 454, 811, 1509, 1859, 103, 210, 419, 695, 24, 61, 119, 250, 4, 9, 35, 64) # P 110 of R Graphics Cookbook - Models summed citations - from individual paper data. ggplot(cov_pat6, aes(x=Affil_Max, y=Cat_Auth, size=Tot_Cites)) + geom_point(shape=21, colour="black", fill="cornsilk") + scale_size_area(max_size=40, guide=F) + labs(x = "Maximum Affiliation", y = "Author Number") + geom_text(aes(label=Tot_Cites), vjust=7, colour="grey50", size=4) # # Expected? summed citations if number of papers in each category was the same (i.e. original data -> fewer papers with higher Max_Affil) # a <- table(append_wide_sub$Cat_Auth,append_wide_sub$Affil_Max) # b <- as.data.frame.matrix(a) # c <- max(b)/b # d <- c(c[,1],c[,2],c[,3],c[,4],c[,5],c[,6]) # e <- cbind(cov_pat6, "adj" = d) # e$Tot_Cites2 <- e$Tot_Cites*e$adj # ggplot(e, aes(x=Affil_Max, y=Cat_Auth, size=Tot_Cites2)) + # geom_point(shape=21, colour="black", fill="cornsilk") + # scale_size_area(max_size=40, guide=F) + # labs(x = "Maximum Affiliation", y = "Author Category") + # geom_text(aes(label=Tot_Cites2), vjust=7, colour="grey50", size=4) # LINE PLOTS # With CI's nd_ci_long %>% mutate(Model = as.factor(Model)) %>% ggplot(aes(Affil_Max, Mean)) + geom_line(aes(color = Model, group = Model, linetype = Model), size = 0.6) + scale_linetype_manual(values=c(2,1,1,1)) + geom_ribbon(alpha = .3, aes(ymin = Lower, ymax = Upper, group = Model, fill = Model)) + scale_color_manual(values=c("black", "dodgerblue", "red3", "springgreen3")) + scale_fill_manual(values=c("black", "dodgerblue", "red3", "springgreen3")) + facet_grid(. ~ Cat_Auth) # Without CI's # Create gridline cuts minorsy <- seq(50,200,by=10) minorsx <- seq(1,6,by=1) nd_ci_long %>% mutate(Model = as.factor(Model)) %>% ggplot(aes(Affil_Max, Mean)) + geom_hline(mapping=NULL, yintercept=minorsy,colour='grey95') + geom_vline(mapping=NULL, xintercept=minorsx,colour='grey95') + geom_line(aes(color = Model, group = Model, linetype = Model), size = 0.6) + scale_linetype_manual(values=c(2,1,1,1)) + scale_color_manual(values=c("black", "dodgerblue", "red3", "springgreen3"), labels=c("Data", "Linear Model", "LogLinear Model", "Negative Binomial Model")) + facet_grid(. ~ Cat_Auth) + scale_x_continuous("Maximum Affiliation", breaks=seq(1,6,1)) + scale_y_continuous("Citations", breaks=seq(0,200,10), limits=c(50, 200)) + theme(legend.position="top", legend.title=element_blank()) + guides(color = guide_legend(override.aes = list(linetype = c(2,1,1,1))), linetype = FALSE)