library(bibliometrix)
library(dplyr)
library(purrr)
library(stringr)
library(grr)
library(tidyr)
library(ggplot2)
library(cowplot)
library(pscl)
library(MASS)
library(car)
library(multcomp)
library(xtable)
library(emmeans)
load("")

#>----------------------------------------------------------

##DATA_IMPORT----

# Initialise dataframe processed bib files
dta <- data.frame()

setwd("")
file_list <- list.files()

for (file in file_list){
  # if the merged dataset does exist, append to it
  if (exists("dta")){
    lines <-readLines(file)
    dta2 <- convert2df(lines, dbsource = "isi", format = "bibtex")
    dta <- rbind(dta, dta2)
    rm(dta2)
  }
}

bibdf <- dta


# Normalise total citations variable
bibdf$TCN <- 0
bibdf$TCN <- ifelse(bibdf$PY == 2010,  bibdf$TC/(349+(6*365))*365, bibdf$TCN)
bibdf$TCN <- ifelse(bibdf$PY == 2011,  bibdf$TC/(349+(5*365))*365, bibdf$TCN)
bibdf$TCN <- ifelse(bibdf$PY == 2012,  bibdf$TC/(349+(4*365))*365, bibdf$TCN)
bibdf$TCN <- ifelse(bibdf$PY == 2013,  bibdf$TC/(349+(3*365))*365, bibdf$TCN)
bibdf$TCN <- ifelse(bibdf$PY == 2014,  bibdf$TC/(349+(2*365))*365, bibdf$TCN)


# Sort on total normalised cites
bibdf <- bibdf[order(bibdf$TCN, decreasing = T),]
# Split most and least cited at median value for total normalised cites
bibdf$cite_split <- ifelse(bibdf$TCN > median(bibdf$TCN), 1, 0)
bibdf$cite_split <- as.factor(bibdf$cite_split)

# Papers with no affiliation data
which(is.na(bibdf$C1))
# Remove papers with no affiliation data
bibdf <- bibdf[!(is.na(bibdf$C1)), ]

# Search for equal co-authorship affiliations
grep('contributed equally', bibdf$C1, ignore.case = T)
grep('first authors', bibdf$C1, ignore.case = T)
grep('last authors', bibdf$C1, ignore.case = T)
grep('equal contributions', bibdf$C1, ignore.case = T)

# Papers with no 'REPRINT' info that cause the script to stop.
reprint <- which(!grepl('reprint', bibdf$C1, ignore.case = T))
# Remove papers with no 'REPRINT' info that cause the script to stop.
bibdf <- bibdf[-c(reprint), ]


#>----------------------------------------------------------

##CREATE_DF----

# Initialise output df
append <- data.frame(matrix(ncol = 11, nrow = 1))
colnames(append) <- c("Study", "UT" , "Journal", "Year", "Tot_Cites", "Tot_Cites_Norm", "Cite_Split", "Num_Affil", "Num_Auth", "Affil_No", "Freq")
append$UT <- as.character(append$UT)
append$Journal <- factor(append$Journal)
append$Cite_Split <- factor(append$Cite_Split)
append$Affil_No <- factor(append$Affil_No)

# Loop through studies
len <- c(1:length(bibdf$AU))

for(j in seq_along(len)){
  # Create Authors data
  auth_names <- strsplit(bibdf$AU[j], ";") # Split author names (at semi-colon)
  auth_names <- unlist(auth_names)
  auth_names <- unique(auth_names) # Only include unique occurrences of authors name
  # Create Affiliations data
  split_af <- str_replace_all(bibdf$C1[j], fixed(" "), "") # Remove affiliation whitespace
  split_af <- strsplit(split_af, ";") # Split by affiliations (at semi-colon)
  split_af <- split_af[[1]][-1] # Remove redundant first row (reprint author). This address is repeated later
  affil_length <- length(split_af)
  split_af <- gsub("([^,]+,[^,]+),", "\\1;", split_af) # Replace every second comma with a semi-colon. This will identify complete names - last, first as one element
  split_af <- strsplit(split_af, ";") # Split at semi-colon
  # Delete duplicate names attached to each affiliation - there should only be unique names associated with each affiliation
  for(i in 1:length(split_af)){
    split_af[[i]] <- unique(split_af[[i]]) 
  }
  split_af <- unlist(split_af)
  # Delete elements with no comma - these only contain affiliation data
  z <- !grepl(',', split_af, ignore.case = T) # Logical identifying which elements contain no comma
  zvec <- which(z==T) # Vector of elements
  if(!length(zvec)==F){ # Need to skip this step if all elements contain a comma, otherwise zvec is NULL and the script stops
  split_af <- split_af[-zvec] # Delete these elements from split_af (zvec contains values)
  }
  split_af <- strsplit(split_af, ",") # Now split at comma to create last and first name columns
  mat  <- matrix(unlist(split_af), ncol=2, byrow=TRUE) # Create matrix with name info (will also include redundant institution details)
  df   <- as.data.frame(mat)
  colnames(df) <- c("Last", "First")
  df$Init <- substr(df$First,1,1) # Extract first name initial - will use this in the name matching
  df$Last <- paste(df$Last,df$Init) # Attach initial to last name
  auth_affil <- df$Last # This will be matched with auth_names
  match <- matches(auth_names, auth_affil) # Determine how many occurrences of each authors name from the affiliations data
  affil <- table(match$x) # Table from matched occurrences - 
  affil_tab <- table(affil) # Summarise further to determine number of affiliations
  no_affil <- as.data.frame(affil_tab)
  colnames(no_affil) <- c("Affil_No", "Freq")
  no_affil <- cbind("Study" = j, "UT" =  bibdf$UT[j], "Journal" = bibdf$SO[j], "Year" = bibdf$PY[j], "Tot_Cites" = bibdf$TC[j], "Tot_Cites_Norm" = bibdf$TCN[j], "Cite_Split" = bibdf$cite_split[j], "Num_Affil" = affil_length, "Num_Auth" = length(auth_names) , no_affil)
  append <- rbind(no_affil, append) # append data to existing
}

# Reorder levels of Affil_No
append$Affil_No <- factor(append$Affil_No, levels = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "12"))


# Convert to wide format
append_wide <- spread(append, Affil_No, Freq)
# Rename Affil cols 
names(append_wide)[c(10:20)] <- c("Affil_1", "Affil_2", "Affil_3", "Affil_4", "Affil_5", "Affil_6", "Affil_7", "Affil_8", "Affil_9", "Affil_10", "Affil_12")
# Delete NA row (if Study = NA)
del <- which(is.na(append_wide$Study))
append_wide <- append_wide[-del,]
# Replace NA's with 0
append_wide[is.na(append_wide)] <- 0
# Delete NA Column
append_wide <- append_wide[,-21]


# Sort on Study Num
append_wide <- append_wide[order(append_wide$Study),]
# Create variable indicating max number of affiliations for study
temp <- aggregate(as.numeric(Affil_No) ~ Study, data = append, max)
append_wide$Affil_Max <- temp[,2]
append_wide$Affil_Max <- factor(append_wide$Affil_Max)
levels(append_wide$Affil_Max) # There is no level 10 for Affil_Max, because even though there was one study with 10 affiliations, this was the same study that had 12, so 10 wasn't counted. Also, level 11 is actually 12 (12 - as factor - converted to 11 - as numeric).


# Quantiles of author numbers - Split at 4, 6, and 10 which represent the 25%, 50% and 75% quantiles for the number of authors on each paper. This categorises author numbers for the modelling
quantile(append_wide$Num_Auth)
append_wide$Cat_Auth <- 0
append_wide$Cat_Auth[append_wide$Num_Auth < 4] <- 1
append_wide$Cat_Auth[append_wide$Num_Auth > 3 & append_wide$Num_Auth < 6] <- 2
append_wide$Cat_Auth[append_wide$Num_Auth > 5 & append_wide$Num_Auth < 10] <- 3
append_wide$Cat_Auth[append_wide$Num_Auth > 9] <- 4
append_wide$Cat_Auth <- factor(append_wide$Cat_Auth)


# # Add Publication Year
# append_wide <- cbind(append_wide, "Year" = bibdf$PY)

# # Normalise total citations variable 
# append_wide$Tot_Cites_Norm <- ifelse(append_wide$Year == 2010,  append_wide$Tot_Cites/7, append_wide$Tot_Cites_Norm)
# append_wide$Tot_Cites_Norm <- ifelse(append_wide$Year == 2011,  append_wide$Tot_Cites/6, append_wide$Tot_Cites_Norm)
# append_wide$Tot_Cites_Norm <- ifelse(append_wide$Year == 2012,  append_wide$Tot_Cites/5, append_wide$Tot_Cites_Norm)
# append_wide$Tot_Cites_Norm <- ifelse(append_wide$Year == 2013,  append_wide$Tot_Cites/4, append_wide$Tot_Cites_Norm)
# append_wide$Tot_Cites_Norm <- ifelse(append_wide$Year == 2014,  append_wide$Tot_Cites/3, append_wide$Tot_Cites_Norm)


# Subset and create DF to have no more than 6 affiliations.
append_wide_sub <- subset(append_wide, as.numeric(Affil_Max) < 7)
append_wide_sub <- append_wide_sub[,-16:-20]
append_wide_sub$Affil_Max <- factor(append_wide_sub$Affil_Max)


#>----------------------------------------------------------

##SUMMARIES----

# Number of articles 
length(append_wide$Study)

# Total citations over 5 years
sum(append_wide$Tot_Cites)
# Total author appearances over 5 years - These are not unique 
sum(append_wide$Num_Auth)
# Total affiliations over 5 years
sum(append_wide$Num_Affil)


# Maximum number of citations on a paper
max(append_wide$Tot_Cites)
# Maximum normalised citations over 5 years
max(append_wide$Tot_Cites_Norm)
# Minimum number of citations on a paper
min(append_wide$Tot_Cites)
# Average number of citations per paper
mean(append_wide$Tot_Cites)
# Average normalized number of citations per paper
mean(append_wide$Tot_Cites_Norm)
# Median number of citations per paper
median(append_wide$Tot_Cites)
# Median normalized number of citations per paper
median(append_wide$Tot_Cites_Norm)


# Maximum number of authors on a paper
max(append_wide$Num_Auth)
# Minimum number of authors on a paper
min(append_wide$Num_Auth)
# Average number of authors per paper
mean(append_wide$Num_Auth)
# Median number of authors per paper
median(append_wide$Num_Auth)


# Maximum number of affiliations on a paper
max(append_wide$Num_Affil)
# Minimum number of affiliations on a paper
min(append_wide$Num_Affil)
# Average number of affiliations per paper
mean(append_wide$Num_Affil)
# Median number of affiliations per paper
median(append_wide$Num_Affil)


# Relative fractions (RFs) of authors for each affiliation
append_wide$RF_1 <- append_wide$Affil_1/append_wide$Num_Auth
append_wide$RF_2 <- append_wide$Affil_2/append_wide$Num_Auth
append_wide$RF_3 <- append_wide$Affil_3/append_wide$Num_Auth
append_wide$RF_4 <- append_wide$Affil_4/append_wide$Num_Auth
append_wide$RF_5 <- append_wide$Affil_5/append_wide$Num_Auth
append_wide$RF_6 <- append_wide$Affil_6/append_wide$Num_Auth
RF_means <- map(append_wide[21:26], mean)
RF_means <- unlist(RF_means)
RF_av_percent <- RF_means*100


# Table giving number of papers in each category (Max_Affil * Cat_Auth)
source("")
xtabf <- crosstab(append_wide_sub, row.vars = "Cat_Auth", col.vars = "Affil_Max", type = "f") # Frequency
xtabj <- crosstab(append_wide_sub, row.vars = "Cat_Auth", col.vars = "Affil_Max", type = "j") # Percent
xtable(xtabf$crosstab, digits=0)
xtable(xtabj$crosstab)


# Diff number of authors - most vs least cited
t.test(Num_Auth ~ Cite_Split, append_wide)
# Diff number of affiliations - most vs least cited
t.test(Num_Affil ~ Cite_Split, append_wide)

# Diff in number of authors for Affil=1
t.test(Affil_1 ~ Cite_Split, append_wide) 
# Diff in number of authors for Affil=2
t.test(Affil_2 ~ Cite_Split, append_wide)
# Diff in number of authors for Affil=3
t.test(Affil_3 ~ Cite_Split, append_wide)
# Diff in number of authors for Affil=4
t.test(Affil_4 ~ Cite_Split, append_wide)
# Diff in number of authors for Affil=5
t.test(Affil_5 ~ Cite_Split, append_wide)
# Diff in number of authors for Affil=6
t.test(Affil_6 ~ Cite_Split, append_wide)


# Number of papers in most-cited group
length(append_wide$Num_Affil[append_wide$Cite_Split==1])
# Number of papers in least-cited group
length(append_wide$Num_Affil[append_wide$Cite_Split==0])


# Compare ** number of papers** across each category of affiliation - not mutually exclusive i.e. a paper may have more than one affiliation category and will therefore be counted more than once. Also, Max Affil = 1 does not add up to 27612, because not all papers had at least one Max Affil. There may have been some papers that had only two Max Affils or more.
tabfreq <- table(append$Affil_No, append$Cite_Split)
xtable(tabfreq)

(most_vec <- tabfreq[,2]) # Frequency of affiliations in most cited half of all papers
(total_vec <- tabfreq[,1] + tabfreq[,2]) # Total frequency of affiliations across all papers
prop.test(most_vec, total_vec) # Proportions test (equivalent to Chi-Square)
prop.trend.test(most_vec, total_vec) # Proportions trend test


# Compare ** number of authors** across each category of affiliation (Table 1)
lowcite <- subset(append_wide, Cite_Split == 0)
highcite <- subset(append_wide, Cite_Split == 1)
lowcite_sum <- map(lowcite[,10:20], sum)
lowcite_sum <- unlist(lowcite_sum)
sum(lowcite_sum)
highcite_sum <- map(highcite[,10:20], sum)
highcite_sum <- unlist(highcite_sum)
sum(highcite_sum)
row_tot <- lowcite_sum + highcite_sum
row_perc <- row_tot/sum(row_tot)
col_tot <- c(sum(lowcite_sum), sum(highcite_sum), sum(row_tot), 1) 
col_perc <- col_tot/sum(row_tot)
citedf <- data.frame(lowcite_sum, highcite_sum, row_tot, row_perc)
citedf <-  rbind(citedf[1:10,], c(0, 0, 0, 0) , citedf[-(1:10),]) #Insert row for Affil 11
citedf <- rbind(citedf, col_tot, col_perc)
citedf$row_perc[14] <- NA
rownames(citedf)[13] <- "col_tot"
rownames(citedf)[14] <- "col_perc"
xtable(citedf, digits=0)

prop.test(citedf$highcite_sum[1:8], citedf$tot[1:8]) # Proportions test (equivalent to Chi-Square)
prop.trend.test(citedf$highcite_sum[1:6], citedf$tot[1:6]) # Proportions trend test


#>----------------------------------------------------------

##MODELLING----

# # LM
# summary(mod1a <- lm(Tot_Cites ~ Cat_Auth + Affil_Max, data=append_wide_sub))
# # Log-Linear
# summary(mod1b <- lm(log(Tot_Cites+1) ~ Cat_Auth + Affil_Max, data=append_wide_sub))
# mod1b$coefficients*100
# # NB
# summary(mod1c <- glm.nb(Tot_Cites ~ Cat_Auth + Affil_Max, data=append_wide_sub))
# exp(mod1c$coef)
# # Poisson
# summary(mod2 <- glm(Tot_Cites ~ Cat_Auth + Affil_Max, family="poisson", data=append_wide_sub))
# exp(mod2$coef)
# # Compare
# pchisq(2 * (logLik(mod1c) - logLik(mod2)), df = 1, lower.tail = FALSE) # Chi-square high indicating NB better fit.
# # Compare conditional means and variances of Tot_Cites within each level of Affil_Max. Variances exceed means suggesting overdispersion. More evidence that NB better model to fit.
# with(append_wide_sub, tapply(Tot_Cites, Affil_Max, function(x) {sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))}))
# 
# # Logistic
# summary(mod1d <- glm(Cite_Split ~ Cat_Auth + Affil_Max, data=append_wide_sub, family = "binomial"))
# exp(mod1d$coef)

# LM - Interaction
append_wide_sub$Journal <- factor(append_wide_sub$Journal,levels(append_wide_sub$Journal)[c(2,1,4,3)])

summary(mod1e <- lm(Tot_Cites_Norm ~ Affil_Max + Cat_Auth + Affil_Max:Cat_Auth + Year + Journal, data=append_wide_sub))
summary(mod1f <- lm(Tot_Cites_Norm ~ Affil_Max + Cat_Auth +  Year + Journal, data=append_wide_sub))
# Global test for dropping interaction
anova(mod1e, mod1f)

# What is the reference grid set at
ref_grid(mod1e)

# Marginal Means
(mod1e_em <- emmeans(mod1e, ~ Affil_Max + Cat_Auth))
# emmip(mod1e, Affil_Max ~ Cat_Auth)

# Test contrasts
(LF <- contrast(mod1e_em, list(au1af2 = c(-1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              au1af3 = c(-1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              au1af4 = c(-1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                             
                              au2af2 = c(0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              au2af3 = c(0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              au2af4 = c(0, 0, 0, 0, 0, 0, -1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                             
                              au3af2 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              au3af3 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              au3af4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0),
                              
                              au4af2 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 0, 0),
                              au4af3 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 0, 0),
                              au4af4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 1, 0, 0),
                              
                              af1au2 = c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              af1au3 = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              af1au4 = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0),
                              
                              af2au2 = c(0, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              af2au3 = c(0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              af2au4 = c(0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0),
                              
                              af3au2 = c(0, 0, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              af3au3 = c(0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              af3au4 = c(0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),
                              
                              af4au2 = c(0, 0, 0, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                              af4au3 = c(0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0),
                              af4au4 = c(0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0)
                                )))
confint(LF)
test(LF)

# Print Supplementary Table
library(broom)
library(flextable)
library(officer)
modtab <- tidy(mod1e)
modci <- data.frame(confint(mod1e))
modtab2 <- cbind(modtab[, 1:3], modci, modtab[, 4:5])
(ft <- regulartable(modtab2) %>%
    set_formatter_type(fmt_double = "%0.2f"))

doc <- read_docx() %>% 
  body_add_flextable(value = ft)
print(doc, target = "") %>% invisible()


# Global test for dropping interaction
L <- matrix(c(0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
# Anova - should be same as contrast
anova(mod1a, mod1e)

# Test Contrasts
# Auth Num = 1, Max Affil = 2
L <- matrix(c(0,1,0,0,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Auth Num = 1, Max Affil = 3
L <- matrix(c(0,0,1,0,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Auth Num = 1, Max Affil = 4
L <- matrix(c(0,0,0,1,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))


# Auth Num = 2, Max Affil = 2
L <- matrix(c(0,1,0,0,0,0,
              0,0,0,1,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Auth Num = 2, Max Affil = 3
L <- matrix(c(0,0,1,0,0,0,
              0,0,0,0,1,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Auth Num = 2, Max Affil = 4
L <- matrix(c(0,0,0,1,0,0,
              0,0,0,0,0,1,
              0,0,0,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))


# Auth Num = 3, Max Affil = 2
L <- matrix(c(0,1,0,0,0,0,
              0,0,0,0,0,0,
              0,0,1,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Auth Num = 3, Max Affil = 3
L <- matrix(c(0,0,1,0,0,0,
              0,0,0,0,0,0,
              0,0,0,1,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Auth Num = 3, Max Affil = 4
L <- matrix(c(0,0,0,1,0,0,
              0,0,0,0,0,0,
              0,0,0,0,1,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))


# Auth Num = 4, Max Affil = 2
L <- matrix(c(0,1,0,0,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0,
              0,1,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Auth Num = 4, Max Affil = 3
L <- matrix(c(0,0,1,0,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0,
              0,0,1,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Auth Num = 4, Max Affil = 4
L <- matrix(c(0,0,0,1,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0,
              0,0,0,1,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))


# Max Affil = 1, Auth Num = 2
L <- matrix(c(0,0,0,0,0,0,
              1,0,0,0,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Max Affil = 1, Auth Num = 3
L <- matrix(c(0,0,0,0,0,0,
              0,1,0,0,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Max Affil = 1, Auth Num = 4
L <- matrix(c(0,0,0,0,0,0,
              0,0,1,0,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))


# Max Affil = 2, Auth Num = 2
L <- matrix(c(0,0,0,0,0,0,
              1,0,0,1,0,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Max Affil = 2, Auth Num = 3
L <- matrix(c(0,0,0,0,0,0,
              0,1,0,0,0,0,
              0,0,1,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Max Affil = 2, Auth Num = 4
L <- matrix(c(0,0,0,0,0,0,
              0,0,1,0,0,0,
              0,0,0,0,0,0,
              0,1,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))


# Max Affil = 3, Auth Num = 2
L <- matrix(c(0,0,0,0,0,0,
              1,0,0,0,1,0,
              0,0,0,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Max Affil = 3, Auth Num = 3
L <- matrix(c(0,0,0,0,0,0,
              0,1,0,0,0,0,
              0,0,0,1,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Max Affil = 3, Auth Num = 4
L <- matrix(c(0,0,0,0,0,0,
              0,0,1,0,0,0,
              0,0,0,0,0,0,
              0,0,1,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))


# Max Affil = 4, Auth Num = 2
L <- matrix(c(0,0,0,0,0,0,
              1,0,0,0,0,1,
              0,0,0,0,0,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Max Affil = 4, Auth Num = 3
L <- matrix(c(0,0,0,0,0,0,
              0,1,0,0,0,0,
              0,0,0,0,1,0,
              0,0,0,0,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))

# Max Affil = 4, Auth Num = 4
L <- matrix(c(0,0,0,0,0,0,
              0,0,1,0,0,0,
              0,0,0,0,0,0,
              0,0,0,1,0,0),byrow=TRUE,ncol=24)
(summary(glht(mod1e,L),test=Chisqtest()))
(contrast <- confint(glht(mod1e,L),test=Chisqtest()))


# Predictions
nd <- cov_pat6[,1:2]
# cbind predictions and SE's for 3 models
nd <- cbind(nd, Mean_LM = predict(mod1a, newdata = nd, type = "response"), SE_LM = predict(mod1a, newdata = nd, type="response", se.fit = T)$se.fit, Mean_LogLM = predict(mod1b, newdata = nd, type = "response"), SE_LogLM = predict(mod1b, newdata = nd, type="response", se.fit = T)$se.fit, Mean_NB = predict(mod1c, newdata = nd, type = "response"), SE_NB = predict(mod1c, newdata = nd, type="response", se.fit = T)$se.fit)
mean_cites <- aggregate(Tot_Cites ~ Cat_Auth + Affil_Max, data=append_wide_sub, mean)
# Back Transform menas and SE's for loglinear mod
nd2 <- cbind(nd, Mean_LogLM_Back = exp(nd[,5] + 0.5*summary(mod1b)$sigma^2), SE_LogLM_Back = exp(nd[,5])*nd[,6], Mean_Cites = mean_cites[,3])
# DF of means and CI's
nd_ci <- cbind(nd2[,1:2], "Mean_Cites" = nd2[,11], "LM_lower" = nd2[,3]-(1.96*nd2[,4]), "LM_mean" =  nd2[,3], "LM_upper" = nd2[,3]+(1.96*nd2[,4]), "LogLM_lower" = nd2[,9]-(1.96*nd2[,10]), "LogLM_mean" =  nd2[,9], "LogLM_upper" = nd2[,9]+(1.96*nd2[,10]), "NB_lower" = nd2[,7]-(1.96*nd2[,8]), "NB_mean" =  nd2[,7], "NB_upper" = nd2[,7]+(1.96*nd2[,8]))
# Convert to long for plotting
nd_ci_long <- data.frame(cbind("Cat_Auth" = rep(nd_ci$Cat_Auth,4), "Affil_Max" = rep(nd_ci$Affil_Max,4), "Model" = rep(c(1:4), each=24), "Mean" = c(nd_ci$Mean_Cites, nd_ci$LM_mean, nd_ci$LogLM_mean, nd_ci$NB_mean), "Lower" = c(rep(NA, 24), nd_ci$LM_lower, nd_ci$LogLM_lower, nd_ci$NB_lower), "Upper" = c(rep(NA, 24), nd_ci$LM_upper, nd_ci$LogLM_upper, nd_ci$NB_upper)))


# Format Model Output
# LM
cia <- as.data.frame(confint(mod1a))
mod1a_out <- cbind(mod1a$coefficients, coef(summary(mod1a))[,2], cia[,1], cia[,2], coef(summary(mod1a))[,3],  coef(summary(mod1a))[,4])
mod1a_out <- data.frame(mod1a_out)
colnames(mod1a_out) <- c("beta", "S.E.", "2.5%", "97.5%","t", "p-value")
xtable(mod1a_out, digits=3)
# Log-LM
cib <- as.data.frame(confint(mod1b))
mod1b_out <- cbind(mod1b$coefficients, coef(summary(mod1b))[,2], cib[,1], cib[,2], coef(summary(mod1b))[,3],  coef(summary(mod1b))[,4])
mod1b_out <- data.frame(mod1b_out)
colnames(mod1b_out) <- c("beta", "S.E.", "2.5%", "97.5%","t", "p-value")
xtable(mod1b_out, digits=3)
# NB
cic <- as.data.frame(exp(confint(mod1c)))
mod1c_out <- cbind(mod1c$coefficients, exp(mod1c$coefficients), coef(summary(mod1c))[,2], cic[,1], cic[,2], coef(summary(mod1c))[,3],  coef(summary(mod1c))[,4])
mod1c_out <- data.frame(mod1c_out)
colnames(mod1c_out) <- c("beta", "Rate", "S.E.", "2.5%", "97.5%","Z", "p-value")
xtable(mod1c_out, digits=3)
# Check fit - p value interpretation (should be > 0.05 for good fit)
1 - pchisq(mod1c$deviance, mod1c$df.residual)
# Check fit - resid dev interpretation (should not be above this value for a good fit)
qchisq(0.95, df.residual(mod1c))
deviance(mod1c)


#>----------------------------------------------------------

##PLOTS----

# HISTOGRAMS
# Hist Citations
hist_1 <- ggplot(append_wide, aes(Tot_Cites)) + 
  geom_histogram(binwidth = 1) + 
  scale_x_continuous("Citations/Paper",breaks=seq(0,500,50), limits=c(0, 500)) +
  labs(y = "Frequency")
# Hist Authors
hist_2 <- ggplot(append_wide, aes(Num_Auth)) + 
  geom_histogram(binwidth = 1) + 
  scale_x_continuous("Authors/Paper",breaks=seq(0,50,5), limits=c(0, 50)) +
  labs(y = "Frequency")
# Hist Affiliations
hist_3 <- ggplot(append_wide, aes(Num_Affil)) + 
  geom_histogram(binwidth = 1) + 
  scale_x_continuous("Affiliations/Paper",breaks=seq(0,50,5), limits=c(0, 50)) +
  labs(y = "Frequency")

plot_grid(hist_1, hist_2, hist_3, align='h', ncol = 1, labels=c('A', 'B', 'C'))


med.n <- function(x){
  return(c(y = median(x)+2, label = round(median(x),0))) 
  # experiment with the multiplier to find the perfect position
}
mean.n <- function(x){
  return(c(y = -2, label = round(mean(x),0))) 
  # experiment with the multiplier to find the perfect position
}
# BOXPLOTS
ggplot(append_wide_sub, aes(Affil_Max, Tot_Cites_Norm)) +
geom_boxplot(outlier.size = 0.2) +
coord_cartesian(ylim=c(0, 100)) +
facet_grid(~Cat_Auth) + theme(plot.subtitle = element_text(vjust = 1), 
    plot.caption = element_text(vjust = 1), 
    axis.line = element_line(colour = "gray50", 
        size = 0.5, linetype = "solid"), 
    panel.background = element_rect(fill = NA)) + labs(x = "Maximum Affiliation", y = "Normalised Citation Count") +
  stat_summary(fun.data = mean.n, geom = "text") + 
  stat_summary(fun.data = med.n, geom = "text")


# CORRELATIONS
cor_df <- append_wide_sub[, c(4,6,7,14,8:13)]
cor_df$Affil_Max <- as.numeric(cor_df$Affil_Max)
cor_df$Cat_Auth <- as.numeric(cor_df$Cat_Auth)
names(cor_df)=c("Citations", "Affiliations","Authors","Max. Affiliation","Affiliation = 1","Affiliation = 2","Affiliation = 3","Affiliation = 4","Affiliation = 5","Affiliation = 6")
cor_mat <- cor(cor_df, use="complete.obs")

source("")
rquery.cormat(cor_df)


# BALLOON PLOT
# Creating covariate pattern from ungrouped data
test <- append_wide[,c(4,19,20)]
temp <- aggregate(Tot_Cites ~ Cat_Auth + Affil_Max, data = test, sum)
# Create zero values for the missing covariate patterns
temp <- rbind(temp, c(1,8,0))
temp <- rbind(temp, c(2,8,0))
temp <- rbind(temp, c(1,9,0))
temp <- rbind(temp, c(2,9,0))
temp <- rbind(temp, c(1,11,0))
temp <- rbind(temp, c(2,11,0))
temp <- rbind(temp, c(3,11,0))

# Only first 6 affiliations.
cov_pat6 <- temp[1:24,]
# Add offset - number of papers per category
cov_pat6$offs <- c(3142, 2715, 2898, 1387, 1371, 2207, 3845, 3374, 454, 811, 1509, 1859, 103, 210, 419, 695, 24, 61, 119, 250, 4, 9, 35, 64)

# P 110 of R Graphics Cookbook - Models summed citations - from individual paper data. 
ggplot(cov_pat6, aes(x=Affil_Max, y=Cat_Auth, size=Tot_Cites)) +
  geom_point(shape=21, colour="black", fill="cornsilk") +
  scale_size_area(max_size=40, guide=F) +
  labs(x = "Maximum Affiliation", y = "Author Number") +
  geom_text(aes(label=Tot_Cites), vjust=7, colour="grey50", size=4)

# # Expected? summed citations if number of papers in each category was the same (i.e. original data -> fewer papers with higher Max_Affil)
# a <- table(append_wide_sub$Cat_Auth,append_wide_sub$Affil_Max)
# b <- as.data.frame.matrix(a)
# c <- max(b)/b
# d <- c(c[,1],c[,2],c[,3],c[,4],c[,5],c[,6])
# e <- cbind(cov_pat6, "adj" = d)
# e$Tot_Cites2 <- e$Tot_Cites*e$adj
# ggplot(e, aes(x=Affil_Max, y=Cat_Auth, size=Tot_Cites2)) +
#   geom_point(shape=21, colour="black", fill="cornsilk") +
#   scale_size_area(max_size=40, guide=F) +
#   labs(x = "Maximum Affiliation", y = "Author Category") +
#   geom_text(aes(label=Tot_Cites2), vjust=7, colour="grey50", size=4)


# LINE PLOTS
# With CI's
nd_ci_long %>%
  mutate(Model = as.factor(Model)) %>%
  ggplot(aes(Affil_Max, Mean)) + 
  geom_line(aes(color = Model, group = Model, linetype = Model), size = 0.6) +
  scale_linetype_manual(values=c(2,1,1,1)) +
  geom_ribbon(alpha = .3, aes(ymin = Lower, ymax = Upper, 
                              group = Model, fill = Model)) +
  scale_color_manual(values=c("black", "dodgerblue", "red3", "springgreen3")) +
  scale_fill_manual(values=c("black", "dodgerblue", "red3", "springgreen3")) +
  facet_grid(. ~ Cat_Auth)

# Without CI's
# Create gridline cuts
minorsy <- seq(50,200,by=10)
minorsx <- seq(1,6,by=1)

nd_ci_long %>%
  mutate(Model = as.factor(Model)) %>%
  ggplot(aes(Affil_Max, Mean)) + 
  geom_hline(mapping=NULL, yintercept=minorsy,colour='grey95') +
  geom_vline(mapping=NULL, xintercept=minorsx,colour='grey95') +
  geom_line(aes(color = Model, group = Model, linetype = Model), size = 0.6) +
  scale_linetype_manual(values=c(2,1,1,1)) +
  scale_color_manual(values=c("black", "dodgerblue", "red3", "springgreen3"), labels=c("Data", "Linear Model", "LogLinear Model", "Negative Binomial Model")) +
  facet_grid(. ~ Cat_Auth) +
  scale_x_continuous("Maximum Affiliation", breaks=seq(1,6,1)) +
  scale_y_continuous("Citations", breaks=seq(0,200,10), limits=c(50, 200)) +
  theme(legend.position="top", legend.title=element_blank()) +
  guides(color = guide_legend(override.aes = list(linetype = c(2,1,1,1))), linetype = FALSE)