#R code for antibody validation analysis

#Load required libraries ----
library(readxl)
library(foreign)
library(tidyverse)
library(tidyr)
library(dplyr)
library(broom)
library(irr)
library(psych)

#load data sets ----
antibodies <-read_excel(".../Supplemental Data S1.xlsx") #load here the antibody data set (supplemental data S1) 
reliability <-read_excel(".../Supplemental Data S2.xlsx") #load here the interrater reliability data set (supplemental data S2)

#In the data sets the value 1 means yes, 0 means no, 9 means not applicable, 999 means not clear

# Data transformation ----
#interrater reliability data
reliability$J_how_many_validated[reliability$J_how_many_validated=="na"]<-"0" #code answer not applicable as 0 for this variable
reliability$W_how_many_validated[reliability$W_how_many_validated=="na"]<-"0" #code answer not applicable as 0 for this variable
reliability[sapply(reliability, is.character)]<-lapply(reliability[sapply(reliability, is.character)],as.factor)
reliability[sapply(reliability, function(x) as.character(x) %in% c("na","?"))] <- NA #the answers 'not applicable' and 'not clear' are coded as missing values for all other variables
reliability$J_how_many_used <-as.numeric(as.character(reliability$J_how_many_used))
reliability$W_how_many_used <-as.numeric(as.character(reliability$W_how_many_used))
reliability$J_how_many_validated <-as.numeric(as.character(reliability$J_how_many_validated))
reliability$W_how_many_validated <-as.numeric(as.character(reliability$W_how_many_validated))

#antibody data
antibodies$How_many_abs_validated<-as.numeric(as.character(antibodies$How_many_abs_validated))
antibodies$How_many_abs_validated[antibodies$How_many_abs_validated==999]<-NA #answer 'not clear = 999' is classiefied as missing

antibodies$How_many_different_abs_used<-as.numeric(as.character(antibodies$How_many_different_abs_used))
antibodies$How_many_different_abs_used[antibodies$How_many_different_abs_used==999]<-NA #answer 'not clear = 999' is classiefied as missing

antibodies$percentage_abs_validated<-as.numeric(as.character(antibodies$percentage_abs_validated))
antibodies$percentage_abs_validated[antibodies$percentage_abs_validated==999]<-NA #answer 'not clear = 999' is classiefied as missing

#The following comands are used to create separate datasets for variables that need to be analysed in same way later on  
results<-gather(antibodies, key = "type", value = "validated",Basic_info_all_primary_abs_complete_na_no, Validation_info_present, All_primary_ab_validated_na_no) 
results2<-gather(antibodies, key = "type", value = "validated",Secondary_without_primary, Spatial_localization, Preadsorption, MW_WB, Reference_validation_supplier, Reference_validation_literature, Basic_info_all_primary_abs_complete_na_no, Validation_info_present, All_primary_ab_validated_na_no, Validation_by_researcher)
results3<-gather(antibodies, key = "type", value = "validated",Secondary_without_primary, Spatial_localization, Preadsorption, Reference_validation_literature, Basic_info_all_primary_abs_complete_na_no, Validation_info_present, All_primary_ab_validated_na_no)
results4<-gather(antibodies, key = "type", value = "validated",Secondary_without_primary, Spatial_localization, Preadsorption, MW_WB, Reference_validation_literature, Basic_info_all_primary_abs_complete_na_no, Validation_info_present, All_primary_ab_validated_na_no)
results5<-gather(antibodies, key = "type", value = "validated",Secondary_without_primary, Spatial_localization, MW_WB, Reference_validation_supplier, Reference_validation_literature, Basic_info_all_primary_abs_complete_na_no, Validation_info_present, All_primary_ab_validated_na_no)
results6<-gather(antibodies, key = "type", value = "validated",Secondary_without_primary, Spatial_localization, Preadsorption, MW_WB, Reference_validation_supplier, Reference_validation_literature, Basic_info_all_primary_abs_complete_na_no, Validation_info_present, All_primary_ab_validated_na_no)

#Missing values: 9=na, 999=unclear, both will be defined as missing values here
results$validated[results$validated==9 |results$validated==999]<-NA
results2$validated[results2$validated==9 |results2$validated==999]<-NA
results3$validated[results3$validated==9 |results3$validated==999]<-NA
results4$validated[results4$validated==9 |results4$validated==999]<-NA
results5$validated[results5$validated==9 |results5$validated==999]<-NA
results6$validated[results6$validated==9 |results6$validated==999]<-NA

#interrater reliability ----
#calculate cohen's kappa and the percentage agreement between assessments of two raters:
cohen.kappa(x=cbind(reliability$J_val_info_present,reliability$W_val_info_present),alpha=.05); agree(cbind(reliability$J_val_info_present,reliability$W_val_info_present)) #for any validation information present of at least one antibody
cohen.kappa(x=cbind(reliability$J_basic_info_all_primary_complete,reliability$W_basic_info_all_primary_complete), alpha =.05); agree(cbind(reliability$J_basic_info_all_primary_complete,reliability$W_basic_info_all_primary_complete)) #for basic information all primary antibodies complete
cohen.kappa(x=cbind(reliability$J_all_primary_validated_na_no,reliability$W_all_primary_validated_na_no), alpha =.05); agree(cbind(reliability$J_all_primary_validated_na_no,reliability$W_all_primary_validated_na_no)) #for all primary antibodies validated
cohen.kappa(x=cbind(reliability$J_ref_supplier,reliability$W_ref_supplier), alpha =.05); agree(cbind(reliability$J_ref_supplier,reliability$W_ref_supplier)) #for validation by reference to antibody supplier
cohen.kappa(x=cbind(reliability$J_ref_lit,reliability$W_ref_lit), alpha =.05);agree(cbind(reliability$J_ref_lit,reliability$W_ref_lit)) #for validation by reference to the literature
cohen.kappa(x=cbind(reliability$J_ref_database,reliability$W_ref_database), alpha =.05);agree(cbind(reliability$J_ref_database,reliability$W_ref_database)) #for validation by reference to database
cohen.kappa(x=cbind(reliability$J_Val_by_researcher,reliability$W_Val_by_researcher), alpha =.05);agree(cbind(reliability$J_Val_by_researcher,reliability$W_Val_by_researcher)) #for validation carried out by authors of the article
cohen.kappa(x=cbind(reliability$J_MW_WB,reliability$W_MW_WB), alpha =.05);agree(cbind(reliability$J_MW_WB,reliability$W_MW_WB)) #for validation method: molecular weight similar to target in WB
cohen.kappa(x=cbind(reliability$J_spatial_loc,reliability$W_spatial_loc), alpha =.05);agree(cbind(reliability$J_spatial_loc,reliability$W_spatial_loc)) #for validation method: spatial localization
cohen.kappa(x=cbind(reliability$J_preadsorption,reliability$W_preadsorption), alpha =.05);agree(cbind(reliability$J_preadsorption,reliability$W_preadsorption)) #for validation method: preadsorption
cohen.kappa(x=cbind(reliability$J_sec_without_prim,reliability$W_sec_without_prim), alpha =.05);agree(cbind(reliability$J_sec_without_prim,reliability$W_sec_without_prim)) #for validation method: secondary antibody without primary 
cohen.kappa(x=cbind(reliability$J_five_pillar,reliability$W_five_pillar), alpha =.05);agree(cbind(reliability$J_five_pillar,reliability$W_five_pillar)) #for validation method: five pillars
cohen.kappa(x=cbind(reliability$J_positive_control,reliability$W_positive_control), alpha =.05);agree(cbind(reliability$J_positive_control,reliability$W_positive_control)) #for validation method: positive control
cohen.kappa(x=cbind(reliability$J_negative_control,reliability$W_negative_control), alpha =.05);agree(cbind(reliability$J_negative_control,reliability$W_negative_control)) #for validation method: negative control
cohen.kappa(x=cbind(reliability$J_other_total,reliability$W_other_total), alpha =.05);agree(cbind(reliability$J_other_total,reliability$W_other_total)) #for all other validation methods

#intra-class correlation
icc(cbind(reliability$J_how_many_used,reliability$W_how_many_used),model="twoway", unit="single")
icc(cbind(reliability$J_how_many_validated,reliability$W_how_many_validated),model="twoway", unit="single")
(J_percentage_validated<-reliability$J_how_many_validated/reliability$J_how_many_used *100) #calculate the percentage of validated antibodies for each evaluated article
(W_percentage_validated<-reliability$W_how_many_validated/reliability$W_how_many_used *100) #calculate the percentage of validated antibodies for each evaluated article
icc(cbind(J_percentage_validated,W_percentage_validated),model="twoway", unit="single")


#analysis antibody data ----
#tables ----

#percentage of validated antibodies per article (data of figure 2): 
antibodies %>% 
  group_by(Journal, after_guidelines)%>% #compare samples of journals before and after the introduction of guidelines 
  summarise_at(vars(percentage_abs_validated), list(~mean(., na.rm=TRUE)))

percentage_validated_table<-data.frame(antibodies$Journal, antibodies$after_guidelines, antibodies$percentage_abs_validated, antibodies$How_many_abs_validated, antibodies$How_many_different_abs_used) #(data for supplemental table 1)

#proportion and count tables for each category:
#JCN
JCNB <-filter(results2, Journal == "JCN" & after_guidelines == 0) #filter results for journal JCN before introduction of guidelines
JCNA <-filter(results2, Journal == "JCN" & after_guidelines == 1) #filter results for journal JCN after introduction of guidelines

prop.table(table(JCNB$type,JCNB$validated),1)#proportion table of JCN before introduction of guidelines
prop.table(table(JCNA$type,JCNA$validated),1)#proportion table of JCN after introduction of guidelines
table(JCNB$type,JCNB$validated) # count table JCN before
table(JCNA$type,JCNA$validated) # count table JCN after

#Nature
NatureB <- filter(results2, Journal == "Nature" & after_guidelines==0) #filter results for journal Nature before introduction of guidelines
NatureA <-filter(results2, Journal == "Nature" & after_guidelines==1) #filter results for journal Nature after introduction of guidelines

prop.table(table(NatureB$type,NatureB$validated),1)#proportion table of Nature before introduction of guidelines
prop.table(table(NatureA$type,NatureA$validated),1)#proportion table of Nature after introduction of guidelines
table(NatureB$type,NatureB$validated) # count table Nature before
table(NatureA$type,NatureA$validated) # count table Nature after

#Total with guidelines
withB<-filter(results2, with_guidelines == 1 & after_guidelines==0) #filter results for journals with guidelines before introduction of guidelines
withA<-filter(results2, with_guidelines == 1 & after_guidelines==1) #filter results for journals with guidelines after introduction of guidelines

prop.table(table(withB$type,withB$validated),1)#proportion table of journals with guidelines before introduction of guidelines
prop.table(table(withA$type,withA$validated),1)#proportion table of journals with guidelines after introduction of guidelines
table(withB$type,withB$validated) # count table journals with guidelines before
table(withA$type,withA$validated) # count table journals with guidelines after

#Neuroscience
NeuroB <-filter(results2, Journal == "Neuroscience" & after_guidelines==0) #filter results for journal Neuroscience before introduction of guidelines
NeuroA <-filter(results2, Journal == "Neuroscience" & after_guidelines==1) #filter results for journal Neuroscience after introduction of guidelines

prop.table(table(NeuroB$type,NeuroB$validated),1)#proportion table of Neuroscience before introduction of guidelines
prop.table(table(NeuroA$type,NeuroA$validated),1)#proportion table of Neuroscience after introduction of guidelines
table(NeuroB$type,NeuroB$validated) # count table Neuroscience before
table(NeuroA$type,NeuroA$validated) # count table Neuroscience after

#Science
ScienceB <-filter(results2, Journal == "Science" & after_guidelines==0) #filter results for journal Science before introduction of guidelines
ScienceA <-filter(results2, Journal == "Science" & after_guidelines==1) #filter results for journal Science after introduction of guidelines

prop.table(table(ScienceB$type,ScienceB$validated),1)#proportion table of Science before introduction of guidelines
prop.table(table(ScienceA$type,ScienceA$validated),1)#proportion table of Science after introduction of guidelines
table(ScienceB$type,ScienceB$validated) # count table Science before
table(ScienceA$type,ScienceA$validated) # count table Science after

#Total without guidelines
withoutB<-filter(results2, with_guidelines == 0 & after_guidelines==0) #filter results for journals without guidelines before introduction of guidelines
withoutA<-filter(results2, with_guidelines == 0 & after_guidelines==1) #filter results for journals without guidelines after introduction of guidelines

prop.table(table(withoutB$type,withoutB$validated),1)#proportion table of journals without guidelines before introduction of guidelines
prop.table(table(withoutA$type,withoutA$validated),1)#proportion table of journals without guidelines after introduction of guidelines
table(withoutB$type,withoutB$validated) # count table journals without guidelines before
table(withoutA$type,withoutA$validated) # count table journals without guidelines after

#total, all articles 
before <- filter(results2, after_guidelines ==0) #filter results before guidelines
after <- filter(results2, after_guidelines ==1) #filter results after guidelines

prop.table(table(before$type,before$validated),1)#proportion table of all journals before introduction of guidelines
prop.table(table(after$type,after$validated),1)#proportion table of all journals after introduction of guidelines
table(before$type,before$validated) # count table all journals before
table(after$type,after$validated) # count table all journals after

#fisher exact test for difference in validation before and after guideline introduction ----

(JWithG_BA<- filter(results5, with_guidelines ==1)%>% #journals with guidelines
   group_by(with_guidelines,type) %>%
   do(tidy(fisher.test(.$after_guidelines, .$validated, alternative ="greater", conf.int=TRUE, conf.level=0.95))))

(JCNNat_BA<- filter(results, with_guidelines ==1) %>% #JCN and Nature
    group_by(Journal,type) %>%
    do(tidy(fisher.test(.$after_guidelines, .$validated, alternative ="greater", conf.int=TRUE, conf.level=0.95))))

(JWHG_BA<- filter(results3, with_guidelines ==0) %>% #journals without guidelines -> two sided test
    group_by(with_guidelines,type) %>%
    do(tidy(fisher.test(.$after_guidelines, .$validated, alternative ="two.sided", conf.int=TRUE, conf.level=0.95))))

(SciNeu_BA<- filter(results, with_guidelines ==0) %>% #Science and Neuroscience -> two sided test
    group_by(Journal,type) %>%
    do(tidy(fisher.test(.$after_guidelines, .$validated, alternative ="two.sided", conf.int=TRUE, conf.level=0.95))))

(all_BA<- results %>% #all journals -> two sided test
    group_by(type) %>%
    do(tidy(fisher.test(.$after_guidelines, .$validated, alternative ="two.sided", conf.int=TRUE, conf.level=0.95))))

#fisher exact test for difference in validation between similar journals with and without guidelines ----

(WWB<- filter(results4, after_guidelines ==0) %>% #journals with guidelines versus journals without guidelines, before
   group_by(type) %>%
   do(tidy(fisher.test(.$with_guidelines, .$validated, alternative ="two.sided", conf.int=TRUE, conf.level=0.95))))

(WWA<- filter(results6, after_guidelines ==1) %>% #journals with guidelines versus journals without guidelines, after
    group_by(type) %>%
    do(tidy(fisher.test(.$with_guidelines, .$validated, alternative ="greater", conf.int=TRUE, conf.level=0.95))))

(JCNNeu_WWB<- filter(results, (Journal == "JCN" | Journal=="Neuroscience") & after_guidelines == 0) %>% #JCN versus Neuroscience, before
    group_by(type) %>%
    do(tidy(fisher.test(.$with_guidelines, .$validated, alternative ="two.sided", conf.int=TRUE, conf.level=0.95))))

(JCNNeu_WWA<- filter(results, (Journal == "JCN" | Journal=="Neuroscience") & after_guidelines == 1) %>% #JCN versus Neuroscience, after
    group_by(type) %>%
    do(tidy(fisher.test(.$with_guidelines, .$validated, alternative ="greater", conf.int=TRUE, conf.level=0.95))))

(NatSci_WWB<- filter(results, (Journal == "Nature"|Journal=="Science")&after_guidelines==0) %>% #Nature versus Science, before
    group_by(type) %>%
    do(tidy(fisher.test(.$with_guidelines, .$validated, alternative ="two.sided", conf.int=TRUE, conf.level=0.95))))

(NatSci_WWA<- filter(results, (Journal == "Nature"|Journal=="Science")&after_guidelines==1) %>% #Nature versus Science, after
    group_by(type) %>%
    do(tidy(fisher.test(.$with_guidelines, .$validated, alternative ="greater", conf.int=TRUE, conf.level=0.95))))

#holm-bonferroni correction----

(total<- rbind(JCNNat_BA,JWithG_BA,SciNeu_BA,JWHG_BA,all_BA, WWB, WWA, JCNNeu_WWB, JCNNeu_WWA, NatSci_WWB, NatSci_WWA )) #collect all p values 

padjusted<-p.adjust(total$p.value, method="holm", n=length(total$p.value))
(pvalues<-data.frame(total$with_guidelines, total$Journal, total$type, total$alternative, total$p.value,padjusted, total$estimate, total$conf.low, total$conf.high)) #dataframe with all statistical tests, odds ratio's, p-values and adjusted p-values