#####################################################################################################
### Evolutionary patterns of range size, abundance and species richness in Amazonian angiosperm trees
### Kyle Dexter
### University of Edinburgh
### kgdexter@gmail.com
### July 7, 2016
#####################################################################################################


###Required packages
library(ape)
library(geiger)
library(phytools)


###Datasets to import

##Data set on range size of Amazonian plants from Feeley and Silman. 2009. PNAS
#download from http://www.pnas.org/content/106/30/12382.abstract?tab=ds
FS_data_raw <- FS_data <- read.table("ST2.txt",header=T,stringsAsFactors=F)

#clean up a few names that had special characters
FS_data[FS_data$SPECIES.NAME=="Mimosa_chaco+\xbdnsis",1] <- "Mimosa_chacoensis"
FS_data[FS_data$SPECIES.NAME=="Mimosa_morro+\xbdnsis",1] <- "Mimosa_morroensis"
FS_data[FS_data$SPECIES.NAME=="Raveniopsis_araca+\xbdnsis",1] <- "Raveniopsis_aracaensis"

#add a genus column and log-transform range size
FS_data$Genus <- vector("character",nrow(FS_data))
for (i in 1:nrow(FS_data)){
	FS_data$Genus[i] <- unlist(strsplit(FS_data$SPECIES.NAME[i],split="_"))[1]
	print(i)
}
FS_data$Genus <- as.factor(FS_data$Genus)
FS_data$log10rangesize <- log10(FS_data$RANGE.AREA.KM2)

#Calculate genus means for range size
RangeSize <- tapply(FS_data$log10rangesize,list(Genus=FS_data$Genus),FUN=mean)

#Calculate species richness for each genus and log-transform
SppRichness <- log10(summary(FS_data$Genus,maxsum=50000))


##Data set on abundance of Amazonian trees from ter Steege et al. 2013. Science
#download from http://science.sciencemag.org/content/suppl/2013/10/16/342.6156.1243092.DC1
#save worksheet 
TS_data_raw <- TS_data <- read.csv("terSteege_Appendix.csv",stringsAsFactors=F)
TS_data$Genus <- vector("character",nrow(TS_data))
for (i in 1:nrow(TS_data)){
	TS_data$Genus[i] <- unlist(strsplit(TS_data$Accepted_species[i],split="_"))[1]
	print(i)
}
TS_data$Genus <- as.factor(TS_data$Genus)
TS_data$log10abundance <- log10(TS_data$est.ind)

#Calculate genus means for abundance
Abundance <- tapply(TS_data$log10abundance,list(Genus=TS_data$Genus),FUN=mean)


##List of Neotropical tree genera produced by Rick Condit and colleagues at STRI based on plots throughout Neotropics
#download from http://ctfs.si.edu/webatlas/TreeDatabaseUtilities/GenusList.php and save as tab-delimited text file
#first delete one line of data that has question marks (an extra species of Eschweilera?)
STRI_data_raw <- STRI_data <- read.table("STRI_genera.txt",header=T,stringsAsFactors=F)

#fix a few name mismatches in this list
STRI_data$Genus[which(STRI_data$Genus=="Diploön")] <- "Diploon"
STRI_data$Genus[which(STRI_data$Genus=="Monnieria")] <- "Monniera"
STRI_data$Genus[which(STRI_data$Genus=="Chondodendron")] <- "Chondrodendron"


##In order to extract a list of Amazonian tree genera, I intersected the list of genera from the STRI website (for all Neotropical trees) with that from Feeley and Silman (for all Amazonian plants)
Amazon_tree_genera <- rownames(Genus_data)[which(rownames(Genus_data)%in%STRI_data$Genus)]

#I then united this with the list from ter Steege et al. which has better coverage for trees in the Amazon than the STRI dataset and may have additional genera
Amazon_tree_genera <- unique(c(Amazon_tree_genera,names(Abundance)))
#This is the list we used to search Genbank to try and include as many genera as possible in the phylogeny
write.table(Amazon_tree_genera,"Amazon_tree_genera.txt",row.names=F,quote=F)


##The phylogeny was generated entirely outside of the R Statistical software following methods detailed in the main text of the paper
#Import the phylogeny
phylog <- read.tree("Amazon_genus_phylog.tre")
#drop outgroups
phylog <- drop.tip(phylog,c("Amborella","Nymphaea"))


###Fitting different evolutionary models for each of the genus-level attributes compiled above
#start with species richness, make data match up first
phylog_FS <- drop.tip(phylog,which(!phylog$tip.label%in%names(SppRichness)))
SppRichness_sub <- SppRichness[match(phylog_FS$tip.label,names(SppRichness))]

(SppRichness_BM <- fitContinuous(phylog_FS,SppRichness_sub,model="BM")) # delta AIC, -287.807
(SppRichness_lambda <- fitContinuous(phylog_FS,SppRichness_sub,model="lambda")) # lambda, 0.261226
(SppRichness_white <- fitContinuous(phylog_FS,SppRichness_sub,model="white")) # delta AIC, -17.09426

#then range size
RangeSize_sub <- RangeSize[match(phylog_FS$tip.label,names(RangeSize))]

(RangeSize_BM <- fitContinuous(phylog_FS,RangeSize_sub,model="BM")) # delta AIC, -257.9293
(RangeSize_lambda <- fitContinuous(phylog_FS,RangeSize_sub,model="lambda")) # lambda, 0.369979
(RangeSize_white <- fitContinuous(phylog_FS,RangeSize_sub,model="white")) # delta AIC, -52.04149

#then abundance
phylog_TS <- drop.tip(phylog,which(!phylog$tip.label%in%names(Abundance)))
Abundance_sub <- Abundance[match(phylog_TS$tip.label,names(Abundance))]

(Abundance_BM <- fitContinuous(phylog_TS,Abundance_sub,model="BM")) # delta AIC, -287.5399
(Abundance_lambda <- fitContinuous(phylog_TS,Abundance_sub,model="lambda")) # lambda, 0.322649
(Abundance_white <- fitContinuous(phylog_TS,Abundance_sub,model="white")) # delta AIC, -34.24044


###Assessing relationships among genus-level attributes
##First, assemble dataset with data for all three genus-level attributes
Genus_data_tmp <- as.data.frame(cbind(SppRichness,RangeSize))
Genus_data_tmp$Genus <- rownames(Genus_data_tmp)
Abundance_df <- as.data.frame(Abundance)
Abundance_df$Genus <- names(Abundance)
Genus_data_all <- merge(Genus_data_tmp,Abundance_df,by="Genus",all=T)
Genus_data_all <- Genus_data_all[which(Genus_data_all$Genus%in%Amazon_tree_genera),]
Genus_data_complete  <- merge(Genus_data_tmp,Abundance_df,by="Genus")

#then cut data down to that which we have for both
phylog_sub <- drop.tip(phylog,which(!phylog$tip.label%in%Genus_data_complete$Genus))
Genus_data_sub <- Genus_data_complete[match(phylog_sub$tip.label,Genus_data_complete$Genus),]

#calculate the phylogenetically independent contrasts for each genus-level characteristic
pic_richness <- pic(Genus_data_sub[,2],phylog_sub)
pic_range <- pic(Genus_data_sub[,3],phylog_sub)
pic_abundance <- pic(Genus_data_sub[,4],phylog_sub)

#assess relationships between phylogenetically independent contrasts, forcing the relationship through the origin 
pic_richnessXrange <- lm(pic_richness~pic_range-1) 
pic_richnessXabundance <- lm(pic_richness~pic_abundance-1)
pic_abundanceXrange <- lm(pic_abundance~pic_range-1)
summary(pic_richnessXrange) # r = 0.2840775
summary(pic_richnessXabundance) # r = 0.2377814
summary(pic_abundanceXrange) # r = 0.4327817

#compare this to standard Pearson correlation coefficient
cor(Genus_data_sub$SppRichness,Genus_data_sub$RangeSize) # r = -0.396348

cor(Genus_data_sub$SppRichness,Genus_data_sub$Abundance) # r = -0.3725245

cor(Genus_data_sub$Abundance,Genus_data_sub$RangeSize) # r = 0.4429052


###Performing test of whether individual nodes show greater or smaller values than expected by chance for reconstructed values of genus-level attributes
#first write function to perform the statistical test
test_nodes <- function(tree,trait,nsims){
	tree2use <- drop.tip(tree,which(!tree$tip.label%in%names(trait)))
	trait2use <- trait[match(tree2use$tip.label,names(trait))]
	true_ace <- fastAnc(tree2use,trait2use)
	sim_results <- matrix(NA,length(true_ace),nsims)
	for (i in 1:nsims){
		randomtrait <- sample(trait2use,length(trait2use),replace=F)
		names(randomtrait) <- names(trait2use)
		sim_results[,i] <- fastAnc(tree2use,randomtrait)
		print(paste("Randomisation",i))
	}
	p_value <- vector("numeric",length(true_ace))
	for (j in 1:length(true_ace)){
		p_value[j] <- length(which(true_ace[j] > sim_results[j,]))/(nsims+1)
	}
	out <- cbind(as.numeric(names(true_ace)),true_ace,p_value)
	colnames(out)[1] <- "Node"
	out <- as.data.frame(out)
	return(out)
}

#running function for each of the genus-level attributes
SppRichness_testresults <- test_nodes(phylog_FS,SppRichness_sub,999)
RangeSize_testresults <- test_nodes(phylog_FS,RangeSize_sub,999)
Abundance_testresults <- test_nodes(phylog,Abundance_sub,999)


##now match these results up with clade names
#first import taxonomy table
taxonomy <- read.csv("taxonomy_table.csv")

#reorganise this information
taxonomy2 <- matrix(NA,1,3)
colnames(taxonomy2) <- c("Genus","Clade","Category")
for (i in 2:ncol(taxonomy)){
	tmp <- na.omit(taxonomy[,c(1,i)])
	colnames(tmp) <- c("Genus","Clade")
	tmp_clades <- summary(as.factor(tmp$Clade),maxsum=1000)
	tmp_clades <- tmp_clades[which(tmp_clades>1)]
	tmp <- tmp[which(tmp$Clade%in%names(tmp_clades)),]
	tmp$Category <- colnames(taxonomy)[i]
	taxonomy2 <- rbind(taxonomy2,tmp)
}
taxonomy2 <- taxonomy2[-1,]


##match up with phylogeny reduced to genera in Feeley and Silman dataset first
#first get reduced taxonomy table for genera in Feeley and Silman dataset
taxonomy2_FS <- taxonomy2[which(taxonomy2$Genus%in%phylog_FS$tip.label),]
tmp <- summary(as.factor(taxonomy2_FS$Clade),maxsum=1000)
tmp1 <- names(tmp)[which(tmp==1)]
taxonomy2_FS <- taxonomy2_FS[which(!taxonomy2_FS$Clade%in%tmp1),]
#obtain the clades in that dataset
clades_FS <- unique(taxonomy2_FS$Clade)

#create a matrix for storing node numbers to match with output from test_nodes
phylog_FS_clades <- matrix(NA,length(clades_FS),3)
phylog_FS_clades[,1] <- clades_FS
colnames(phylog_FS_clades) <- c("Clade","Category","Node")
for (i in 1:length(clades_FS)){
	phylog_FS_clades[i,2] <- unique(taxonomy2_FS$Category[which(taxonomy2_FS$Clade==clades_FS[i])])
	tmp <- taxonomy2_FS$Genus[which(taxonomy2_FS$Clade==clades_FS[i])]
	phylog_FS_clades[i,3] <- getMRCA(phylog_FS,tmp)
	print(i)
}
phylog_FS_clades <- as.data.frame(phylog_FS_clades)
#deal with the fact that some clades in the phylogeny have two names (e.g. Arecaceae and Arecales are the same here), choose lower-level name
tmp <- summary(phylog_FS_clades$Node,maxsum=1000)
tmp1 <- names(tmp)[which(tmp>1)]
phylog_FS_clades <- phylog_FS_clades[-match(tmp1,phylog_FS_clades$Node),]
phylog_FS_clades$Node <- as.numeric(as.character(phylog_FS_clades$Node))

#now actually matching these up
colnames(SppRichness_testresults)[2:3] <- paste(colnames(SppRichness_testresults)[2:3],"SppRichness",sep="_")
colnames(RangeSize_testresults)[2:3] <- paste(colnames(RangeSize_testresults)[2:3],"RangeSize",sep="_")
FS_results <- merge(SppRichness_testresults,RangeSize_testresults,by="Node")
FS_results <- merge(phylog_FS_clades,FS_results,by="Node",all=T)


#do the same for ter Steege dataset
taxonomy2_TS <- taxonomy2[which(taxonomy2$Genus%in%phylog_TS$tip.label),]
tmp <- summary(as.factor(taxonomy2_TS$Clade),maxsum=1000)
tmp1 <- names(tmp)[which(tmp==1)]
taxonomy2_TS <- taxonomy2_TS[which(!taxonomy2_TS$Clade%in%tmp1),]
#obtain the clades in that dataset
clades_TS <- unique(taxonomy2_TS$Clade)

#create a matrix for storing node numbers to match with output from test_nodes
phylog_TS_clades <- matrix(NA,length(clades_TS),3)
phylog_TS_clades[,1] <- clades_TS
colnames(phylog_TS_clades) <- c("Clade","Category","Node")
for (i in 1:length(clades_TS)){
	phylog_TS_clades[i,2] <- unique(taxonomy2_TS$Category[which(taxonomy2_TS$Clade==clades_TS[i])])
	tmp <- taxonomy2_TS$Genus[which(taxonomy2_TS$Clade==clades_TS[i])]
	phylog_TS_clades[i,3] <- getMRCA(phylog_TS,tmp)
	print(i)
}
phylog_TS_clades <- as.data.frame(phylog_TS_clades)
#deal with the fact that some clades in the phylogeny have two names (e.g. Arecaceae and Arecales are the same here), choose lower-level name
tmp <- summary(phylog_TS_clades$Node,maxsum=1000)
tmp1 <- names(tmp)[which(tmp>1)]
phylog_TS_clades <- phylog_TS_clades[-match(tmp1,phylog_TS_clades$Node),]
phylog_TS_clades$Node <- as.numeric(as.character(phylog_TS_clades$Node))

#now actually matching these up
colnames(Abundance_testresults)[2:3] <- paste(colnames(Abundance_testresults)[2:3],"Abundance",sep="_")
TS_results <- merge(phylog_TS_clades,Abundance_testresults,by="Node",all=T)


#merge all results together and export
FS_results_sub <- FS_results[which(!is.na(FS_results$Clade)),-1]
TS_results_sub <- TS_results[which(!is.na(TS_results$Clade)),-1]
clade_results <- merge(FS_results_sub,TS_results_sub,by=c("Clade","Category"),all=T)
write.csv(clade_results,"Table_S3.csv",row.names=FALSE,quote=F)


###Assessing whether major groups show significant differences for genus-level attributes
combined_data <- merge(taxonomy,Genus_data_all,by="Genus")
plot(SppRichness~major_group,data=combined_data)
summary(lm(SppRichness~major_group,data=combined_data))
TukeyHSD(aov(SppRichness~major_group,data=combined_data))

plot(RangeSize~major_group,data=combined_data)
summary(lm(RangeSize~major_group,data=combined_data))
TukeyHSD(aov(RangeSize~major_group,data=combined_data))

plot(Abundance~major_group,data=combined_data)
summary(lm(Abundance~major_group,data=combined_data))
TukeyHSD(aov(Abundance~major_group,data=combined_data))