#Archive of analyses for PeerJ submission, Puget Sound Urbanization eDNA, July 2016


##########LOAD LIBRARIES and FUNCTIONS#############
libs=c("vegan","plotrix","MASS","DESeq2","data.table", "gdata","lattice","plyr","dplyr", "lme4", "arm", "gridExtra", "ggplot2", "eeptools", "taxize", "lmtest")
	lapply(libs, require, character.only = TRUE)  #"gsubfn"

bit=function(x){print(x[c(1:10),c(1:10)]); print(dim(x))}  #shows top-left corner of a data.frame/matrix; useful for large datasets

source("/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/AnalysisFinal/plot.stacked.2.R")  #for stacked area plots
	#create color palette
	gg_colors <- function(n) {
		  hues = seq(15, 375, length=n+1)
		  hcl(h=hues, l=65, c=100)[1:n]
		}
		
REsim <- function(x, whichel=NULL, nsims){
  require(plyr)
  mysim <- sim(x, n.sims = nsims)
  if(missing(whichel)){
    dat <- plyr::adply(mysim@ranef[[1]], c(2, 3), plyr::each(c(mean, median, sd)))
    warning("Only returning 1st random effect because whichel not specified")
  } else{
    dat <- plyr::adply(mysim@ranef[[whichel]], c(2, 3), plyr::each(c(mean, median, sd)))
  }
  return(dat)
}  #function for random effects simulation

###################################################


#Load data
#########
	load("/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/Analysis_PeerJ/PS_urban_eDNA_data.rData")
		row.names(presAbs_transects)<-transect_means$Sample.ID
#########		
		
		#checking co-variates; marine variables do not strongly vary with urbanization.
		names(URBAN) # data.frame with environmental covariates is called "URBAN"
		res=NA; for (i in c(7,9:63)){res[i]<-summary(lm(URBAN[,8]~URBAN[,i]))$r.squared}
		res=data.frame(names(URBAN), res); res<-res[order(res[,2]),]
		
		
########################################################################
########\subsection*{Sequence Data}  #stats for how many reads, etc
########################################################################
#	TaxonSummary=read.csv("/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/AnalysisFinal/TaxonCountsSummaryTable.csv", row.names=1)
		
	sum(DECONTAM) #read counts before rarefaction
		
	sum(master$OTU_abundance)  #total reads passing QC
	sum(master$N_DUPs)  #total number of unique sequences (DUPs) passing QC
	length(master$OTU_name_swarm)  #number of OTUs passing QC
		
	sum(master[!is.na(master$OTU_taxon_phylum),3])  #total reads annotated at some level (i.e., at least phylum)
	sum(master[!is.na(master$OTU_taxon_phylum),1])  #total unique sequences (DUPs) annotated at some level (i.e., at least phylum)
	length(master[!is.na(master$OTU_taxon_phylum),2])  #number of OTUs with some taxonomic assignment

	sum(master[!is.na(master$OTU_taxon_phylum),3])/sum(master$OTU_abundance)  	#fraction of reads with taxonomic assignment
	sum(master[!is.na(master$OTU_taxon_phylum),1])/sum(master$N_DUPs)  #fraction of unique sequences (DUPs) with taxonomic assignment
	length(master[!is.na(master$OTU_taxon_phylum),2])/length(master$OTU_name_swarm)  #fraction of OTUs with some taxonomic assignment
	
	sum(master[!is.na(master$OTU_taxon_family),3])/sum(master$OTU_abundance) #fraction of reads with taxonomy resolved at Family level

	colSums(TaxonSummary[,2:4], na.rm=T) #summary of taxonomy (reads)
		length(unique(master$OTU_taxon_MEGAN))  #total number of unique taxa 

	table(master$OTU_taxon_phylum)  #unique OTUs by phylum
		table(master$OTU_taxon_phylum)/sum(table(master$OTU_taxon_phylum))  #OTUs by phylum, proportions of annotated reads

	sum(master[master$eVal<1e-30,3], na.rm=T)/sum(master$OTU_abundance) #high-confidence annotations (reads)
	length(master[master$eVal<1e-30,3])/nrow(master)  #high-confidence annotations (OTUs, as fraction of all OTUs)
	length(master[master$eVal<1e-30,3])/sum(!is.na(master$OTU_taxon_MEGAN))  #high-confidence annotations (OTUs, as fraction of all annotated OTUs)

		#table of blast hits by evalue.  maybe a set of mini-histograms (eval freq) for OTUs by phylum?
		uniquePhyla=unique(master$OTU_taxon_phylum) ; uniquePhyla= uniquePhyla[!is.na(uniquePhyla)]
		eVal_summary<-list(NA)
		for (i in 1:length(uniquePhyla)){
			eVal_summary[[i]]<- table(master[master$OTU_taxon_phylum== uniquePhyla[i],9])
		}
				q=matrix(NA, nrow= length(uniquePhyla), ncol=11) ; colnames(q)=names(eVal_summary[[1]])
					for (i in 1:10){
						q[i,]=eVal_summary[[i]][match(colnames(q), names(eVal_summary[[i]]))]
					}
					q[is.na(q)]<-0 ; row.names(q)=uniquePhyla; q=q[,order(colnames(q), decreasing=T)]
						#write.csv(q, "/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/Figures/OTU_eVals_byPhylum.csv")

		#generate table of all family-level annotations
			a=aggregate(master[,c(3)], by=list(master$OTU_taxon_family), length) ; names(a)=c("Taxon", "OTUs")
			b=aggregate(master[,c(3)], by=list(master$OTU_taxon_family), sum) ; names(b)=c("Taxon", "Reads")
			familySummary= merge(a, b) ; familySummary=familySummary[order(familySummary[,3], decreasing=T),]

		#add natural history to family data
		Fams_w_NatHist=merge(familySummary, eDNANatHist, by="Taxon", all=F)
		Fams_w_NatHist= Fams_w_NatHist[,-c(4,5)]; names(Fams_w_NatHist)[2:3]<-c("OTUs", "Reads")
	#		write.csv(Fams_w_NatHist, "/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/Figures/eDNAFamilies_w_NatHist.csv",row.names=F)

		#are more common OTUs more likely to be annotated?
		isAnnotated=rep(1, times=nrow(master))
		isAnnotated[which(is.na(master$OTU_taxon_MEGAN))]<-0
		
		wilcox.test(master[isAnnotated==1,3], master[isAnnotated==0,3])


		#is there simply more DNA in the water at more urbanized sites?  #no
			a=merge(meta, completeMetadata) ; a=a[a$PSeelgrassSite=="Y",] ; names(a)[9]="Site.code"
				b=data.frame(URBAN$Site.code, URBAN$Area.Weighted.Mean.Percent.Imperviousness) ; names(b)=c("Site.code", "Imperv")
				c=merge(a, b, "Site.code")
			summary(lm(mass_DNA_total_ng~ Imperv, data=c))


########################################################################
##Alpha diversity mean number of OTUs/families per site, using transects as replicates
########################################################################
	DNA_site_MeanRichness = aggregate(specnumber(presAbs_transects), by=list(substr(row.names(presAbs_transects),1,3)), mean)[,2]
	mean_eDNA_families=aggregate(specnumber(families), list(substr(row.names(presAbs_transects), 1,3)), mean)[,2]


#also calculate total richness at each site, in case it's useful later
	DNA_site_TotalRichness= aggregate(presAbs_transects, by=list(substr(row.names(presAbs_transects),1,3)), sum)
		DNA_site_TotalRichness= DNA_site_TotalRichness[,-1] ; DNA_site_TotalRichness<-specnumber(DNA_site_TotalRichness)
		names(DNA_site_TotalRichness)<-unique(substr(row.names(presAbs_transects),1,3))
	DNA_TotalFamilies=specnumber(aggregate(families, list(substr(row.names(families), 1,3)), sum))


########################################################################
####BETA Diversity
#note beta diversity means many things to many people (see Anderson 2011 Ecol Letters, Koleff et al. 2003 J. Animal Ecol).  Here, looking at how different transects are within sites
########################################################################
	betadiv= temp[,dna_index] ; betadiv[betadiv>0]<-1 #create pres-abs df for beta diversity analysis	
	distance=NA; betaList=list(NA)  #create empty objects for results .  betaList will keep all pairwise interactions ; "distance" will use a parallel calculation for the same idea
		index=1
		for (i in unique(temp$Site.code)){
			site.df=betadiv[temp$Site.code%in%i,]
				betaList[[index]]<-as.vector(betadiver(site.df, method=1))
				distance[index]=mean(as.vector(vegdist(site.df, method="jaccard")))
				index=index+1
			}

	beta.df<-as.data.frame(betaList) ; names(beta.df)<-unique(temp$Site.code) ;beta.df[2:3,3]<-NA  #save as data frame; replace repeated values as NA to reflect fact that a two-sample comparison has only a single beta value (that sample didn't have all three transects amplify)
	
	beta<-colMeans(beta.df, na.rm=T)  #keep means as "beta"

	#plot(beta~unique(cofactor.df $Area.Weighted.Mean.Percent.Imperviousness), pch=19, col='black', ylim=c(0.4,.95))
	#for (i in 1:length(unique(temp$Site.code))){
	#	points(beta.df[,i]~rep(unique(cofactor.df $Area.Weighted.Mean.Percent.Imperviousness)[i], times=length(!is.na(beta.df[,i]))))
	#}

	##between sites within categories; could use mean of all cross-site pairwise transect comparisons.  here, using site-wide richness to compare one site to another.
	siteOTUvectors<-aggregate(presAbs_transects, by=list(substr(row.names(presAbs_transects),1,3)), sum) ; row.names(siteOTUvectors)<-siteOTUvectors[,1]
		siteOTUvectors<-siteOTUvectors[,-1] ; siteOTUvectors[siteOTUvectors>0]<-1 #convert to PresAbs
			urbanCats<-cofactor.df$category.x[match(row.names(siteOTUvectors), cofactor.df$Site.code)]
 		moreUrbanSiteOTUs<-siteOTUvectors[urbanCats=="more urban",]
 		lessUrbanSiteOTUs<-siteOTUvectors[urbanCats=="less urban",] 			
				#calculate betas
					MoreUrbanSiteBetas<-as.vector(vegdist(moreUrbanSiteOTUs, method="jaccard"))
					LessUrbanSiteBetas<-as.vector(vegdist(lessUrbanSiteOTUs, method="jaccard"))
					AllSiteBetas<-as.vector(vegdist(siteOTUvectors, method="jaccard"))

				BetweenSiteBetas<-data.frame(c(MoreUrbanSiteBetas, LessUrbanSiteBetas, AllSiteBetas),c(rep("More Urban", length(MoreUrbanSiteBetas)), rep("Less Urban", length(LessUrbanSiteBetas)), rep("All Sites", length(AllSiteBetas)))) ; names(BetweenSiteBetas)<-c("WhittakersBeta", "SiteCategory")
				boxplot(WhittakersBeta~SiteCategory, data=BetweenSiteBetas)

				wilcox.test(MoreUrbanSiteBetas, LessUrbanSiteBetas) ; summary(MoreUrbanSiteBetas);  summary(LessUrbanSiteBetas)


	#RaupCrick accounts for the fact that an increase in richness can, itself, explain a change in beta diversity.  This metric estimates beta diversity by permuting the underlying matrix of occurrences of taxa across sites and across taxa, and using this permuted matrix as a null model.  In effect, it asks, "given the arrangement of alpha diversity that we see, what beta diversity would we see by chance alone?"  Here, I did this to make sure that our beta diversity trend is real.  It seems that the trend is real, because beta diversity still declines with imperviousness, independent of changes in richness.
	raupCrick_dna=NA
	raupCrick_matrix= as.matrix(raupcrick(temp[,dna_index], null="swap"))
		for (i in 1:length(unique(temp$Site.code))){
			colindex=which(substr(colnames(raupCrick_matrix),1,3)==unique(temp$Site.code)[i])
				site.df= raupCrick_matrix[colindex, colindex]
			raupCrick_dna[i]=mean(site.df[lower.tri(site.df)])
		}

	plot(raupCrick_dna~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  #raup-Crick essentially maintains beta diversity relationship, although this depends upon the null model used. Models r0 and r1 generate nonsensical results.  Here, using "swap", the algorithm permutes the original binary data matrix while leaving row and col sums intact, preserving the probabilities of picking a given taxon at random.

########################################################################
##GAMMA diversity
########################################################################
			gamma.df=data.frame(DNA_site_TotalRichness,unique(cofactor.df[,c(1,7)])[,2]) ; names(gamma.df)<-c("GammaDiversity","UrbanCategory") ; gamma.df=data.frame(gamma.df, unique(cofactor.df[,c(1,6,1737)]))
				#plot(GammaDiversity~ UrbanCategory, data=gamma.df, col=gg_colors(2)[as.numeric(as.factor(gamma.df$UrbanCategory))], ylab="Gamma Diversity (OTUs/site)", xlab="Urban Category")  #if we're looking at total OTUs per site, rather than regional diversity

	#how many OTUs/Families total in more/less urban sites?
	moreurban=presAbs_transects[cofactor.df$category.x=="more urban",]
	lessurban=presAbs_transects[cofactor.df$category.x=="less urban",]
		sum(colSums(lessurban)>0) ; sum(colSums(moreurban)>0)  #total OTUs in each category of sites
		sum(colSums(families[cofactor.df$category.x=="more urban",])>0) ; sum(colSums(families[cofactor.df$category.x=="less urban",])>0)


########################################################################
#########site pairs
########################################################################


	OTUrichnessvec=NA  #here, counting richness of the site as the mean number of unique OTUs that occur transects in that site (i.e., alpha diversity)
	Manualrichnessvec=NA  #here, counting richness of the site as the number of unique OTUs that occur at any transect in that site (i.e., overall occurrence)
	sitenamevec=NA
		for (i in unique(substr(row.names(temp),1,3))){
			site.df= temp[substr(row.names(temp), 1, 3)%in%i,]
				Manualrichnessvec <-c(Manualrichnessvec, specnumber(colSums(site.df[,manual_index])))
				OTUrichnessvec <-c(OTUrichnessvec, mean(specnumber(site.df[,dna_index])))
			sitenamevec<-c(sitenamevec, i)
			}
	Manualrichnessvec = Manualrichnessvec[-1] ; sitenamevec = sitenamevec[-1] ; OTUrichnessvec = OTUrichnessvec[-1]
		deltaRichness.df=data.frame(sitenamevec, OTUrichnessvec, Manualrichnessvec) ; names(deltaRichness.df)=c("Site.code", "OTUrichness", "Manualrichness")
			urbanizationdata=envir[,c(3,6,7,10, 13, 14)]
			deltaRichness.df = merge(deltaRichness.df, urbanizationdata) ; deltaRichness.df = merge(deltaRichness.df, URBAN[,c(2,13,29,31)]) ; deltaRichness.df =unique(deltaRichness.df)


########################################################################
#PLOTTING DIVERSITY
####Combined Figure2 Plot. alpha, beta, gamma diversity
########################################################################

	#necessary calcs for the following figure
		mean_eDNA_families=aggregate(specnumber(families), list(substr(row.names(families), 1,3)), mean)[,2]
		deltaRichness.df=data.frame(deltaRichness.df, unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)); names(deltaRichness.df)[12]="Imperv"
		betaSitePairs=data.frame(unique(cofactor.df[,c(1,6,1737)]), beta)
#			gamma.df=data.frame(DNA_site_TotalRichness,unique(cofactor.df[,c(1,7)])[,2]) ; names(gamma.df)<-c("GammaDiversity","UrbanCategory") ; gamma.df=data.frame(gamma.df, unique(cofactor.df[,c(1,6,1737)]))


def.par <- par(no.readonly = TRUE) # save default, for resetting...


nf <- layout(matrix(c(1,2,3,4,5,5), 3, 2, byrow = TRUE), widths = rep.int(1.5, 2), respect = TRUE)
#layout.show(nf)

	par(mgp=c(2,.5,0)) #distance between axis titles and the axes themselves

#SUB1 #eDNA OTU richness and Family Richness
	
	par(mar=c(1,4,1,4))
	plot(specnumber(cofactor.df[,dna_index])~ cofactor.df $Area.Weighted.Mean.Percent.Imperviousness, pch=19, ylab="Alpha Diversity (OTUs/Transect)", xlab="", col=gg_colors(2)[2], cex=0.5, xlim=c(1,45), axes=F) ; axis(1, labels=F) ; axis(2, las=2) ; box(bty="o")
		points(unique(cofactor.df $Area.Weighted.Mean.Percent.Imperviousness),DNA_site_MeanRichness, pch=19, col=gg_colors(2)[2], cex=1.2)
		abline(lm(DNA_site_MeanRichness ~ unique(cofactor.df $Area.Weighted.Mean.Percent.Imperviousness)), col=gg_colors(1), lwd=2)

		#scaling factor to plot N families on same plot : min(specnumber(cofactor.df[,dna_index]))/min(specnumber(families))  #5.3
		points(specnumber(families)*5.3~ I(cofactor.df $Area.Weighted.Mean.Percent.Imperviousness+2), col='lightgrey', pch=19, cex=0.5)  #eDNA
			points(I(unique(cofactor.df $Area.Weighted.Mean.Percent.Imperviousness)+2), mean_eDNA_families*5.3, pch=19, col='lightgrey', cex=1.2)
			abline(lm(mean_eDNA_families*5.3~I(unique(cofactor.df $Area.Weighted.Mean.Percent.Imperviousness)+2)), lty=2, col=gg_colors(1))
		axis(4, seq(50,350,50), labels=round(seq(50,350,50)/5.3))
		mtext("Taxonomic Family Richness", side=4, line=2, cex=0.7)
		legend("topleft", c("OTUs", "Taxonomic Families"), fill=c(gg_colors(2)[2], "lightgrey"), cex=.6)

#SUB2  #richness dot-and-line
#NOTE: needs original data points added (not just means)
	par(mar=c(1,4,1,4))
			plot(deltaRichness.df $OTUrichness ~ deltaRichness.df$Imperv, pch=19, type="n", ylim=c(0.9*min(OTUrichnessvec),1.1*max(OTUrichnessvec)), xlab="Watershed Imperviousness (Percent)", ylab="Mean Alpha Diversity (OTUs/Transect)", xlim=c(1,45), axes=F) ; axis(1, labels=F) ; axis(2, las=2) ; box(bty="o")
								#abline(lm(deltaRichness.df$OTUrichness~deltaRichness.df$Imperv), col="black", lty=2)
			for(i in 1:length(unique(deltaRichness.df$pair.number))){
			site.df= deltaRichness.df[deltaRichness.df$pair.number==unique(deltaRichness.df$pair.number)[i],]
				points(site.df$OTUrichness ~site.df$Imperv, pch=19, col=gg_colors(4)[i])
					lines(site.df$Imperv,site.df$OTUrichness, col=gg_colors(4)[i])
					#text(site.df$PC1.74,site.df$OTUrichness +0.5, i)
			}
				legend("topleft", c("RB-SM","BG-CW","PC-CC","SI-MA"), fill=gg_colors(4), cex=0.6)


#SUB3  Beta Div by Imperviousness
	par(mar=c(3,4,1,4))
	g=data.frame(betaMeans.df[,1:1000],unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)) ; names(g)=c(paste0('sample', 1:1000), "Imperv")
	g=reshape(g, varying=list(1:1000), direction="long")
	#it is a huge pain to do a boxplot with an x-axis proportional to the numeric values of the grouping factors...
	xvals=round(g$Imperv,1)

	boxplot(g$sample1 ~ 
		factor(xvals, levels = seq(min(xvals), max(xvals),0.1)),
		data = g, 
		xaxt = 'n',
		boxwex=10,
		cex=.5,
		border=gg_colors(2)[2],
		xlab="Watershed Imperviousness (Percent)",
		ylab="Transect-Level Beta Diversity"
		) 
	axis(1, at = c(-5,84,184,284,384), labels=c(0,10,20,30,40)) 
		lin.mod=lm(sort(rowMeans(betaMeans.df), decreasing=T)~c(1,8,25,56,238,321,336,406)) #note I needed to mess with x-values for plotting here
	abline(lin.mod, col=gg_colors(1), lwd=2)


#SUB4	Pairs, beta dot-and-line (needs original data points added, not just means)
	par(mar=c(3,4,1,4))
			plot(betaSitePairs $beta ~ betaSitePairs $Area.Weighted.Mean.Percent.Imperviousness, pch=19, type="n", ylim=c(0.9*min(betaSitePairs$beta),1.1*max(betaSitePairs$beta)), xlab="Watershed Imperviousness (Percent)", ylab="Mean Transect-Level Beta Diversity", xlim=c(1,45), axes=F) ; axis(1, labels=T) ; axis(2, las=2) ; box(bty="o")
								#abline(lm(betaSitePairs $beta~ betaSitePairs $Area.Weighted.Mean.Percent.Imperviousness), col="black", lty=2)
			for(i in 1:length(unique(betaSitePairs $pair.number.x))){
			site.df= betaSitePairs[betaSitePairs $pair.number.x==unique(betaSitePairs $pair.number.x)[i],]
				points(site.df$beta ~site.df$Area.Weighted.Mean.Percent.Imperviousness, pch=19, col=gg_colors(4)[i])
					lines(site.df$Area.Weighted.Mean.Percent.Imperviousness,site.df$beta, col=gg_colors(4)[i])
					#text(site.df$PC1.74,site.df$OTUrichness +0.5, i)
			}
				legend("topright", c("RB-SM","BG-CW","PC-CC","SI-MA"), fill=gg_colors(4), cex=0.6)


#SUB5-6 ; gamma accumulation; depends on /Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/Analysis_UrbanizationSecondSubmission/GammaDiversity_accumulationCurve.R
	boxplot(gammaResultsAll ~ c(rep(1:8, rep(NsamplesPerSite,8))), ylim=c(100,1800), border =gg_colors(3)[1], xlab="N Sites", ylab="Unique OTUs")
			ydata= gammaResultsAll; xdata= c(rep(1:8, rep(NsamplesPerSite,8)))
		nls.mod=nls(ydata~a*log(xdata)+b, start=list(a=600, b=300))
			a=coefficients(nls.mod)[1] ; b=coefficients(nls.mod)[2]
		curve(a*log(x)+b, xlim=c(1,8), add=T, col=gg_colors(3)[1], lwd=2)
	
	boxplot(gammaResultsMoreUrban ~ c(rep(1:4, rep(NsamplesPerSite,4))), add=T, border =gg_colors(3)[2])
		ydata= gammaResultsMoreUrban; xdata= c(rep(1:4, rep(NsamplesPerSite,4)))
		nls.mod=nls(ydata~a*log(xdata)+b, start=list(a=600, b=300))
			a=coefficients(nls.mod)[1] ; b=coefficients(nls.mod)[2]
		curve(a*log(x)+b, xlim=c(1,4), add=T, col=gg_colors(3)[2], lwd=2)

	boxplot(gammaResultsLessUrban ~ c(rep(1:4, rep(NsamplesPerSite,4))), add=T, border =gg_colors(3)[3])
		ydata= gammaResultsLessUrban; xdata= c(rep(1:4, rep(NsamplesPerSite,4)))
		nls.mod=nls(ydata~a*log(xdata)+b, start=list(a=600, b=300))
			a=coefficients(nls.mod)[1] ; b=coefficients(nls.mod)[2]
		curve(a*log(x)+b, xlim=c(1,4), add=T, col=gg_colors(3)[3], lwd=2)

	legend("bottomright",legend=c("All Sites", "More Urban Sites", "Less Urban Sites"), fill=gg_colors(3))


par(def.par)  #- reset to default


########################################################################
##Regressions with Imperviousness
########################################################################
###Imperviousness Correlations w OTU richness and Family Richness
	summary(lm(DNA_site_MeanRichness~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))) #main correlation, using site means.
	summary(glm(round(mean_eDNA_families)~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness), family='poisson')) #number of taxonomic Families w imperviousness (using Poisson with site means, because of small numbers of counts)


#rare vs. common OTUs
#rare OTUs actually increase in relative frequency at high imperviousness
		common=temp[,dna_index][,colSums(temp[,dna_index])>100]
		rare=temp[,dna_index][,colSums(temp[,dna_index])<11]
			summary(lm(I(specnumber(rare)/specnumber(common))~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))


########################################################################
###########mixed-effects models
########################################################################
	#Set up dataframe with OTU richness, Imperv surface cover, covariates, and Family richness
		rareRichness = specnumber(temp[,dna_index], MARGIN=1)  #note: transect-level
		Imperv=cofactor.df$Area.Weighted.Mean.Percent.Imperviousness
			rareRichness=data.frame(names(rareRichness), rareRichness) ; names(rareRichness)[1]="Sample.ID"
	LMEtemp=merge(temp[,1:7], rareRichness, "Sample.ID", all.x=F)  #add to analysis df
		LMEtemp =data.frame(LMEtemp, specnumber(families)) ; names(LMEtemp)[9]="FamilyRichness"
						xyplot(rareRichness ~ category | pair.number, data= LMEtemp)  #vizualize the problem
						xyplot(FamilyRichness ~ category | pair.number, data= LMEtemp)  #vizualize the problem
					

	#LME for richness by imperviousness, 
		#syntax:  responseVar~fixed+(random) ; the number 1 indicates the intercept as a variable, such that random var (1|pair.number) means that each pair has its own intercept (but the same slope), and random var (Imperv|pair.number) estimates a separate error term the Imperv slope for each pair.
		
		
		(RandInterceptFixedSlope=lmer(rareRichness ~ Imperv + (1|pair.number) + (1 | Site.code), data= LMEtemp, REML=T))
	
		ranef(RandInterceptFixedSlope, condVar=TRUE, whichel = "pair.number")  #show the random effects
		fixef(RandInterceptFixedSlope, condVar=TRUE, whichel = "pair.number")  #show the fixed effect
		REsim(RandInterceptFixedSlope, whichel = "pair.number", nsims = 1000)  #simulate the random effects

		1-var(residuals(RandInterceptFixedSlope))/(var(model.response(model.frame(RandInterceptFixedSlope))))  #like an r-squared; see http://stats.stackexchange.com/questions/95054/how-to-get-the-overall-effect-for-linear-mixed-model-in-lme4-in-r


########################################################################
### INDIVIDUAL OTUs, pres/abs correlations with Imperviousness  [LOGISTIC REGRESSION]  ; see /Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/Analysis/DiversityAnalysis/DESeq2_working.R for deseq analysis of abundances, if interested
########################################################################
		namevec=NA ; pvalvec=NA ; logitvec=NA ; AICvec=NA ; ORvec=NA
			for (i in 1:ncol(presAbs_transects)){
				namevec[i]=names(presAbs_transects)[i]
					mod=glm(presAbs_transects[,i]~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness, family = "binomial"(link='logit'))
				pvalvec[i]=summary(mod)$coefficients[8]
				logitvec[i]=summary(mod)$coefficients[2]
				AICvec[i]=AIC(mod)
				ORvec[i]=exp(summary(mod)$coefficients[2])
			}		
		results=data.frame(namevec, pvalvec, logitvec, AICvec, ORvec)  ;  results=results[order(results[,5], decreasing=T),]
		
		#OTUs most positively correlated with Imperv :
		increasing=subset(results, results$logitvec>0&results$pvalvec<0.05)  ; nrow(increasing)
		a= masterDECONTAM[match(increasing$namevec, masterDECONTAM $OTU_name_swarm),]  #bivalves and echinoids
		table(a$OTU_taxon_family) ; table(a$OTU_taxon_class)

				#null hypothesis: are these taxa overrepresented, relative to their proportion in the overall dataset?
				table(master$OTU_taxon_family)
				#e.g., Mytilus has 14% of OTUs in cleaned database:  217/sum(table(littleMaster$OTU_taxon_family))

		#OTUs most negatively correlated with Imperv :
		decreasing=subset(results, results$logitvec<0&results$pvalvec<0.05)
		b= masterDECONTAM[match(decreasing$namevec, masterDECONTAM $OTU_name_swarm),]  #barnacles, but nearly nothing
		table(b$OTU_taxon_family)


########################################################################
###number of families per class or order
########################################################################
		fams_temp=merge(cofactor.df, families1, "Sample.ID", all.x=F)  #use 'phyla1', 'classes1', etc... from above ; This shows how many unique OTUs in Mytilidae, e.g., occurred in each transect
			row.names(fams_temp)= cofactor.df $Sample.ID
		fams2classes=lineages[match(names(fams_temp[,1793:ncol(fams_temp)]), lineages$OTU_taxon_family),7]		
		fams2orders=lineages[match(names(fams_temp[,1793:ncol(fams_temp)]), lineages$OTU_taxon_family),6]

		#Example analysis:
		clams=subset(masterDECONTAM, masterDECONTAM$OTU_taxon_class=="Bivalvia") 

		df=data.frame(cofactor.df $Area.Weighted.Mean.Percent.Imperviousness,specnumber(cofactor.df[,(names(cofactor.df)%in%clams$OTU_name_swarm)])) ; names(df)=c("Imperv","BivalveOTUrichness")
			mean(df[df[,1]<10, 2]) #mean number bivalve OTUs at imperv less than 10
			mean(df[df[,1]>25, 2]) #mean number bivalve OTUs at imperv greater than 25
			summary(glm(BivalveOTUrichness ~Imperv, family="poisson", data=df))
	
		df=data.frame(specnumber(fams_temp[, which(fams2classes=="Bivalvia")+1792]), cofactor.df$Area.Weighted.Mean.Percent.Imperviousness) ; names(df)=c("Richness", "Imperv")
			mean(df[df[,2]<10, 1]) #mean number bivalve families at imperv less than 10
			mean(df[df[,2]>25, 1]) #mean number bivalve families at imperv greater than 25
			summary(glm(Richness ~Imperv, family="poisson", data=df))


########################################################################
########\subsection*{eDNA Variance Within and Among Sites}
########################################################################
		#PERMANOVA   #using presence/absence version of normalized counts, with no transect-level averaging
			distmat=vegdist(presAbs, method="jaccard") #bray and jaccard give very similar answers.  euclidian gives more variance to the residuals.
				transects=substr(row.names(presAbs), 1,5)
				sites=substr(row.names(presAbs), 1,3)
				adonis(distmat~sites+transects)
				
				#with count data rather than presence/Absence
				distmat=vegdist(t(rareData), method="bray") #bray and jaccard give very similar answers.  euclidian gives more variance to the residuals.
				transects=substr(row.names(t(rareData)), 1,5)
				sites=substr(row.names(t(rareData)), 1,3)
				adonis(distmat~sites+transects)

########################################################################
#constrained correspondence analysis; NOTE: could constrain by site pair number or by urban category; both very strong results
########################################################################
		#transect-level ordination
		dna.cca=cca(presAbs_transects~cofactor.df$category.x)
			ordiplot(dna.cca, display="sites")
			samplingsites=substr(row.names(presAbs_transects),1,3)
				ordihull(dna.cca,groups= samplingsites,draw="polygon",col="grey90",label=F)
				orditorp(dna.cca,display="sites")

		dna.pca=prcomp(presAbs_transects)
#			pdf("/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/Figures/SecondSubmission/SupplFig_DNA_PCA.pdf", width=7, height=7)
			ordiplot(dna.pca, display="sites", main="PCA on eDNA OTU Data\nPresence-Absence")
			samplingsites=substr(row.names(presAbs_transects),1,3)
				# ordihull(dna.pca,groups= samplingsites,draw="polygon",col="grey90",label=F)
				# orditorp(dna.pca,display="sites")
			ordihull(dna.pca,groups= samplingsites,draw="polygon",col= gg_colors(2)[1],label=F, show.groups=unique(samplingsites)[c(1,5,6,7)])
			ordihull(dna.pca,groups= samplingsites,draw="polygon",col= gg_colors(2)[2],label=F, show.groups=unique(samplingsites)[c(2:4,8)])
				orditorp(dna.pca,display="sites")
										legend("topright", c("More Urban", "Less Urban"), fill=gg_colors(2), cex=0.8)
#			dev.off()

		###Site-level ordination

		a=aggregate(presAbs_transects, by=list(substr(row.names(presAbs_transects),1,3)), sum) ; a=a[,-1]
			a[a>0]<-1
		dna.cca=cca(a~unique(cofactor.df[,c(1,7)])[,2])


########################################################################
####Functional Diversity Analysis
########################################################################
natHist_index=c(5:14)
	functionalDiv_Families=matrix(NA, nrow(families), length(natHist_index)); index=1
		for (i in unique(row.names(families))){
			transect.df=families[i, families[i,]>0]
			functionalDiv_Families[index,]<-colSums(eDNANatHist[match(names(transect.df), eDNANatHist[,1]), natHist_index])  #just counts each family once, no matter how many OTUs present at a transect
				index=index+1
			}
		functionalDiv_Families =as.data.frame(functionalDiv_Families) ; row.names(functionalDiv_Families)<-row.names(families) ; colnames(functionalDiv_Families)<-names(eDNANatHist)[natHist_index]    #Note that each family can/will fit into more than one functional category (e.g., epifauna and mobile)


##COUNT data to answer question, how common are non-marine species in the data? 

sum(eDNANatHist[eDNANatHist$Habitat.terrestrial==1| eDNANatHist$Habitat.freshwater==1,3]) #total freshwater and terrestrial reads, including human; use [2:37] to exclude human
sum(eDNANatHist[eDNANatHist$Habitat.intertidal==1| eDNANatHist$Habitat.subtidal==1,3]) #total marine reads

	#Plotting functional groups
	plot(rowSums(functionalDiv_Families)~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)  #crude measure of functional diversity; summing rows
	
		par(mfrow=c(5,2))
		par(mar=c(1,2,1,1))
		par(mgp=c(1,.5,0))
		for (i in 1:ncol(functionalDiv_Families)){
		plot(functionalDiv_Families[,i]~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness, main="", pch=19, ylab="") 
			title(names(functionalDiv_Families)[i], line=-7)
		mod=lm(functionalDiv_Families[,i]~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness) #; if(summary(mod)$coefficients[8]<0.05) abline(mod, col='red') else abline(mod, col='lightgrey')
		
		
		ydata=functionalDiv_Families[,i]
		xdata=cofactor.df$Area.Weighted.Mean.Percent.Imperviousness
		nonlin.mod=nls(ydata~a*(xdata^2)+b*xdata+c, start=list(a=1, b=1, c=1))
			#lines(sort(xdata), predict(nonlin.mod)[order(xdata)])
		
		if(lrtest(mod, nonlin.mod)$Pr[2]<0.05) lines(sort(xdata), predict(nonlin.mod)[order(xdata)]) else if(summary(mod)$coefficients[8]<0.05) abline(mod, col='red') else abline(mod, col='lightgrey')
		}


###Assigning Niche Categories
		NatHistAttributes=NA; for(i in 1:10){NatHistAttributes[i]<-strsplit(names(eDNANatHist)[natHist_index], "\\.")[[i]][2]}
		
		TheoreticalFunctionalNiches=expand.grid(NatHistAttributes[1:4], NatHistAttributes[5:8], NatHistAttributes[9:10])
			TheoreticalFunctionalNiches <- TheoreticalFunctionalNiches[-c(3,4,19,20,24,28,32),] #remove logically impossible niches
				row.names(TheoreticalFunctionalNiches)<-1:nrow(TheoreticalFunctionalNiches)
		
		bestmatch=NA
		listMatches=list(NA)
		for (i in 1:nrow(eDNANatHist)){
			test=NatHistAttributes[eDNANatHist[i, natHist_index]==1]
			for (j in 1:nrow(TheoreticalFunctionalNiches)){bestmatch[j]<-sum(test%in%unlist(TheoreticalFunctionalNiches[j,]))}	
			listMatches[[i]]<-TheoreticalFunctionalNiches[bestmatch==3, 1:3]
		}
		
		niche=list(NA)
		for (i in 1:nrow(eDNANatHist)){
			for (j in 1:nrow(listMatches[[i]])){
				if(nrow(listMatches[[i]])==1) niche[[i]]<-paste(as.character(unlist(listMatches[[i]])), collapse="_")
				if(nrow(listMatches[[i]])>1) if(j==1) niche[[i]]<-paste(as.character(unlist(listMatches[[i]][j,])), collapse="_")
					if(j>1) niche[[i]][j]<-paste(as.character(unlist(listMatches[[i]][j,])), collapse="_")
			}
		}
		
		observedNicheNames=unique(unlist(niche))
		observedNiches=as.data.frame(matrix(0, nrow=nrow(eDNANatHist), ncol=length(observedNicheNames)))
		for (i in 1:nrow(eDNANatHist)){
			observedNiches[i,match(niche[[i]],observedNicheNames)]<-1
		}
		row.names(observedNiches)<-eDNANatHist$Taxon ; colnames(observedNiches)<-observedNicheNames

	#assign taxonomic families to observed niches and map back to transect-level data
	Niche_Families=as.data.frame(matrix(NA, nrow(families), ncol(observedNiches))); index=1
		for (i in unique(row.names(families))){
			transect.df=families[i, families[i,]>0]
			Niche_Families[index,]<-colSums(observedNiches[match(names(transect.df), row.names(observedNiches)),])  #just counts each family once, no matter how many OTUs present at a transect
				index=index+1
			}
			
		row.names(Niche_Families)<-row.names(families) ; colnames(Niche_Families)<-names(observedNiches)    #Note that each family can be in more than one niche, because of variation in taxa below the family level.


	#Alpha functional div -- number of unique niches per transect. 

	#number of functional groups goes up with imperviousness...
plot(specnumber(Niche_Families)~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)  
	summary(lm(specnumber(Niche_Families)~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))

	#... but this is a function of the overall increase in OTU and taxon richness. Normalizing by OTU or taxon richness reveals a decline in functional diversity
plot(specnumber(Niche_Families)/specnumber(temp[,dna_index])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)  #N niches per OTU
	summary(lm(specnumber(Niche_Families)/specnumber(temp[,dna_index])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  #strong decrease with imperviousness
plot(specnumber(Niche_Families)/specnumber(families)~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)  #N niches per family
	summary(lm(specnumber(Niche_Families)/specnumber(families)~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  #strong decrease with imperviousness


		#dealing with pseudo-replication by using site means in calculations
		NicheFamiliesMeans<-aggregate(specnumber(Niche_Families), list(substr(names(specnumber(Niche_Families)),1,3)), mean)
		NicheFamiliesSums<-specnumber(aggregate(Niche_Families, list(substr(names(specnumber(Niche_Families)),1,3)), mean))
		
		FamilyNumberMeans<-	aggregate(specnumber(families), list(substr(names(specnumber(families)),1,3)), mean)
		FamilyNumberSums<-	specnumber(aggregate(families, list(substr(names(specnumber(families)),1,3)), sum))

			# plot(I(NicheFamiliesMeans[,2]/FamilyNumberMeans[,2])~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))
				summary(lm(I(NicheFamiliesMeans[,2]/FamilyNumberMeans[,2])~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)))
				summary(lm(NicheFamiliesMeans[,2]~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)))


			##PLOTTING
#			pdf("/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/Figures/SecondSubmission/Niches_by_Imperviousness1x2.pdf", width=8, height=3.3)
			par(mfrow=c(1,2))
			par(mar=c(4,5,1,2))
			par(mgp=c(2,1,0))
				plot(NicheFamiliesMeans[,2]~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness), pch=19, ylab="Life-History Richness \n(Life-Histories/Site)", xlab="Watershed Imperviousness (Percent)", ylim=c(10,18), col=gg_colors(2)[2])
					abline(lm(NicheFamiliesMeans[,2]~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)), col=gg_colors(1), lwd=2)
					points(specnumber(Niche_Families)~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness, cex=0.5, pch=19, col=gg_colors(2)[2])
					
				
				plot(I(NicheFamiliesMeans[,2]/FamilyNumberMeans[,2])~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness), pch=19, ylab="Per-Taxon Life-History Richness \n(Life-Histories/Site/Family)", xlab="Watershed Imperviousness (Percent)", ylim=c(0.2,1), col=gg_colors(2)[2])
					abline(lm(I(NicheFamiliesMeans[,2]/FamilyNumberMeans[,2])~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)), col='grey', lwd=2)
						meanFamsPerSite=NA
						for(i in 1:8){meanFamsPerSite=c(meanFamsPerSite, rep(FamilyNumberMeans[i,2], length(grep(NicheFamiliesMeans[i,1], row.names(Niche_Families)))))} ; meanFamsPerSite= meanFamsPerSite[-1]
					points(I(specnumber(Niche_Families)/meanFamsPerSite)~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness, cex=0.5, pch=19, col=gg_colors(2)[2])
#			dev.off()


				#splitting into more/less urban categories for wilcox.test or t.test, etc
				more1<-NicheFamiliesMeans[unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)>10, 2] #moreUrban
				less1<-NicheFamiliesMeans[unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)<10, 2] #lessUrban
					wilcox.test(more1, less1)

				more2<- (NicheFamiliesMeans[unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)>10, 2])/(FamilyNumberMeans[unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)>10, 2])
				less2<- (NicheFamiliesMeans[unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)<10, 2])/(FamilyNumberMeans[unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)<10, 2])


	Niche_Families <- Niche_Families[,order(colSums(Niche_Families), decreasing=T)]  #order cols by frequency of niche	
	orderedNicheFamilies <- Niche_Families[order(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness, decreasing=F),]  #order rows by imperviousness, for plotting
	#barplot(t(as.matrix(orderedNicheFamilies[,7:10])), las=2, col=gg_colors(6), legend=T, args.legend=list(x="topleft"), ylab="N Families")
	#barplot(t(as.matrix(orderedNicheFamilies/rowSums(orderedNicheFamilies))), las=2, col=gg_colors(6), legend=F, args.legend=list(x="topleft"), ylab="N Families")

	#plotting these niches by imperviousness, after normalizing by N families present at the site. 
		par(mfrow=c(5,2))
		par(mar=c(1,2,1,1))
		par(mgp=c(1,.5,0))
		for (i in 10:19){
		plot(Niche_Families[,i]/specnumber(families)~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness, main="", pch=19, ylab="") 
			title(names(Niche_Families)[i], line=-7)
		mod=lm(Niche_Families[,i]/specnumber(families)~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness) #; if(summary(mod)$coefficients[8]<0.05) abline(mod, col='red') else abline(mod, col='lightgrey')
		
		
		ydata= Niche_Families[,i]/specnumber(families)
		xdata=cofactor.df$Area.Weighted.Mean.Percent.Imperviousness
		nonlin.mod=nls(ydata~a*(xdata^2)+b*xdata+c, start=list(a=1, b=1, c=1))
			#lines(sort(xdata), predict(nonlin.mod)[order(xdata)])
		
		if(lrtest(mod, nonlin.mod)$Pr[2]<0.05) lines(sort(xdata), predict(nonlin.mod)[order(xdata)]) else if(summary(mod)$coefficients[8]<0.05) abline(mod, col='red') else abline(mod, col='lightgrey')
		}
		
		
	#Beta functional div -- how different are transects within a site, with respect to functional diversity?
		#use presAbs version of Niche_Families, so as to count each unique niche only once?
		NichePresAbs=Niche_Families ; NichePresAbs[NichePresAbs>0]<-1


	NicheDistance=NA; NicheBetaList=list(NA)  #create empty objects for results .  betaList will keep all pairwise interactions ; "distance" will use a parallel calculation for the same idea
		index=1
		for (i in unique(temp$Site.code)){
			site.df= NichePresAbs[temp$Site.code%in%i,]
				NicheBetaList[[index]]<-as.vector(betadiver(site.df, method=1))
				NicheDistance[index]=mean(as.vector(vegdist(site.df, method="jaccard")))
				index=index+1
			}

	NicheBeta.df<-as.data.frame(NicheBetaList) ; names(NicheBeta.df)<-unique(temp$Site.code) ; NicheBeta.df[2:3,3]<-NA  #save as data frame; replace repeated values as NA to reflect fact that a two-sample comparison has only a single beta value (that sample didn't have all three transects amplify)
	
	NicheBeta<-colMeans(NicheBeta.df, na.rm=T)  #keep means as "beta"

	plot(NicheBeta~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))
		summary(lm(NicheBeta~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness)))


	#Gamma diversity (site-level)
		#nearly all functional groups occur at all sites at least once; real differences are at the transect level
			specnumber(aggregate(Niche_Families, by=list(substr(row.names(Niche_Families),1,3)), sum))


#more- and less-urban sites sort out well by functional distances as well
pca.niche<-prcomp(Niche_Families)
#			pdf("/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/Figures/SecondSubmission/PCA_LifeHistory.pdf", width=7, height=7)
			ordiplot(pca.niche, display="sites", main="Family-Level Life-History Categories")
			samplingsites=substr(row.names(Niche_Families),1,3)
				ordihull(pca.niche,groups= samplingsites,draw="polygon",col=gg_colors(8),label=F)
						ordihull(pca.niche,groups= samplingsites,draw="polygon",col= gg_colors(2)[1],label=F, show.groups=unique(samplingsites)[c(1,5,6,7)])
						ordihull(pca.niche,groups= samplingsites,draw="polygon",col= gg_colors(2)[2],label=F, show.groups=unique(samplingsites)[c(2:4,8)])
				orditorp(pca.niche,display="sites")
										legend("topright", c("More Urban", "Less Urban"), fill=gg_colors(2), cex=0.8)
#			dev.off()

			#niche loadings:
			pca.niche[[2]][,c(1:2)]


			# ordiplot(pca.niche, display="sites")
			# urbancategory=cofactor.df$category.x
						# ordihull(pca.niche,groups= urbancategory,draw="polygon",col= gg_colors(2)[1],label=F, show.groups="more urban")
						# ordihull(pca.niche,groups= urbancategory,draw="polygon",col= gg_colors(2)[2],label=F, show.groups="less urban")
						# orditorp(pca.niche,display="sites")
						# legend("topright", c("More Urban", "Less Urban"), fill=gg_colors(2), cex=0.8)


########################################################################
#####Human footprint
########################################################################
	#genera=read.csv("/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/AnalysisFinal/allGenus_mod.csv", header=F)
		Imperv=cofactor.df$Area.Weighted.Mean.Percent.Imperviousness
		#look for cultivated species:
		#Modiolus richness is signif correlated w Imperv
		#summary(glm(specnumber(cofactor.df[,genera[genera[,3]=="Modiolus",1]])~Imperv, family='quasipoisson'))  #cultivated?
		summary(glm(specnumber(cofactor.df[,genera[genera[,3]=="Panopea",1]])~Imperv, family='poisson'))   #cultivated, but obv also naturally occurring
				
		#Human richness same
		summary(glm(specnumber(cofactor.df[,genera[genera[,3]=="Homo",1]])~ Imperv, family='poisson'))
		
		#putative invasives/introduced/reintroduced
		summary(glm(specnumber(cofactor.df[,genera[genera[,3]=="Mercenaria",1]])~Imperv, family='quasipoisson'))  #marg
		summary(glm(abs(cofactor.df[,genera[genera[,3]=="Nuttallia",1]])~Imperv, family='quasipoisson'))  #marg
		summary(glm(specnumber(cofactor.df[,genera[genera[,3]=="Anguinella",1]])~Imperv, family='quasipoisson'))  #nonsig 
		summary(glm(specnumber(cofactor.df[,genera[genera[,3]=="Mya",1]])~Imperv, family='poisson'))  #highly sig 
		
		summary(glm(specnumber(cofactor.df[,genera[genera[,3]=="Bos",1]])~Imperv, family='quasipoisson'))  #sig 
		summary(glm(specnumber(cofactor.df[,genera[genera[,3]=="Gallus",1]])~Imperv, family='quasipoisson'))  # not sig, but tiny numbers of occurrences.  Canidae, similarly, has only one occur


########################################################################
####NATURAL HISTORY COMPARISON
#using Fams_w_NatHist and manual_FamilyCounts, from above
########################################################################
				
				#note: keeping this for possible future use to compare more/less urban sites, rather than comparing manual and eDNA counts, as here
				table(eDNANatHist$Category.infauna) ; table(ManualNatHist$Category.infauna)
					binom.test(c(39,0), p=23/135)  #are the manual and eDNA taxa drawn from the same distribution, in terms of frequency of infaunal taxa?  no.  use binomial test, with expected probability set by the observed prob in the eDNA taxa


				eDNA_infauna=eDNANatHist$Taxon[eDNANatHist$Category.infauna==1] #infaunal taxa
				eDNA_epifauna=eDNANatHist$Taxon[eDNANatHist$Category.epifauna==1] #epifaunal taxa
				eDNA_motile=eDNANatHist$Taxon[eDNANatHist$Mobility.motile ==1] #motile
				eDNA_sessile=eDNANatHist$Taxon[eDNANatHist$Mobility.sessile ==1] #sessile
				eDNA_fish=eDNANatHist$Taxon[grep("Fish", eDNANatHist $Description)]
				eDNA_inverts=eDNANatHist$Taxon[grep("Invert", eDNANatHist $Description)]
				eDNA_intertidal=eDNANatHist$Taxon[eDNANatHist$Habitat.intertidal ==1] #intertidal taxa
				eDNA_subtidal=eDNANatHist$Taxon[eDNANatHist$Habitat.subtidal ==1] #subtidal taxa
				eDNA_terrestrial=eDNANatHist$Taxon[eDNANatHist$Habitat.terrestrial ==1] #subtidal taxa
				
		summary(lm(specnumber(cofactor.df[,master$OTU_name_swarm[master$OTU_taxon_family%in% eDNA_intertidal]])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  # correlation bw imperviousness and intertidal OTU richness
		summary(lm(specnumber(cofactor.df[,master$OTU_name_swarm[master$OTU_taxon_family%in% eDNA_subtidal]])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  # correlation bw imperviousness and subtidal OTU richness
		summary(lm(specnumber(cofactor.df[,master$OTU_name_swarm[master$OTU_taxon_family%in% eDNA_terrestrial]])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  # no correlation bw imperviousness and terrestrial OTU richness
		
		
		summary(lm(specnumber(cofactor.df[,master$OTU_name_swarm[master$OTU_taxon_family%in%eDNA_fish]])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  #no correlation bw imperviousness and fish OTU richness
		summary(lm(specnumber(cofactor.df[,master$OTU_name_swarm[master$OTU_taxon_family%in% eDNA_inverts]])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  #pos correlation bw imperviousness and invert OTU richness
		
				
				summary(lm(specnumber(cofactor.df[,master$OTU_name_swarm[master$OTU_taxon_family%in%eDNA_infauna]])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  #correlation bw imperviousness and infaunal OTU richness
				summary(lm(specnumber(cofactor.df[,master$OTU_name_swarm[master$OTU_taxon_family%in%eDNA_epifauna]])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  #correlation bw imperviousness and epifaunal OTU richness
				summary(lm(specnumber(cofactor.df[,master$OTU_name_swarm[master$OTU_taxon_family%in% eDNA_motile]])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  #correlation bw imperviousness and motile OTU richness
				summary(lm(specnumber(cofactor.df[,master$OTU_name_swarm[master$OTU_taxon_family%in% eDNA_sessile]])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  #correlation bw imperviousness and sessile OTU richness
	
	#sessile OTUs triple
	nh.temp<-data.frame(specnumber(cofactor.df[,master$OTU_name_swarm[master$OTU_taxon_family%in% eDNA_sessile]]), cofactor.df$Area.Weighted.Mean.Percent.Imperviousness); names(nh.temp)=c("sessileOTUs", "Imperv")
	mean(nh.temp$sessileOTUs[nh.temp$Imperv<25])
	mean(nh.temp$sessileOTUs[nh.temp$Imperv>25])
		
		
				summary(lm(specnumber(cofactor.df[,master$OTU_name_swarm[master$OTU_taxon_family%in%eDNANatHist$Taxon[eDNANatHist$Category.demersal ==1]]])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  #correlation bw imperviousness and demersal OTU richness
				summary(lm(specnumber(cofactor.df[,master$OTU_name_swarm[master$OTU_taxon_family%in%eDNANatHist$Taxon[eDNANatHist$Category.demersal ==0]]])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))  #correlation bw imperviousness and non-demersal OTU richness


########################################################################
#####FOR SUPPLEMENT############
########################################################################
	#with only top 100 OTUs
	summary(lm(specnumber(temp[,names(head(sort(colSums(temp[,dna_index]), decreasing=T), 100))])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))
	
	#with only bottom 500 OTUs
	summary(lm(specnumber(temp[,names(head(sort(colSums(temp[,dna_index]), decreasing=F), 500))])~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))
	
	#with raw data (no rarefaction or decontamination)
	z=t(OTUs[, PSeelgrassSites_tags]) ; row.names(z)=PSeelgrassSites$sample_name
					row.names(z)=gsub("1A","_1", row.names(z))
					row.names(z)=gsub("2A","_2", row.names(z))
					row.names(z)=gsub("3A","_3", row.names(z))
					row.names(z)=gsub("3B","_3", row.names(z))
	z1=aggregate(z, list(row.names(z)), mean) ; names(z1)[1]="Sample.ID"
	z2=merge(z1, cofactor.df[,c(2,1737)], "Sample.ID")
	summary(lm(specnumber(z2[,2:6198])~z2[,6199]))

	##plot for supplement; data subsets and superset
#pdf("/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/Figures/Scatter3x1_Supplement_dataSubsets.pdf", height=7 , width=5)
	par(mfrow=c(3,1))
	
		head.df=data.frame(specnumber(temp[,names(head(sort(colSums(temp[,dna_index]), decreasing=T), 100))])); names(head.df)[1]="richness"
		head.means=aggregate(head.df$richness, list(substr(row.names(head.df),1,3)), mean)[,2]
	plot(head.df$richness~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness, pch=19, xlab="Watershed Imperviousness (Percent)", ylab="OTU Richness", main="100 Most Common OTUs" , col=gg_colors(2)[2], cex=0.6)
	points(head.means ~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness), pch=19, cex=1.5, col=gg_colors(2)[2])
		mod1=lm(head.means ~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))
		abline(mod1, col=gg_colors(1))
		text(40, 20, paste0("R^2=",round(summary(mod1)$r.squared,3)," \np=",round(summary(mod1)$coefficients[8], 3)))
	
		tail.df=data.frame(specnumber(temp[,names(head(sort(colSums(temp[,dna_index]), decreasing=F), 500))])) ; names(tail.df)[1]="richness"
		tail.means=aggregate(tail.df$richness, list(substr(row.names(tail.df),1,3)), mean)[,2]
	plot(tail.df$richness~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness, pch=19, xlab="Watershed Imperviousness (Percent)", ylab="OTU Richness", main="500 Least Common OTUs", col=gg_colors(2)[2], cex=0.6)
	points(tail.means ~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness), pch=19, cex=1.5, col=gg_colors(2)[2])
		mod2=lm(tail.means ~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))
		abline(mod2, col=gg_colors(1))
		text(40, 20, paste0("R^2=",round(summary(mod2)$r.squared,3)," \np=",round(summary(mod2)$coefficients[8], 4)))
	
		all.df=data.frame(specnumber(z1[,2:6198])) ; names(all.df)[1]="richness"
		all.means=aggregate(all.df $richness, list(substr(row.names(tail.df),1,3)), mean)[,2]
	plot(all.df$richness~cofactor.df$Area.Weighted.Mean.Percent.Imperviousness, pch=19, xlab="Watershed Imperviousness (Percent)", ylab="OTU Richness", main="Raw OTU Data (N = 6198) \nWithout Normalization or Decontamination", col=gg_colors(2)[2], cex=0.6)
	points(all.means~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness), pch=19, cex=1.5, col=gg_colors(2)[2])
		mod3=lm(all.means~unique(cofactor.df$Area.Weighted.Mean.Percent.Imperviousness))
		abline(mod3, col=gg_colors(1))
		text(40, 150, paste0("R^2=",round(summary(mod3)$r.squared,3)," \np=",round(summary(mod3)$coefficients[8], 4)))
#	dev.off()

	
#####taxon stacked area chart 
		
				#create df for plotting with a subset of data, picking taxon of interest
			forplot=merge(cofactor.df[,c(2, 1742, 1737)], classes1, "Sample.ID") ; row.names(forplot)=row.names(classes1) ; forplot=forplot[,-1]
					forplotMax=aggregate(forplot, list(substr(row.names(forplot),1,3)), max)
						row.names(forplotMax)= forplotMax[,1] ; forplotMax = forplotMax[,-1]
					forplotMax = forplotMax[order(forplotMax$Area.Weighted.Mean.Percent.Imperviousness),]
		#####FIGURES####
#			pdf("/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/Figures/SecondSubmission/SupplFig_AreaStackedImperviousness.pdf" , width=9.6, height=5.9)
			plot.stacked(forplotMax $Area.Weighted.Mean.Percent.Imperviousness, forplotMax[,3:ncol(forplotMax)][,which(colSums(forplotMax[,3:ncol(forplotMax)])>20)], col= gg_colors(length(which(colSums(forplotMax[,3:ncol(forplotMax)])>20))), ylab="OTU Richness by Class", xlab="Imperviousness (Percent)", main="eDNA OTU Richness by Class")
				abline(v=forplotMax $Area.Weighted.Mean.Percent.Imperviousness, col="lightgrey")
				legend(min(forplotMax $Area.Weighted.Mean.Percent.Imperviousness),350, names(which(colSums(forplotMax[,3:ncol(forplotMax)])>20)), fill= gg_colors(length(which(colSums(forplotMax[,3:ncol(forplotMax)])>20))), cex=0.5, ncol=2, bg="white")
#			dev.off()
			
			forplotMax = forplotMax[order(forplotMax$Population.density),]
#			pdf("/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/Figures/StackedArea_humanPop_w_vLines.pdf" , width=9.6, height=5.9)
			plot.stacked(forplotMax $Population.density, forplotMax[,3:ncol(forplotMax)][,which(colSums(forplotMax[,3:ncol(forplotMax)])>20)], col= gg_colors(length(which(colSums(forplotMax[,3:ncol(forplotMax)])>20))), ylab="OTU Richness by Class", xlab="Human Population Density", main="eDNA OTU Richness by Class")
				abline(v=forplotMax$Population.density, col='lightgrey')
				legend(min(forplotMax $Population.density),350, names(which(colSums(forplotMax[,3:ncol(forplotMax)])>20)), fill= gg_colors(length(which(colSums(forplotMax[,3:ncol(forplotMax)])>20))), cex=0.5, ncol=2, bg="white")
#			dev.off()


####FIGURE -- multipanel trends; N families per class; POISSON regression
			focalClasses=names(table(fams2classes)[table(fams2classes)>5])
	#pdf("/Users/rpk/GoogleDrive/Kelly_Lab/Projects/PS_urban_eDNA/Figures/FIGURE_NFamilies_Class_20160204.pdf", width=6, height=7)			
			par(mfrow=c(4,2))
			par(mar=c(3,4,2,2))
			par(oma=c(2,2,1,2))
			par(mgp=c(2,1,0))
			#mean richness by imperv
			for (i in 1:(length(focalClasses)-2)){
					df=data.frame(specnumber(fams_temp[, which(fams2classes==focalClasses[i])+1792]), cofactor.df $Area.Weighted.Mean.Percent.Imperviousness) ; names(df)=c("Richness", "Imperv")
					meanRichness=aggregate(df$Richness, list(df$Imperv), mean) ; meanRichness= meanRichness[match(unique(df$Imperv), meanRichness[,1]),2]
					plot(Richness~Imperv, data=df, main= focalClasses[i], col=gg_colors(2)[2], pch=19, ylab="Number of Familes", xlab="", cex=0.6)
					points(meanRichness~unique(df$Imperv), col=gg_colors(2)[2], pch=19, cex=1.5 )
						mod=glm(round(meanRichness)~unique(df$Imperv), data=df, family='quasipoisson')
						if(summary(mod)$coefficients[8]<0.05) 
							lines(sort(data.frame(unique(df$Imperv),predict.glm(mod, type='response'))), col=gg_colors(1))
						if(summary(mod)$coefficients[8]>0.05) 
							lines(sort(data.frame(unique(df$Imperv),predict.glm(mod, type='response'))), col='lightgrey', lty=2)
				}
			for (i in 6:7){
					df=data.frame(specnumber(fams_temp[, which(fams2classes==focalClasses[i])+ 1792]), cofactor.df $Area.Weighted.Mean.Percent.Imperviousness) ; names(df)=c("Richness", "Imperv")
					meanRichness=aggregate(df$Richness, list(df$Imperv), mean) ; meanRichness= meanRichness[match(unique(df$Imperv), meanRichness[,1]),2]
					plot(Richness~Imperv, data=df, main= focalClasses[i], col=gg_colors(2)[2], pch=19, ylab="Number of Familes", xlab="Impervious Surface Cover", cex=0.6)
					points(meanRichness~unique(df$Imperv), col=gg_colors(2)[2], pch=19, cex=1.5)
						mod=glm(round(meanRichness)~unique(df$Imperv), data=df, family='quasipoisson')
						if(summary(mod)$coefficients[8]<0.05) 
							lines(sort(data.frame(unique(df$Imperv),predict.glm(mod, type='response'))), col=gg_colors(1))
						if(summary(mod)$coefficients[8]>0.05) 
							lines(sort(data.frame(unique(df$Imperv),predict.glm(mod, type='response'))), col='lightgrey', lty=2)
				}
	#dev.off()