########################################################
#1) Change in colony strength and bee diseases over time?

mod2 <- lm(Adult ~ Colony + INTSeason , data = diseases)
mod3 <- lm(Adult ~ Colony + INTSeason + Farm:Colony, data = diseases)

#repeat linear regression analysis steps 2-4 for more appropriate model, then 

summary(mod2) 
#or
summary(mod3)

#different way to look at data over time individually 

install.packages("ggpubr")

# Visualize the data: colony strength over the season
# Plot weight by group and color by group

library("ggpubr")
library(ggplot2)

#This is for the before and after comparisons of adult frames 2019

g<- ggboxplot(diseases, x = "Bloom", y = "Adult", 
          color = "Bloom", palette = c("#0C4893", "#0C4893"),
          ylab = "Colony Strength (Adult Frames)", xlab = "Bloom")
g+geom_dotplot(binaxis='Adult', stackdir='center', dotsize=1)+ theme_light()+theme(legend.position = "none", axis.text=element_text(size=24, family = "Times"),
        axis.title.x=element_text(size=28,face="bold", family= "Times"), axis.title.y=element_text(size=28,face="bold", family= "Times"))+ geom_hline(yintercept=6, linetype='dotted', col = 'black')

##2020 diseases

h <- ggboxplot(bees2, x = "Season", y = "Adult", 
              color = "Season", palette = c("#A4CFF7", "#0C4893"),
              ylab = "Colony strength (adult frames)", xlab = "2020 Season")
h+theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14,face="bold"))

 
# Wilcoxon two sample test for colony strength different beginning and end of season 

res <- wilcox.test(Adult ~ Season, data = diseases,
                   exact = FALSE)
res

res1 <- wilcox.test(Adult ~ Season, data = strength,
                    exact = FALSE, alternative = "less")
res1

res2 <- wilcox.test(Adult ~ Season, data = strength,
                    exact = FALSE, alternative = "greater")
res2

# Looks like we fail to reject the null in all three cases, so adult strength didn't change

# change numeric columns to factors with value labels 
diseases$INTSeason <- factor(diseases$INTSeason,levels=c(0,1),
                             labels=c("Early","Late")) 
diseases$EFB.Severity <- factor(diseases$EFB.Severity,levels=c(0,1,2,3),
                                labels=c(0,1,2,3)) 

##Are EFB ratings higher at the end of the season?

res3 <- wilcox.test(EFB.Severity ~ Season, data = diseases,
                    exact = FALSE, alternative = "less")

res3

res2020 <- wilcox.test(EFB.Severity ~ Season, data = diseasesplus2020,
                    exact = FALSE, alternative = "less")

res2020

res4 <- wilcox.test(Diseases ~ Season, data = diseases,
                    exact = FALSE, alternative = "less")

res4

# Results = higher EFB ratings at the end of the season


# Graphs for disease data

library(ggplot2)

##Graph showing number of diseases before and after pollination

b <- ggplot(diseases, aes(x=Bloom, y=Diseases, fill=Farm)) + 
  geom_boxplot() +
  geom_dotplot(binaxis='y', binwidth=.1, stackdir='center', dotsize=.4, position=position_dodge(.75))+
  labs(x ="2019 Bloom ", y = "Number of diseases (per colony)")

b + scale_fill_brewer(palette="Blues") + theme_classic() + 
  theme(axis.text=element_text(size=12), axis.title=element_text(size=14,face="bold"))


##EFB severity graphs


#EFB levels, early and late, 2019
d <- ggplot(diseases, aes(x=Bloom, y=EFB.Severity, fill=Farm)) + 
  geom_bar(mapping=aes(y=EFB.Severity, x=Bloom), stat = "identity", position="dodge") +
  geom_dotplot(binaxis='y', stackdir='center', binpositions="all", dotsize=.4, binwidth = 0.1, position=position_dodge(0.9))+
  labs(x="Bloom", y = "EFB Severity Rating (Per Colony)")
d1<- d + scale_fill_brewer(palette="Blues") + theme_classic()+ 
  theme(axis.text.y=element_text(size=30, family = "Times"), axis.text.x=element_text(size=30, family = "Times"), axis.title.y=element_text(size=30,face="bold", family = "Times"), axis.title.x=element_text(size=30,face="bold", family = "Times"), legend.text=element_text(size=20, family = "Times"), legend.title=element_text(size=20, face= "bold", family = "Times"))+ geom_text(x="Early", y=3, label="2019", family = "Times New Roman", size= 16)
d1

#2020 EFB severity
library(plyr)
revalue(bees2$Season, c("Beginning"="Early", "End"="Late"))->bees2$Season
d2020 <- ggplot(bees2, aes(x=Season, y=EFB.Severity, fill=Farm)) + 
  geom_bar(mapping=aes(y=EFB.Severity, x=Season), stat = "identity", position="dodge") +
  geom_dotplot(binaxis='y', stackdir='center', binpositions="all", dotsize=.4, binwidth = 0.1, position=position_dodge(0.9))+
  labs(x="Bloom", y="  ")
d12020 <- d2020 + scale_fill_brewer(palette="Blues") + theme_classic()+theme(legend.position = "none")+
  theme(axis.text.y=element_text(size=30, family = "Times"), axis.text.x=element_text(size=30, family = "Times"), axis.title=element_text(size=30,face="bold", family = "Times"),legend.text=element_text(size=20, family = "Times"), legend.title=element_text(size=20, face= "bold", family = "Times"))+ geom_text(x="Early", y=3, label="2020", family = "Times New Roman", size= 16)
d12020

##Combine both EFB graphs side by side
require(gridExtra)
grid.arrange(d1, d12020, ncol=2)

######

#Graphs using frames converted into sq inches

library(ggplot2)
# Simple scatter plot
sp <- ggplot(data=bees, aes(x=Colony, y=Adult.Area)) + geom_point()

# Change line type and color
sp + geom_hline(yintercept=2700, linetype="dashed", color = "orange")


sp1 <- ggplot(data=bees, aes(x=Colony, y=Brood.Area)) + geom_point()

# Change line type and color
sp1 + geom_hline(yintercept=1620, linetype="dashed", color = "orange")

#t test
t.test(bees$Brood.Area, mu = 1620, alternative = "less", conf.level = .95)


####################################################

# 2) Do variety and/or colony strength improve yield?


# Loading packages and functions

library (ggplot2)
library(jtools)
library(plyr)
library(reshape2)
library(corrplot)
library(beeswarm)
library(PerformanceAnalytics)


#####THIS IS FOR 2019
#
#
#
#

### Load in datasets (for andony)
#berry.data has yield data
berry.data <- read.csv("/Users/andonymelathopoulos/Desktop/berrydata.CSV",header=TRUE)
#visitation.data has records of bee visitation to bushes within fields - a measure of pollinator activity on bushes
visitation.data <- read.csv("/Users/andonymelathopoulos/Desktop/visitation.CSV",header=TRUE)
#apiary data tells how strong colonies were at the different locations
apiary.data <- read.csv("/Users/andonymelathopoulos/Desktop/apiary.CSV",header=TRUE)


###(for kennedy)
##I noticed PanAm is spelled differently in the apiary data from other datasets, so I am going to correct it so the values are consistent with the other dataframes
revalue(newflight$Farm, c("Pan Am"="PanAm"))->newflight$Farm
## Another glitch is in how Liberty is spelled in visitation vs apiary data
revalue(visitation$Variety, c("Liberty "="Liberty"))->visitation$Variety
## Also Eden in berry data has an extra space
revalue(berry$Farm, c("Eden "="Eden"))->berry$Farm


### Summarize data  - the experimental unit in this experiment is assumed to be the field (although
### in the future we may consider using some kind of nested design to capture within field variability 
### although on passing glance this variation seems small). So I want to get data summarized down using
### plyr and then I am going to attempt to merge the three datasets into one for analysis. That will be
### my next step after graphing some of the basic field level data. 

#kennedy: I have files named differently than yours, so that is reflected in this script. i have your original if needed
berry.summary<-ddply(berry, c("Farm","Variety"), summarise,
                     fset = mean(FRUITSET,na.rm=TRUE),
                     canes = mean(X..of.canes,na.rm=TRUE),
                     clstrcane = mean(AVG.clusters.per.cane,na.rm=TRUE),
                     bryclstr = mean(Avg..Berries.per.cluster,na.rm=TRUE),
                     brywt = mean(Avg.berry.weight..g.,na.rm=TRUE))

#Calculate the flowers per acre - because I am curious if some fields have a lot more flowers per acre for the bees
berry.summary$flwrs<-berry.summary$bryclstr*berry.summary$canes*berry.summary$clstrcane/(berry.summary$fset/100)

#For yield I used 2000 bushes per acre and I used a constant of 0.00220462 to convert from g/acre to lbs/acre
berry.summary$yield<-berry.summary$bryclstr*berry.summary$canes*berry.summary$clstrcane*berry.summary$brywt*2000*0.00220462
#let me clear some of the varriables out
berry.summary<-subset(berry.summary, select=c("Farm", "Variety", "fset", "flwrs", "brywt", "yield"))

#Let me move to the vistation data

visitation.summary<-ddply(visitation, c("Farm","Variety", "Crop.Stage"), summarise,
                          HBvisit = mean(Honey.Bee.Visits,na.rm=TRUE),
                          BBvisit = mean(Bumble.Bee.Visits,na.rm=TRUE))

#Next I am going to rotate the data frame so crop stage are turned into columns and make one for HB and one for BB


HBvisits<-dcast(visitation.summary, Farm + Variety ~ Crop.Stage, value.var = "HBvisit")
BBvisits<-dcast(visitation.summary, Farm + Variety ~ Crop.Stage, value.var = "BBvisit")

#Focus on early and full bloom because I assume these are most predictive. I will average across the two dates, but discard where no data was recorded
HBvisits$HBmean<-rowMeans(HBvisits[,c("First Bloom", "Full Bloom","Petal Fall")], na.rm=TRUE)
HBvisits<-subset(HBvisits, select=c("Farm", "Variety", "HBmean"))
BBvisits$BBmean<-rowMeans(BBvisits[,c("First Bloom", "Full Bloom", "Petal Fall")], na.rm=TRUE)
BBvisits<-subset(BBvisits, select=c("Farm", "Variety", "BBmean"))
#join BB and HB data
Bvisit<-merge(HBvisits, BBvisits, by=c("Farm","Variety"))

#moving onto the apiary strength

apiary.summary<-ddply(newflight, c("Farm","Variety"), summarise,
                      Flight = mean(Flight,na.rm=TRUE))

#now merge everything together
temp<-merge(Bvisit, apiary.summary, by=c("Farm","Variety"), all=TRUE)
alldata<-merge(temp, berry.summary, by=c("Farm","Variety"), all=TRUE)

write_csv(alldata, path = "alldata.csv")


### Lets examine some coarse relationships


## for sig tests and p values
resyield1 <- cor.mtest(newcorr2019, conf.level = .95)
resyield2 <- cor.mtest(mtcars, conf.level = .99)

N

#changes font in corrgram to times new roman
par(family="Times")

colnames(alldata.correlation) <-c("HB.visits","BB.visits","Flight","Fruit.set","Flowers","Berry.wt" ,"Yield" )


alldata <- rename(alldata,c('Berry.wt'='Berry.weight'))

##for 2019 correlogram
alldata.correlation<-alldata[, c("HBmean","BBmean","Flight","fset","flwrs","brywt" ,"yield" )]
newcorr2019 <- cor(alldata.correlation, use = "complete.obs", method=c("kendall"))

y <- corrplot.mixed(corr = newcorr2019, upper = "ellipse", lower = "number", tl.col = "black", tl.cex = 1.5, cl.cex = 1.3, number.cex=1.5)


require (GGally)
library (GGally)

#shows scatterplot and distribution for 2019 yield
ggpairs(alldata.correlation)

######################################
#
#
#
#
#

#THIS IS FOR 2020

### Load in datasets 
#berry.data has yield data
berry2020 <- read.csv("/Users/kennedygrant/Downloads/2020berry.CSV",header=TRUE)
#visitation.data has records of bee visitation to bushes within fields - a measure of pollinator activity on bushes
visitation2020 <- read.csv("/Users/kennedygrant/Downloads/2020visitation.CSV",header=TRUE)
#apiary data tells how strong colonies were at the different locations
apiary2020 <- read.csv("/Users/kennedygrant/Downloads/newflight2020.CSV",header=TRUE)


berry2020$AVG.clusters.per.cane <- as.numeric(as.character(berry2020$AVG.clusters.per.cane)) 

berry.summary2020 <-ddply(berry2020, c("Farm","Variety"), summarise,
                          fset = mean(FRUIT.SET,na.rm=TRUE),
                          canes = mean(X..of.canes,na.rm=TRUE),
                          clstrcane = mean(AVG.clusters.per.cane,na.rm=TRUE),
                          bryclstr = mean(Avg..Berries.per.cluster,na.rm=TRUE),
                          brywt = mean(Avg.berry.weight..g.,na.rm=TRUE))

#Calculate the flowers per acre - because I am curious if some fields have a lot more flowers per acre for the bees
berry.summary2020$flwrs<-berry.summary2020$bryclstr*berry.summary2020$canes*berry.summary2020$clstrcane/(berry.summary2020$fset/100)

#For yield I used 2000 bushes per acre and I used a constant of 0.00220462 to convert from g/acre to lbs/acre
berry.summary2020$yield<-berry.summary2020$bryclstr*berry.summary2020$canes*berry.summary2020$clstrcane*berry.summary2020$brywt*2000*0.00220462
#let me clear some of the variables out
berry.summary2020<-subset(berry.summary2020, select=c("Farm", "Variety", "fset", "flwrs", "brywt", "yield"))

#Let me move to the vistation data


visitation.summary2020<-ddply(visitation2020, c("Farm","Variety", "Crop.Stage"), summarise,
                              HBvisit = mean(Honey.Bee.Visits,na.rm=TRUE),
                              BBvisit = mean(Bumble.Bee.Visits,na.rm=TRUE))

#Next I am going to rotate the data frame so crop stage are turned into columns and make one for HB and one for BB


HBvisits<-dcast(visitation.summary2020, Farm + Variety ~ Crop.Stage, value.var = "HBvisit")
BBvisits<-dcast(visitation.summary2020, Farm + Variety ~ Crop.Stage, value.var = "BBvisit")

#Focus on early and full bloom because I assume these are most predictive. I will average across the two dates, but discard where no data was recorded
HBvisits$HBmean<-rowMeans(HBvisits[,c("First Bloom", "Full Bloom","Petal Fall")], na.rm=TRUE)
HBvisits<-subset(HBvisits, select=c("Farm", "Variety", "HBmean"))
BBvisits$BBmean<-rowMeans(BBvisits[,c("First Bloom", "Full Bloom", "Petal Fall")], na.rm=TRUE)
BBvisits<-subset(BBvisits, select=c("Farm", "Variety", "BBmean"))
#join BB and HB data
Bvisit<-merge(HBvisits, BBvisits, by=c("Farm","Variety"))

#moving onto the apiary strength


apiary.summary2020<-ddply(apiary2020, c("Farm","Variety"), summarise,
                          Flight = mean(Flight,na.rm=TRUE))

#now merge everything together
temp2020<-merge(Bvisit, apiary.summary2020, by=c("Farm","Variety"), all=TRUE)
alldata2020<-merge(temp2020, berry.summary2020, by=c("Farm","Variety"), all=TRUE)
write_csv(alldata2020, path = "alldata2020.csv")

### Lets examine some coarse relationships

##preliminary correlation chart
alldata2020.correlationnocl<-alldata2020[, c("HB.visits","BB.visits","Flight","Fruit.set","Flowers","Berry.wt" ,"Yield" )]
chart.Correlation(alldata2020.correlation, histogram=TRUE, pch=19)


#changes font in corrgram to times new roman
par(family="Times")

library(dplyr)

colnames(alldata2020.correlationnocl) <-c("HB.visits","BB.visits","Flight","Fruit.set","Flowers","Berry.wt" ,"Yield" )

#name of berry weight was too long
alldata2020.correlationnocl <- rename(alldata2020.correlationnocl,c('Berry.wt'='Berry.weight'))

# 2020 corrgram for yield
new2020yieldcorr <- cor(alldata2020.correlationnocl, use = "complete.obs", method=c("kendall")) # notice how we are ignoring missing data with the last argument
# plot the correlation matrix
x <- corrplot.mixed(corr = new2020yieldcorr, upper = "ellipse", lower = "number", tl.col = "black", tl.cex = 1.5, cl.cex = 1.3, number.cex=1.5)


par(mfrow=c(1,2))


corrplot.mixed(corr = newcorr2019, upper = "ellipse", lower = "number", tl.col = "black", tl.cex = 1.3, cl.cex = 1.3, number.cex=1.5)
corrplot.mixed(corr = new2020yieldcorr, upper = "ellipse", lower = "number", tl.col = "black", tl.cex = 1.3, cl.cex = 1.3, number.cex=1.5)

##################Plotting yield on flight

##Is variety or year necessary in yield graph?

lma <- lm(Yield ~ Flight + Variety + Year, data = alldatabothyears)
summary(lma)

####neither are significant

##this summary also gives quartiles for yield
lmaa <- lm(Flight ~ Yield, data = alldatabothyears)
summary(lmaa)


quantile(as.numeric(alldatabothyears$Flight), na.rm=TRUE)


  ##Yield linear graph, one line, both years, variety denoted in legend

ggplot(alldatabothyears, aes(x = Flight, y = Yield)) +
  geom_point(aes(colour = Variety, shape=Variety), size=4) +
  geom_smooth(method = "lm", col= "black")+labs( x="Incoming Flight (bees/minute)", y="Yield Estimate (kg/ha)")+ theme_light()+theme(text = element_text(family = "Times", size = 20), axis.text=element_text(size=18, family = "Times"), axis.title.x = element_text(size=20, face = "bold"), axis.title.y = element_text(size=20, face = "bold"), legend.title=element_text(face= "bold", size = 18))+ geom_vline(xintercept=38, linetype='dotted', col = 'black')+ scale_color_manual(values=c('#A4CFF7', '#25619A'))

# in 2019 there was a strong relationship with the number of flowers and yield. 
#but also the lack of a relationship between HBmeans and fset and yield (although a strong relationship between BBmean and fset)
#lets look at fset and 

apiary.yield.fit<-lm(brywt~Flight, data=alldata)
summ(apiary.yield.fit)

apiary.yield.fit<-lm(yield~Flight, data=alldata)
summ(apiary.yield.fit)

apiary.fset.fit<-lm(fset~Flight, data=alldata)
summ(apiary.fset.fit)

#lets look at the residuals
pred <- fitted(apiary.yield.fit)
res <- residuals(apiary.yield.fit)
plot(pred, res, 
     pch =  19  ,
     col = " orange  ",
     main = "Residual Plot: Residuals vs. Predicted Values",
     xlab = "Predicted Value", ylab = "Residuals")
abline(h=0)

qqnorm(residuals(apiary.yield.fit), 
       pch = 19  ,
       col = " orange ",
       main = "Normal Probability Plot of Residuals")
qqline(residuals(apiary.yield.fit))


#####################################

# 3) Can the adult population of bees be predicted by flight, cluster, QR/QLESS? 

# Step 1:

# merge two data frames by two factors (2019)

total <- merge(flight,bees,by=c("Colony","Bloom"))

pairs(~ Adult + Brood + Flight + Flight*Queen,
      data = total, pch = 19  , lower.panel = NULL,main    = "Colony strength")

cor(total$Brood, total$Adult)
cor(total$Flight, total$Adult)
cor(total$Queen, total$Adult)

## AIC for flight and temp


lm1 <- lm(Adult ~ Flight, data = total)

AIC(lm1)
stopifnot(all.equal(AIC(lm1),
                    AIC(logLik(lm1))))
BIC(lm1)

lm2 <- lm(Adult ~ Flight + Flight.Temp, data = total)

AIC(lm1, lm2)
BIC(lm1, lm2)

#temperature has a slightly better fit according to AIC

summary(lm1)
summary(lm2)
# summary indiciates that the addition of flight in lm2 has a p val of greater than .05, I chose not to use it for the residuals

####RESIDUALS 

# Step 2: 

pred <- fitted(lm1)
res <- residuals(lm1)

plot(pred, res, 
     pch =  19  ,
     col = " orange  ",
     main = "Residual Plot: Residuals vs. Predicted Values",
     xlab = "Predicted Value", ylab = "Residuals")
abline(h=0)

# Optional: normal qq plot 

qqnorm(residuals(lm1), 
       pch = 19  ,
       col = " orange ",
       main = "Normal Probability Plot of Residuals")
qqline(residuals(lm1))


## GRAPH: Adult and flight linear regression  with temperature noted on scatterplot

library(ggplot2)


##2019 version 
lm3 <- ggplot(total,aes(y=Adult,x=Flight,fill=Flight.Temp, color=Flight.Temp))
lm3 +geom_point()+stat_smooth(method="lm",se=TRUE)+labs(fill = "Temperature (F)", x="Incoming Flight(bees/minute)", y="Frames of Adult Bees", title = "2019")+theme(
  plot.title = element_text(size=16, face="bold"),
  axis.title.x = element_text(size=12),
  axis.title.y = element_text(size=12))

lm3 +geom_point()+stat_smooth(method="lm",se=TRUE)+labs(fill = "Temperature (F)", x="Incoming Flight (bees per minute)", y="Frames of Adult Bees", title = "2019")+theme(plot.title = element_text(size=20, face="bold"), axis.title.x = element_text(size=12),axis.title.y = element_text(size=12), axis.text = element_text(size = 20))+ scale_x_continuous(name="Incoming Flight (bees per minute)", limits=c(10, 146)) +
  scale_y_continuous(name="Frames of Adult Bees", limits=c(0, 20))

#Fun interactive multi line graph depending on temp

ggPredict(lm2,interactive=TRUE)

##2020 data

total2020 <- merge(flight2noav,bees2,by=c("Colony","Bloom"))

lm4 <- ggplot(total2020,aes(y=Adult,x=Flight..bees.per.minute.,fill=Flight.Temp, color=Flight.Temp))
lm4 +geom_point()+stat_smooth(method="lm",se=TRUE)+labs(fill = "Temperature (F)", x="Incoming Flight (bees per minute)", y="Frames of Adult Bees")+theme( axis.title.x = element_text(size=12),axis.title.y = element_text(size=12), axis.text = element_text(size = 20))

###Do you need year in flight graph?

bothyearflight$Year <- factor(bothyearflight$Year,levels=c(2019,2020),
                              labels=c("2019","2020")) 

lmb <- lm(Adult ~ Flight, data = bothyearflight)
summary(lmb)
lmc <- lm(Adult ~ Flight + Year, data = bothyearflight)
summary(lmc)
lmd <- lm(Adult ~ Flight + Flight.Temp, data = bothyearflight)
lme <- lm(Adult ~ Flight*Year, data = bothyearflight)
summary(lme)

# Compare to more complex models
anova(lmb, lmc)
anova(lmb, lmd)

###flight temp not stat significant, but year is

anova(lmb, lme)

###interaction between flight and year also significant

lmf <- lm(Adult ~ Flight + Year + Flight*Year + Flight.Temp, data = bothyearflight)
summary(lmf)

anova(lmb, lmf) 

###however, when all togehter, both are insignificant, and were thus removed

#change flight temp to metric
bothyearflight <- transform(bothyearflight, Flight.Temp.C = (Flight.Temp-32)*(9/5))


# Final model for both years

lmboth <- ggplot(bothyearflight,aes(y=Adult,x=Flight, fill = Flight.Temp.C))+geom_point(aes(color=Flight.Temp.C))
lmboth +guides(color=FALSE)+stat_smooth(method="lm",se=TRUE, col="black")+labs(fill = "Temperature (°C)", x="Incoming Flight (bees/minute)", y="Frames of Adult Bees")+ theme_light()+ theme(text = element_text(family = "Times", size = 20), axis.title.x = element_text(size=20, family="Times", face = "bold"),axis.title.y = element_text(size=20, family = "Times", face = "bold"), axis.text = element_text(size = 20, family = "Times"), legend.title=element_text(face= "bold", size = 16))+ geom_hline(yintercept=6, linetype='dotted', col = 'black')


##getting the lm equation
lmb <- lm(Adult ~ Flight, data = bothyearflight)
summary(lmb)

##dowloaded both csvs to combine them with a new column indicating year
#we ended up removing it from the graph but this is how i made the current csv titled bothyearflight

library("readr")

write_csv(total, path = "total.csv")
write_csv(total2020, path = "total2020.csv")

bothyearflight$Year <- factor(bothyearflight$Year,levels=c(2019,2020),
                             labels=c("2019","2020")) 

########### Preliminary and final correlation chart for 2019 colony strength info

# Merge two data frames by Colony and Season to match flight up with correct day/hive
total <- merge(flight, bees,by=c("Colony","Bloom"))

# Install PerformanceAnalytics:
install.packages("PerformanceAnalytics")

# Use chart.Correlation():
library("PerformanceAnalytics")
#Column numbers represent Adult, Brood, Cluster, and Flight

##shows distribution and linearity
#2019 data
my_data <- total[, c(9,15,16,17)]
chart.Correlation(my_data, histogram=TRUE, pch=19)

#2020 data
my_data2020 <- total2020[, c(10,14,15,16)]
chart.Correlation(my_data2020, histogram=TRUE, pch=19)

#yield 2019 data (retired)
table(Yield$all)
ls()
my_alldata <- all[, c(5,6,7,8)]
all$Yield <- as.numeric(as.character(all$Yield))

chart.Correlation(my_alldata, histogram=TRUE, pch=19)


###This is the more advanced corrgram script
###Final corrgram for 2019 strength data
##
#added change to method, now kendall

library(corrgram)
library(ellipse)
library(RColorBrewer)

col_order <- c("Adult", "Brood", "Cluster", "Flight")
my_data2 <- my_data[, col_order]
corr2019 <- cor(my_data2, use = "complete.obs", method=c("kendall"))
corrplot.mixed(corr = corr2019, upper = "ellipse", lower = "number", tl.col = "black", tl.cex = 1.5, cl.cex = 1.3, number.cex=1.5)

ggpairs(my_data2)
##checking assumptions for kendall
chart.Correlation(my_data2, histogram=TRUE, pch=19)

####other corrgram options, none used in final
##
#

R = cor(total[15,16,17,9])
round(R, 3)

my_data <- total[, c(15,16,17,9)]
res <- cor(my_data)
round(res, 2)

corrgram(my_data)
plotcorr(res, col = colorRampPalette(c("#E08214", "white", "#8073AC"))(10), type = "lower")

# Used to build a Pannel of 100 colors with Rcolor Brewer
my_colors <- brewer.pal(9, "Blues")
my_colors <- colorRampPalette(my_colors)(100)

# Order the correlation matrix
ord <- order(res[1, ])
data_ord <- res[ord, ord]
plotcorr(res , col=my_colors[data_ord*50+50] , mar=c(1,1,1,1), type = "lower"  )


install.packages("Hmisc")
##Use rcorr() function
install.packages("Hmisc")
library("Hmisc")

res2 <- rcorr(as.matrix(my_data))
res2

corrplot(res2$r, type="upper", order="hclust", 
         p.mat = res2$P, sig.level = 0.01, insig = "blank")

######################

####Don't need this anymore

##Used this to check out cluster vs adult for 2020 data to find anomalies
install.packages("ggrepel")
require(ggrepel)

p3 <- ggplot(bees2[which(bees2$Cluster>0),], aes(x=Adult, y=Cluster)) +
  geom_point() +
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE)
p3 +geom_text(aes(label=Colony), size=7, colour="blue")


p4<- ggplot(flightminusangelvalley, aes(x=Flight, y=Adult)) +
  geom_point() +
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE)
p4 +geom_text(aes(label=Colony), size=7, colour="blue")

lmtemp <- lm(Adult ~ Flight, data = flightminusangelvalley)
summary(lmtemp)