library(tidyverse)
library(dplyr)
library(coin)

feet <- read_csv('feetKA.csv')

# Select Magellanic penguin adults.

feetPema <- subset(feet,Species=="PEMA" & AgeCode==2)

table(feetPema$nFootRecsAdult2)
#   1   2   3   4   5   6 
# 332 242 105  76  10  12

table(feetPema$Sex)
#  F   M 
# 84 619 
# sex unknown = 74

table(feetPema$AgeClass)
#   2   3   4 
# 194 427 156 

########################################################
####### Do this whole thing 1000 times #########
########################################################

chi2tab <- matrix(data=NA,nrow=1000,ncol=3)

for (i in 1:1000) {
# Divide into dataframes with 1, 2, 3, 4, 5, & 6 rows per
# penguin.

feet1rec <- subset(feetPema,nFootRecsAdult2 == 1)
feet2rec <- subset(feetPema,nFootRecsAdult2 == 2)
feet3rec <- subset(feetPema,nFootRecsAdult2 == 3)
feet4rec <- subset(feetPema,nFootRecsAdult2 == 4)
feet5rec <- subset(feetPema,nFootRecsAdult2 == 5)
feet6rec <- subset(feetPema,nFootRecsAdult2 == 6)

# Select 1 row at random for each penguin, so data are
# independent both within and between the datasets.

# Penguins with 2 adult foot-color records (121 penguins): 
# sample(1:2,121,...) randomly selects 1 or 2 121 times.
# Create a vector of numbers 1 to 121 (number the penguins
# from 1 to 121).
# Multiply the row numbers for the 2s by 2. 
# Multiply the row numbers for the 1s by 2 & subtract 1.
# This gives the subscripts of the rows that will be 
# kept (selected).

seq2 <- sample(1:2,121,replace=TRUE)
seq2a <- c(seq(1,121))
seq2b <- c(2*seq2a[seq2==2])
seq2c <- c(2*seq2a[seq2==1]-1)
feet2rec <- feet2rec[c(seq2b,seq2c),]

# Penguins with 3 adult foot-color records (35): 

seq3 <- sample(1:3,35,replace=TRUE)
seq3a <- c(seq(1,35))
seq3b <- c(3*seq3a[seq3==3])
seq3c <- c(3*seq3a[seq3==2]-1)
seq3d <- c(3*seq3a[seq3==1]-2)
feet3rec <- feet3rec[c(seq3b,seq3c,seq3d),]

# Penguins with 4 adult foot-color records (19): 

seq4 <- sample(1:4,19,replace=TRUE)
seq4a <- c(seq(1,19))
seq4b <- c(4*seq4a[seq4==4])
seq4c <- c(4*seq4a[seq4==3]-1)
seq4d <- c(4*seq4a[seq4==2]-2)
seq4e <- c(4*seq4a[seq4==1]-3)
feet4rec <- feet4rec[c(seq4b,seq4c,seq4d,seq4e),]

# Penguins with 5 adult foot-color records (2):
seq5 <- sample(1:5,2,replace=TRUE)
seq5a <- c(seq(1,2))
seq5b <- c(5*seq5a[seq5==5])
seq5c <- c(5*seq5a[seq5==4]-1)
seq5d <- c(5*seq5a[seq5==3]-2)
seq5e <- c(5*seq5a[seq5==2]-3)
seq5f <- c(5*seq5a[seq5==1]-4)
feet5rec <- feet5rec[c(seq5b,seq5c,seq5d,seq5e,seq5f),]

# Penguins with 6 adult foot-color records (2):
seq6 <- sample(1:6,2,replace=TRUE)
seq6a <- c(seq(1,2))
seq6b <- c(6*seq6a[seq6==6])
seq6c <- c(6*seq6a[seq6==5]-1)
seq6d <- c(6*seq6a[seq6==4]-2)
seq6e <- c(6*seq6a[seq6==3]-3)
seq6f <- c(6*seq6a[seq6==2]-4)
seq6g <- c(6*seq6a[seq6==1]-5)
feet6rec <- feet6rec[c(seq6b,seq6c,seq6d,seq6e,
                       seq6f,seq6g),]

# Put the dataframes back together.

feetComb <- rbind(feet1rec,feet2rec,feet3rec,feet4rec,
                 feet5rec,feet6rec)

# 341 Uniform random numbers from 1 to 511.
#    2/3 for model, 1/3 to test.
#    ModTest 1 = test dataset
#    ModTest 0 = model dataset

seq1 <- sample(1:511, 341, replace=FALSE)
feetComb$ModTest <- 1
feetComb$ModTest[seq1] <- 0

# Split into 2 tables: model (2/3) & test (1/3).

feetMod <- subset(feetComb,ModTest==0)
feetTest <- subset(feetComb,ModTest==1)

############### Model data set ##################

mFootColor <- table(feetMod$FootColor4)
mAgeClass <- table(feetMod$AgeClass)
mAgeColor <- table(feetMod$AgeClass,feetMod$FootColor4)

# Calculate % of age classes for each foot color.

mAgeColPct <- mAgeColor
mAgeColPct[1,] <- mAgeColPct[1,] / mFootColor
mAgeColPct[2,] <- mAgeColPct[2,] / mFootColor
mAgeColPct[3,] <- mAgeColPct[3,] / mFootColor

############### Test data set ###################

tFootColor <- table(feetTest$FootColor4)
tAgeClass <- table(feetTest$AgeClass)
tAgeColor <- table(feetTest$AgeClass,feetTest$FootColor4)

# Sometimes there are no WHI feet in 1 or both data sets.
# Check sizes of tables.
# nrow gives the number of columns (foot colors).

ntColors <- nrow(tFootColor)
nmColors <- nrow(mFootColor)

# Apply percentages to foot colors = predicted.
# If one table is missing WHI, only use the other 3
# columns.

pAgeColPct <- mAgeColPct

if (ntColors == 4 & nmColors == 4) {
  pAgeColPct[1,] <- tFootColor * pAgeColPct[1,]
  pAgeColPct[2,] <- tFootColor * pAgeColPct[2,]
  pAgeColPct[3,] <- tFootColor * pAgeColPct[3,]
} else if (ntColors == 3 & nmColors == 4) {
  pAgeColPct[1,1:3] <- tFootColor * pAgeColPct[1,1:3]
  pAgeColPct[2,1:3] <- tFootColor * pAgeColPct[2,1:3]
  pAgeColPct[3,1:3] <- tFootColor * pAgeColPct[3,1:3]
  pAgeColPct[,4] <- 0
} else if (ntColors == 4 & nmColors == 3) {
  pAgeColPct[1,] <- tFootColor[1:3] * pAgeColPct[1,]
  pAgeColPct[2,] <- tFootColor[1:3] * pAgeColPct[2,]
  pAgeColPct[3,] <- tFootColor[1:3] * pAgeColPct[3,]
} else {
  pAgeColPct[1,] <- tFootColor * pAgeColPct[1,]
  pAgeColPct[2,] <- tFootColor * pAgeColPct[2,]
  pAgeColPct[3,] <- tFootColor * pAgeColPct[3,]
}

############## Compare model & test ################
# We want to know if the predicted distribution of 
# young, middle-aged, and old birds in the test data
# set differs from the actual age distribution.

x=matrix(data=c(tAgeClass,sum(pAgeColPct[1,]),
                sum(pAgeColPct[2,]),sum(pAgeColPct[3,])),
         nrow=3,ncol=2)

chi2x <- chisq.test(x)
chi2tab[i,1] <- chi2x$statistic
chi2tab[i,2] <- chi2x$parameter
chi2tab[i,3] <- chi2x$p.value

}

nLE05 <- length(chi2tab[chi2tab[,3]<=0.05,3])
# 11 out of 1000 runs produced p <= 0.05 (1.1%)
# 989 runs p > 0.05.

summary(chi2tab[,3])
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
# 0.006762 0.434890 0.638285 0.616438 0.822019 0.999501 

chi2tab <- as.data.frame(chi2tab) %>%
  rename(chi2Stat=V1,df=V2,pVal=V3)

# Save the results.
write.csv(chi2tab,'chi2tab.csv')

# Histogram of chi2 p values.
ggplot() +
  geom_histogram(aes(x=chi2tab[, 3]),binwidth = 0.05,
                 center=0.025,lwd=1,
                 color="black",fill="white") +
  geom_vline(xintercept=0.05,linetype="dashed",lwd=1) +
  theme_classic() + 
  theme(panel.grid.major=element_blank(), 
        panel.grid.minor=element_blank(),
        text=element_text(size=48)) +
  scale_y_continuous(limits=c(0,85),expand=c(0,0)) +
  annotate("text",x=0.07,y=60,label="0.05",size=12,
           angle=-90) +
  ylab("Number of iterations") +
  xlab(expression(italic(chi)^2~ p-value))

###############################################################
# Read saved Chi2 table back in. It has row numbers, which
# gets read in as X1, the 1st column. Make it a dataframe
# and drop the 1st column.
# Then run the above ggplot().

chi2tab <- read_csv('c:/users/ginger/papers/feet/data/chi2tab.csv')
chi2tab <- as.data.frame(chi2tab[,2:4])

dev.print(png,
          file="c:/users/ginger/papers/feet/figures/Figure6.png",
          width=6000,height=4500,units='px',res=300)
dev.off()