library(tidyverse) library(dplyr) library(coin) feet <- read_csv('feetKA.csv') # Select Magellanic penguin adults. feetPema <- subset(feet,Species=="PEMA" & AgeCode==2) table(feetPema$nFootRecsAdult2) # 1 2 3 4 5 6 # 332 242 105 76 10 12 table(feetPema$Sex) # F M # 84 619 # sex unknown = 74 table(feetPema$AgeClass) # 2 3 4 # 194 427 156 ######################################################## ####### Do this whole thing 1000 times ######### ######################################################## chi2tab <- matrix(data=NA,nrow=1000,ncol=3) for (i in 1:1000) { # Divide into dataframes with 1, 2, 3, 4, 5, & 6 rows per # penguin. feet1rec <- subset(feetPema,nFootRecsAdult2 == 1) feet2rec <- subset(feetPema,nFootRecsAdult2 == 2) feet3rec <- subset(feetPema,nFootRecsAdult2 == 3) feet4rec <- subset(feetPema,nFootRecsAdult2 == 4) feet5rec <- subset(feetPema,nFootRecsAdult2 == 5) feet6rec <- subset(feetPema,nFootRecsAdult2 == 6) # Select 1 row at random for each penguin, so data are # independent both within and between the datasets. # Penguins with 2 adult foot-color records (121 penguins): # sample(1:2,121,...) randomly selects 1 or 2 121 times. # Create a vector of numbers 1 to 121 (number the penguins # from 1 to 121). # Multiply the row numbers for the 2s by 2. # Multiply the row numbers for the 1s by 2 & subtract 1. # This gives the subscripts of the rows that will be # kept (selected). seq2 <- sample(1:2,121,replace=TRUE) seq2a <- c(seq(1,121)) seq2b <- c(2*seq2a[seq2==2]) seq2c <- c(2*seq2a[seq2==1]-1) feet2rec <- feet2rec[c(seq2b,seq2c),] # Penguins with 3 adult foot-color records (35): seq3 <- sample(1:3,35,replace=TRUE) seq3a <- c(seq(1,35)) seq3b <- c(3*seq3a[seq3==3]) seq3c <- c(3*seq3a[seq3==2]-1) seq3d <- c(3*seq3a[seq3==1]-2) feet3rec <- feet3rec[c(seq3b,seq3c,seq3d),] # Penguins with 4 adult foot-color records (19): seq4 <- sample(1:4,19,replace=TRUE) seq4a <- c(seq(1,19)) seq4b <- c(4*seq4a[seq4==4]) seq4c <- c(4*seq4a[seq4==3]-1) seq4d <- c(4*seq4a[seq4==2]-2) seq4e <- c(4*seq4a[seq4==1]-3) feet4rec <- feet4rec[c(seq4b,seq4c,seq4d,seq4e),] # Penguins with 5 adult foot-color records (2): seq5 <- sample(1:5,2,replace=TRUE) seq5a <- c(seq(1,2)) seq5b <- c(5*seq5a[seq5==5]) seq5c <- c(5*seq5a[seq5==4]-1) seq5d <- c(5*seq5a[seq5==3]-2) seq5e <- c(5*seq5a[seq5==2]-3) seq5f <- c(5*seq5a[seq5==1]-4) feet5rec <- feet5rec[c(seq5b,seq5c,seq5d,seq5e,seq5f),] # Penguins with 6 adult foot-color records (2): seq6 <- sample(1:6,2,replace=TRUE) seq6a <- c(seq(1,2)) seq6b <- c(6*seq6a[seq6==6]) seq6c <- c(6*seq6a[seq6==5]-1) seq6d <- c(6*seq6a[seq6==4]-2) seq6e <- c(6*seq6a[seq6==3]-3) seq6f <- c(6*seq6a[seq6==2]-4) seq6g <- c(6*seq6a[seq6==1]-5) feet6rec <- feet6rec[c(seq6b,seq6c,seq6d,seq6e, seq6f,seq6g),] # Put the dataframes back together. feetComb <- rbind(feet1rec,feet2rec,feet3rec,feet4rec, feet5rec,feet6rec) # 341 Uniform random numbers from 1 to 511. # 2/3 for model, 1/3 to test. # ModTest 1 = test dataset # ModTest 0 = model dataset seq1 <- sample(1:511, 341, replace=FALSE) feetComb$ModTest <- 1 feetComb$ModTest[seq1] <- 0 # Split into 2 tables: model (2/3) & test (1/3). feetMod <- subset(feetComb,ModTest==0) feetTest <- subset(feetComb,ModTest==1) ############### Model data set ################## mFootColor <- table(feetMod$FootColor4) mAgeClass <- table(feetMod$AgeClass) mAgeColor <- table(feetMod$AgeClass,feetMod$FootColor4) # Calculate % of age classes for each foot color. mAgeColPct <- mAgeColor mAgeColPct[1,] <- mAgeColPct[1,] / mFootColor mAgeColPct[2,] <- mAgeColPct[2,] / mFootColor mAgeColPct[3,] <- mAgeColPct[3,] / mFootColor ############### Test data set ################### tFootColor <- table(feetTest$FootColor4) tAgeClass <- table(feetTest$AgeClass) tAgeColor <- table(feetTest$AgeClass,feetTest$FootColor4) # Sometimes there are no WHI feet in 1 or both data sets. # Check sizes of tables. # nrow gives the number of columns (foot colors). ntColors <- nrow(tFootColor) nmColors <- nrow(mFootColor) # Apply percentages to foot colors = predicted. # If one table is missing WHI, only use the other 3 # columns. pAgeColPct <- mAgeColPct if (ntColors == 4 & nmColors == 4) { pAgeColPct[1,] <- tFootColor * pAgeColPct[1,] pAgeColPct[2,] <- tFootColor * pAgeColPct[2,] pAgeColPct[3,] <- tFootColor * pAgeColPct[3,] } else if (ntColors == 3 & nmColors == 4) { pAgeColPct[1,1:3] <- tFootColor * pAgeColPct[1,1:3] pAgeColPct[2,1:3] <- tFootColor * pAgeColPct[2,1:3] pAgeColPct[3,1:3] <- tFootColor * pAgeColPct[3,1:3] pAgeColPct[,4] <- 0 } else if (ntColors == 4 & nmColors == 3) { pAgeColPct[1,] <- tFootColor[1:3] * pAgeColPct[1,] pAgeColPct[2,] <- tFootColor[1:3] * pAgeColPct[2,] pAgeColPct[3,] <- tFootColor[1:3] * pAgeColPct[3,] } else { pAgeColPct[1,] <- tFootColor * pAgeColPct[1,] pAgeColPct[2,] <- tFootColor * pAgeColPct[2,] pAgeColPct[3,] <- tFootColor * pAgeColPct[3,] } ############## Compare model & test ################ # We want to know if the predicted distribution of # young, middle-aged, and old birds in the test data # set differs from the actual age distribution. x=matrix(data=c(tAgeClass,sum(pAgeColPct[1,]), sum(pAgeColPct[2,]),sum(pAgeColPct[3,])), nrow=3,ncol=2) chi2x <- chisq.test(x) chi2tab[i,1] <- chi2x$statistic chi2tab[i,2] <- chi2x$parameter chi2tab[i,3] <- chi2x$p.value } nLE05 <- length(chi2tab[chi2tab[,3]<=0.05,3]) # 11 out of 1000 runs produced p <= 0.05 (1.1%) # 989 runs p > 0.05. summary(chi2tab[,3]) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.006762 0.434890 0.638285 0.616438 0.822019 0.999501 chi2tab <- as.data.frame(chi2tab) %>% rename(chi2Stat=V1,df=V2,pVal=V3) # Save the results. write.csv(chi2tab,'chi2tab.csv') # Histogram of chi2 p values. ggplot() + geom_histogram(aes(x=chi2tab[, 3]),binwidth = 0.05, center=0.025,lwd=1, color="black",fill="white") + geom_vline(xintercept=0.05,linetype="dashed",lwd=1) + theme_classic() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), text=element_text(size=48)) + scale_y_continuous(limits=c(0,85),expand=c(0,0)) + annotate("text",x=0.07,y=60,label="0.05",size=12, angle=-90) + ylab("Number of iterations") + xlab(expression(italic(chi)^2~ p-value)) ############################################################### # Read saved Chi2 table back in. It has row numbers, which # gets read in as X1, the 1st column. Make it a dataframe # and drop the 1st column. # Then run the above ggplot(). chi2tab <- read_csv('c:/users/ginger/papers/feet/data/chi2tab.csv') chi2tab <- as.data.frame(chi2tab[,2:4]) dev.print(png, file="c:/users/ginger/papers/feet/figures/Figure6.png", width=6000,height=4500,units='px',res=300) dev.off()