library(ape) library(microbenchmark) library(pbapply) library(pbmcapply) library(treedata.table) library(geiger) library(nlme) library(dplyr) library(treeplyr) library(ggplot2) library(ggpubr) '%nin%' <- Negate('%in%') nTips_simTrees=5000 ndiscrete=20 ncont=20 ######################################################## ######################################################## ###############Functions for testing#################### ######################################################## ######################################################## ######################################################## ####Matching and performing operations in############### ############tree/data objects########################### ######################################################## test_TDT<-function(nTips_simTrees, ndiscrete, ncont, unmatched){ ##Create data Stree<-rtree(nTips_simTrees) Discrete_chars<-as.data.frame(sapply(1:ndiscrete, function(x) sample(c("A","B","C"), nTips_simTrees,replace = T))) Cont_chars<-as.data.frame(sapply(1:ncont, function(x) rnorm(nTips_simTrees,5,2))) names(Cont_chars)<-paste0("Cont",1:ncont) names(Discrete_chars)<-paste0("Disc",1:ndiscrete) Data=cbind.data.frame(tips=Stree$tip.label,Discrete_chars, Cont_chars) Data$tips <-as.character(Data$tips) Data$tips[sample(1:nTips_simTrees,unmatched)]<- paste0("NEW", 1:unmatched) ##Functions for subseting data td <- suppressWarnings(suppressMessages(as.treedata.table(Stree, Data))) require(treedata.table) tdt<-function(){ td$dat[Disc1 == "A", .(sum(Cont2), mean(Cont3)), by = Disc10] } tdplyr <- make.treedata(Stree, Data) treeplyr<-function(){ tdplyr$dat %>% group_by(Disc10) %>% filter(Disc1 == "A") %>% summarise(sum(Cont2), mean(Cont3)) } tree2<-drop.tip(Stree,name.check(Stree,data.names = Data$tips)$tree_not_data) Data2 <-Data %>% filter(tips %nin% name.check(Stree,data.names = Data$tips)$data_not_tree) Data3<-data.table(Data2) dt<-function(){ Data3[Disc1 == "A", .(sum(Cont2), mean(Cont3)), by = Disc10] } dplyr<-function(){ Data2 %>% group_by(Disc10) %>% filter(Disc1 == "A") %>% summarise(sum(Cont2), mean(Cont3)) } tree2<-drop.tip(Stree,name.check(Stree,data.names = Data$tips)$tree_not_data) Data2<-Data[!Data$tips %in% name.check(Stree,data.names = Data$tips)$data_not_tree,] base<-function(){ res<-Data2[Data2$Disc1 =="A",c("Cont2", "Cont3","Disc10")] sum2 <- aggregate(list(res[,1]), by = list(res$Disc10), sum ) ave3 <- aggregate(list(res[,2]), by = list(res$Disc10), mean ) res<-merge(sum2,ave3, by="Group.1") colnames(res)<-c("Disc10" ,"sum(Cont2)" ,"mean(Cont3)") res } Bm_sim_subset<-suppressWarnings(suppressMessages( microbenchmark(tdt(),dt(),treeplyr(),dplyr(),base()))) b<-summary(Bm_sim_subset) return(b) } ######################################################## ############# treeplyr and treedata.table############### ######################################################## test_TDT_match<-function(nTips_simTrees, ndiscrete, ncont, unmatched){ ##Create data Stree<-rtree(nTips_simTrees) Discrete_chars<-as.data.frame(sapply(1:ndiscrete, function(x) sample(c("A","B","C"), nTips_simTrees,replace = T))) Cont_chars<-as.data.frame(sapply(1:ncont, function(x) rnorm(nTips_simTrees,5,2))) names(Cont_chars)<-paste0("Cont",1:ncont) names(Discrete_chars)<-paste0("Disc",1:ndiscrete) Data=cbind.data.frame(tips=Stree$tip.label,Discrete_chars, Cont_chars) Data$tips <-as.character(Data$tips) Data$tips[sample(1:nTips_simTrees,unmatched)]<- paste0("NEW", 1:unmatched) ##Functions for subseting data require(treedata.table) tdt<-function(){ suppressWarnings(suppressMessages(as.treedata.table(Stree, Data))) } treeplyr<-function(){ make.treedata(Stree, Data) } Bm_sim_subset<-suppressWarnings(suppressMessages( microbenchmark(tdt(),treeplyr()))) b<-summary(Bm_sim_subset) return(b) } ######################################################## #####################Analyses########################### ######################################################## ######################################################## ##Testing data manipulation ntips<-c(1000, 10000, 500000) nchar<-c(100) unma<-c(0.1) resultsBM<-lapply(seq_along(ntips), function(x){ do.call(rbind, lapply(seq_along(nchar), function(y){ re2<-do.call(rbind, pblapply(seq_along(unma), function(z){ re<-test_TDT(nTips_simTrees=ntips[x], ndiscrete=nchar[y]/2, ncont=nchar[y]/2, unmatched=ntips[x]*unma[z]) cbind.data.frame(nTips_simTrees=ntips[x], nchar=nchar[y], unmatched=unma[z], re) })) })) }) resultsBM_fin<-do.call(rbind, resultsBM) resultsBM_fin2<-resultsBM_fin[resultsBM_fin$nchar==100,] resultsBM_fin2$expr<-as.character(resultsBM_fin2$expr) resultsBM_fin2$expr<-rep(c("treedata.table","data.table","treeplyr", "dplyr","base"),3) resultsBM_fin2$expr <- factor(resultsBM_fin2$expr, levels = c("treedata.table","data.table","treeplyr", "dplyr","base")) resultsBM_fin2$color=ifelse(resultsBM_fin2$expr == "treedata.table", "black", "white") resultsBM_fin2$nTips_simTrees<-paste0("Observations=",as.integer(resultsBM_fin2$nTips_simTrees)) ##Plot p<-ggplot(resultsBM_fin2, aes(x=expr, y=median, fill=color))+ geom_bar(stat="identity")+facet_wrap(~factor(nTips_simTrees), scales="free")+ guides(fill = FALSE) + theme_bw() + scale_fill_manual(values=c("black","grey"))+xlab("")+ ylab("Median time\n(milliseconds)")+ theme(axis.text.x=element_text(angle=90, hjust=1))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) ##Testing for tree/data matching ntips<-c(10, 40, 100) nchar<-c(100) unma<-c(0.1) resultsBM_match<-lapply(seq_along(ntips), function(x){ do.call(rbind, lapply(seq_along(nchar), function(y){ re2<-do.call(rbind, pblapply(seq_along(unma), function(z){ re<-test_TDT_match(nTips_simTrees=ntips[x], ndiscrete=nchar[y]/2, ncont=nchar[y]/2, unmatched=ntips[x]*unma[z]) cbind.data.frame(nTips_simTrees=ntips[x], nchar=nchar[y], unmatched=unma[z], re) })) })) }) resultsBM_match_fin<-do.call(rbind, resultsBM_match) ##Plot resultsBM_match_fin2<-resultsBM_match_fin[resultsBM_match_fin$nchar==100,] resultsBM_match_fin2$expr<-as.character(resultsBM_match_fin2$expr) resultsBM_match_fin2$expr<-rep(c("treedata.table","treeplyr"),3) resultsBM_match_fin2$expr <- factor(resultsBM_match_fin2$expr, levels = c("treedata.table","treeplyr")) resultsBM_match_fin2$color=ifelse(resultsBM_match_fin2$expr == "treedata.table", "black", "white") resultsBM_match_fin2$nTips_simTrees<-paste0("Number of tips=",as.integer(resultsBM_match_fin2$nTips_simTrees)) resultsBM_match_fin2$nTips_simTrees<-factor(resultsBM_match_fin2$nTips_simTrees, levels = c("Number of tips=10","Number of tips=40","Number of tips=100")) q<-ggplot(resultsBM_match_fin2, aes(x=expr, y=median, fill=color))+ geom_bar(stat="identity")+facet_wrap(~factor(nTips_simTrees))+ guides(fill = FALSE) + theme_bw() + scale_fill_manual(values=c("black","grey"))+xlab("")+ ylab("Median time\n(milliseconds)")+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) pdf("Bench.tdt.pdf", 6,5) ggarrange(p, q, nrow = 2, labels = c("A", "B")) dev.off() ##Additional figures fig2<-ggplot(resultsBM_fin2, aes(x=expr, y=median, colour=expr)) + geom_errorbar(aes(ymin=lq, ymax=uq), width=0.5, size=.8) +facet_wrap(~factor(nTips_simTrees), scales="free")+ guides(colour = FALSE) +xlab("")+ ylab("Time (ms)")+ theme(axis.text.x=element_text(angle=90, hjust=1))+ geom_point() fig3<-ggplot(resultsBM_match_fin2, aes(x=expr, y=median, colour=expr)) + geom_errorbar(aes(ymin=lq, ymax=uq), width=.4, size=0.8) +facet_wrap(~factor(nTips_simTrees), scales="free")+ guides(colour = FALSE) +xlab("")+ ylab("Time (ms)")+ geom_point() pdf('Fig.3.pdf',6,4) fig2 dev.off() pdf('Fig.2.pdf',7,4) fig3 dev.off()