I need to simulate balanced and unbalanced phylogenetic trees for some research I am doing. In order to do this, I do rejection sampling: simulate a tree -> measure tree shape -> reject if not balanced or unbalanced enough. But what is enough? We need to define some cutoff value to determine what will be our set of balanced and unbalanced trees.
1 foo <- function(x, metric = "colless") {
2 if (metric == "colless") {
3 xx <- as.treeshape(x) # convert to apTreeshape format
4 colless(xx, "yule") # calculate colless' metric
5 } else if (metric == "gamma") {
6 gammaStat(x)
7 } else stop("metric should be one of colless or gamma")
8 }
9
10 theme_myblank <- function() {
11 stopifnot(require(ggplot2))
12 theme_blank <- ggplot2::theme_blank
13 ggplot2::theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
14 panel.background = element_blank(), plot.background = element_blank(),
15 axis.title.x = element_text(colour = NA), axis.title.y = element_blank(),
16 axis.text.x = element_blank(), axis.text.y = element_blank(), axis.line = element_blank(),
17 axis.ticks = element_blank())
18 }
1 library(ape)
2 library(phytools)
3
4 numtrees <- 1000 # lets simulate 1000 trees
5 trees <- pbtree(n = 50, nsim = numtrees, ape = F) # simulate 500 pure-birth trees with 100 spp each, ape = F makes it run faster
1 library(plyr)
2 library(apTreeshape)
3
4 colless_df <- ldply(trees, foo, metric = "colless") # calculate metric for each tree
5 head(colless_df)
V1
1 -0.1761
2 0.2839
3 0.4639
4 0.9439
5 -0.6961
6 -0.1161
1 # Calculate the percent of trees that will fall into the cutoff for balanced and unbalanced trees
2 col_percent_low <- round(length(colless_df[colless_df$V1 < -0.7, "V1"])/numtrees, 2) * 100
3 col_percent_high <- round(length(colless_df[colless_df$V1 > 0.7, "V1"])/numtrees, 2) * 100
1 library(ggplot2)
2
3 a <- ggplot(colless_df, aes(V1)) + # plot histogram of distribution of values
4 geom_histogram() +
5 theme_bw(base_size=18) +
6 scale_x_continuous(limits=c(-3,3), breaks=c(-3,-2,-1,0,1,2,3)) +
7 geom_vline(xintercept = -0.7, colour="red", linetype = "longdash") +
8 geom_vline(xintercept = 0.7, colour="red", linetype = "longdash") +
9 ggtitle(paste0("Distribution of Colless' metric for 1000 trees, cutoffs at -0.7 and 0.7 results in\n ", col_percent_low, "% (", numtrees*(col_percent_low/100), ") 'balanced' trees (left) and ", col_percent_low, "% (", numtrees*(col_percent_low/100), ") 'unbalanced' trees (right)")) +
10 labs(x = "Colless' Metric Value", y = "Number of phylogenetic trees") +
11 theme(plot.title = element_text(size = 16))
12
13 a

1 library(ggphylo)
2
3 b <- ggphylo(trees[which.min(colless_df$V1)], do.plot = F) + theme_myblank()
4 c <- ggphylo(trees[which.max(colless_df$V1)], do.plot = F) + theme_myblank()
5
6 b

1 library(gridExtra)
2
3 grid.newpage()
4 pushViewport(viewport(layout = grid.layout(1, 1)))
5 vpa_ <- viewport(width = 1, height = 1, x = 0.5, y = 0.49)
6 vpb_ <- viewport(width = 0.35, height = 0.35, x = 0.23, y = 0.7)
7 vpc_ <- viewport(width = 0.35, height = 0.35, x = 0.82, y = 0.7)
8 print(a, vp = vpa_)
9 print(b, vp = vpb_)
10 print(c, vp = vpc_)

1 gamma_df <- ldply(trees, foo, metric="gamma") # calculate metric for each tree
2 gam_percent_low <- round(length(gamma_df[gamma_df$V1 < -1, "V1"])/numtrees, 2)*100
3 gam_percent_high <- round(length(gamma_df[gamma_df$V1 > 1, "V1"])/numtrees, 2)*100
4 a <- ggplot(gamma_df, aes(V1)) + # plot histogram of distribution of values
5 geom_histogram() +
6 theme_bw(base_size=18) +
7 scale_x_continuous(breaks=c(-3,-2,-1,0,1,2,3)) +
8 geom_vline(xintercept = -1, colour="red", linetype = "longdash") +
9 geom_vline(xintercept = 1, colour="red", linetype = "longdash") +
10 ggtitle(paste0("Distribution of Gamma metric for 1000 trees, cutoffs at -1 and 1 results in\n ", gam_percent_low, "% (", numtrees*(gam_percent_low/100), ") trees with deeper nodes (left) and ", gam_percent_high, "% (", numtrees*(gam_percent_high/100), ") trees with shallower nodes (right)")) +
11 labs(x = "Gamma Metric Value", y = "Number of phylogenetic trees") +
12 theme(plot.title = element_text(size = 16))
13 b <- ggphylo(trees[which.min(gamma_df$V1)], do.plot=F) + theme_myblank()
14 c <- ggphylo(trees[which.max(gamma_df$V1)], do.plot=F) + theme_myblank()
15
16 grid.newpage()
17 pushViewport(viewport(layout = grid.layout(1,1)))
18 vpa_ <- viewport(width = 1, height = 1, x = 0.5, y = 0.49)
19 vpb_ <- viewport(width = 0.35, height = 0.35, x = 0.23, y = 0.7)
20 vpc_ <- viewport(width = 0.35, height = 0.35, x = 0.82, y = 0.7)
21 print(a, vp = vpa_)
22 print(b, vp = vpb_)
23 print(c, vp = vpc_)
