Chapter 5 Phylogenetic Analysis

5.0.1 Load data

load("data/data.Rdata")

5.1 Getphylo tree

# Merge tip data with tree
tip_df <- data.frame(label = getphylo_tree$tip.label) %>%
  left_join(genome_metadata, by = c("label" = "ID"))

p <- ggtree(getphylo_tree) %<+% tip_df +
  geom_tiplab(aes(color = source), size = 3) +
  theme_tree2() +
  scale_color_manual(values = source_colors)

Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
ℹ Did you misspell an argument name?

#Remove GCA_015060925.1
getphylo_tree_trimmed <- drop.tip(getphylo_tree, "GCA_015060925.1")

tip_df_trimmed <- tip_df %>% 
  filter(label != "GCA_015060925.1")

p <- ggtree(getphylo_tree_trimmed) %<+% tip_df_trimmed +
  geom_tiplab(aes(color = source), size = 3) +
  theme_tree2() +
  scale_color_manual(values = source_colors)

Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
ℹ Did you misspell an argument name?

Circular tree with genome size and completeness

# Generate  basal tree
circular_tree <- force.ultrametric(getphylo_tree_trimmed, method="extend") %>% # extend to ultrametric for the sake of visualization
    ggtree(., layout="fan", open.angle=10, size=0.5)

***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************

# Add completeness ring
circular_tree <- circular_tree +
        new_scale_fill() +
        scale_fill_gradient(low = "#d1f4ba", high = "#f4baba") +
        geom_fruit(
                data=genome_metadata,
                geom=geom_bar,
                mapping = aes(x=completeness, y=ID, fill=contamination),
                offset = 0.24,
                pwidth = 0.1,
                orientation="y",
              stat="identity")+
  labs(fill="Contamination")

# Add genome-size ring
circular_tree <-  circular_tree + new_scale_fill()

circular_tree <-  circular_tree +
  geom_fruit(
    data=genome_metadata,
    geom=geom_bar,
    mapping = aes(x=genome_size, y=ID),
    fill = "#1e6e55",
    offset = 0.05,
    orientation="y",
    stat="identity")



#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)

Circular tree with host and source

# Generate  basal tree
circular_tree <- force.ultrametric(getphylo_tree_trimmed, method="extend") %>% # extend to ultrametric for the sake of visualisation
    ggtree(., layout="fan", open.angle=10, size=0.4)

***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************

#Host species ring
circular_tree <- circular_tree +
  new_scale_fill() +
  geom_fruit(
    data = genome_metadata,
    geom = geom_tile,
    mapping = aes(y = ID, fill = host_species),
    width = 0.005,
    offset = 0.10
  ) +
  scale_fill_brewer(palette = "Set3") +
  labs(fill = "Host")

#source ring
circular_tree <- circular_tree +
  new_scale_fill() +
  geom_fruit(
    data = genome_metadata,
    geom = geom_tile,
    mapping = aes(y = ID, fill = source),
    width = 0.005,
    offset = 0.15
  ) +
  scale_fill_manual(values = source_colors) +
  labs(fill = "Source")

# Plot
circular_tree %>% open_tree(30) %>% rotate_tree(90)

5.2 dRep tree

fastani_comparisons <- read_csv("data/Ndb.csv")  #Secondary comparison results

Rows: 8650 Columns: 5
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): reference, querry
dbl (3): ani, alignment_coverage, primary_cluster

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

genomes <- unique(c(fastani_comparisons$reference, fastani_comparisons$querry))

# Create empty matrix
ani_matrix <- matrix(0, nrow = length(genomes), ncol = length(genomes))
rownames(ani_matrix) <- genomes
colnames(ani_matrix) <- genomes

# Fill the matrix
for(i in 1:nrow(fastani_comparisons)) {
  ref <- fastani_comparisons$reference[i]
  qry <- fastani_comparisons$querry[i]
  ani_val <- fastani_comparisons$ani[i]
  
  ani_matrix[ref, qry] <- ani_val
  ani_matrix[qry, ref] <- ani_val  # Make symmetric
}

# Set diagonal to 100
diag(ani_matrix) <- 100

# Convert to distance matrix (for tree building)
dist_matrix <- as.dist(1 - ani_matrix)

# Build tree
hc <- hclust(dist_matrix, method = "average") #drep uses hierarchical clustering
tree <- as.phylo(hc)

ggtree(tree) + 
  geom_tiplab(size = 2) +
  theme_tree2()

Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
ℹ Did you misspell an argument name?

#remove this outlier that is >95% different (it must be another species)
outlier <- "GCA_015060925.1_ASM1506092v1_genomic.fna"

fastani_filtered <- fastani_comparisons %>%
  filter(reference != outlier,
         querry != outlier)

#create the matrix
genomes <- unique(c(fastani_filtered$reference,
                    fastani_filtered$querry))

ani_matrix <- matrix(0,
                     nrow = length(genomes),
                     ncol = length(genomes),
                     dimnames = list(genomes, genomes))

#fill in the matrix with the distances
for(i in seq_len(nrow(fastani_filtered))) {
  ref <- fastani_filtered$reference[i]
  qry <- fastani_filtered$querry[i]
  ani <- fastani_filtered$ani[i]

  ani_matrix[ref, qry] <- ani
  ani_matrix[qry, ref] <- ani
}

diag(ani_matrix) <- 1


dist_matrix <- as.dist(1 - ani_matrix )

#hierarchical clustering ti generate the tree
hc <- hclust(dist_matrix, method = "average")
tree <- as.phylo(hc)

#use the short labels 
tip_df <- tibble(
  label = tree$tip.label,
label_clean = sub("\\.fna$|\\.fa$", "", tree$tip.label)
)

tip_df <- tip_df %>%
  left_join(genome_metadata, by = c("label_clean" = "mag_name"))

tree$tip.label <- ifelse(
  is.na(tip_df$ID),
  tip_df$label_clean,
  tip_df$ID
)

#check
sum(is.na(tip_df$ID))

[1] 27

tips_to_remove <- tree$tip.label[is.na(tip_df$ID)]
tree <- ape::drop.tip(tree, tips_to_remove)

# Update tip_df to match the pruned tree
tip_df <- tip_df %>%
  filter(!is.na(ID))

tree_reversed <- tree
# Get the maximum distance (root to furthest tip)
max_dist <- max(node.depth.edgelength(tree))
# Reverse: make tips = 0 and root = max
tree_reversed$edge.length <- tree$edge.length
node_depths <- node.depth.edgelength(tree)
# Calculate new positions (invert from tips)
for(i in 1:length(tree_reversed$edge.length)) {
  tree_reversed$edge.length[i] <- tree$edge.length[i]
}

p <- ggtree(tree) +
  geom_tiplab(size = 2, hjust =  0.8)+
  scale_x_continuous(
    breaks = seq(0, 0.02, by = 0.002),
    labels = function(x) round(100 * (1 - x), 1),  # Reverse the labels
    trans = "reverse"  # Reverse the axis
  ) +
  coord_cartesian(xlim = c(0.02, 0)) +
  labs(x = "Average Nucleotide Identity (ANI, %)") +
  theme_tree2()

Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
ℹ Did you misspell an argument name?

# Create a proper matching data frame
tip_data_for_plot <- data.frame(
  label = tree$tip.label,
  stringsAsFactors = FALSE
)

# Match with metadata
tip_data_for_plot <- tip_data_for_plot %>%
  left_join(
    tip_df %>% dplyr::select(ID, source, host_species, country, locality, continent, completeness),
    by = c("label" = "ID")
  )

# Get the actual x-range of the tree
tree_data <- fortify(tree)
max_x <- max(tree_data$x[tree_data$isTip])

# Plot
p <- ggtree(tree, size = 1) +  
  geom_vline(xintercept = max_x - 0.005,  # 99.5% ANI (0.005 from tips)
             linetype = "dashed", 
             color = "red", 
             size = 0.8) +
  geom_tippoint(aes(color = source), size = 3) +
  geom_tiplab(aes(color = source),
              size = 4,
              hjust = -0.1)

Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 1
ℹ Did you misspell an argument name?

p <- p %<+% tip_data_for_plot +
  scale_color_manual(values = source_colors, name = "Source") +
  scale_x_continuous(
    labels = function(x) round(100 * (1 - (max_x - x)), 1)  # Convert based on distance from tips
  ) +
  coord_cartesian(xlim = c(0, max_x + 0.005)) +
  labs(x = "Average Nucleotide Identity (ANI, %)") +
  theme_tree2() +
  theme(
    legend.position = "right",
    plot.margin = margin(5, 100, 5, 5),
    axis.text.x = element_text(size = 14),
    axis.title.x = element_text(size = 16)
  )
p

# Plot
p <- ggtree(tree, size = 1) +  
  geom_tippoint(aes(color = source), size = 3) +
  geom_tiplab(aes(color = source),
              size = 4,
              hjust = -0.1)

Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 1
ℹ Did you misspell an argument name?

p <- p %<+% tip_data_for_plot +
  scale_color_manual(values = source_colors, name = "Source") +
  scale_x_continuous(
    labels = function(x) round(100 * (1 - (max_x - x)), 1)  # Convert based on distance from tips
  ) +
  coord_cartesian(xlim = c(0, max_x + 0.005)) +
  labs(x = "Average Nucleotide Identity (ANI, %)") +
  theme_tree2() +
  theme(
    legend.position = "right",
    plot.margin = margin(5, 100, 5, 5),
    axis.text.x = element_text(size = 14),
    axis.title.x = element_text(size = 16)
  )
p

# Convert phylo tree to hclust object
hclust_tree <- as.hclust.phylo(tree)

# Cut at 95% similarity (5% distance)
# The height corresponds to genetic distance
# For 95% similarity = 0.05 distance from tips
cut_height <- 0.05

# Cut the tree
groups <- cutree(hclust_tree, h = cut_height)

# Add group information to your tip data
tip_data_for_plot$group <- groups[match(tip_data_for_plot$label, names(groups))]

# See how many groups you have
table(groups)

groups
 1 
66

# Visualize with groups highlighted
p <- ggtree(tree, size = 1) +  
  geom_vline(xintercept = max_x - 0.05,  # 95% ANI (0.05 from tips)
             linetype = "dashed", 
             color = "red", 
             size = 0.8) +
  geom_tippoint(aes(color = source, shape = as.factor(group)), size = 3) +
  geom_tiplab(aes(color = source),
              size = 4,
              hjust = -0.1)

Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 1
ℹ Did you misspell an argument name?

p <- p %<+% tip_data_for_plot +
  scale_color_manual(values = source_colors, name = "Source") +
  scale_shape_manual(values = rep(15:18, length.out = max(groups)), 
                     name = "95% ANI Group") +
  scale_x_continuous(
    labels = function(x) round(100 * (1 - (max_x - x)), 1)
  ) +
  coord_cartesian(xlim = c(0, max_x + 0.005)) +
  labs(x = "Average Nucleotide Identity (ANI, %)") +
  theme_tree2() +
  theme(
    legend.position = "right",
    plot.margin = margin(5, 100, 5, 5),
    axis.text.x = element_text(size = 14),
    axis.title.x = element_text(size = 16)
  )
p

# Get the actual x-range of the tree
tree_data <- fortify(tree)
max_x <- max(tree_data$x[tree_data$isTip])

# Plot
p <- ggtree(tree, size = 1) +  
  geom_vline(xintercept = max_x - 0.005,  # 99.5% ANI (0.005 from tips)
             linetype = "dashed", 
             color = "red", 
             size = 0.8) +
  geom_tippoint(aes(color = host_species), size = 3) +
  geom_tiplab(aes(color = host_species),
              size = 4,
              hjust = -0.1)

Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 1
ℹ Did you misspell an argument name?

p <- p %<+% tip_data_for_plot +
  scale_x_continuous(
    labels = function(x) round(100 * (1 - (max_x - x)), 1)  # Convert based on distance from tips
  ) +
  coord_cartesian(xlim = c(0, max_x + 0.005)) +
  labs(x = "Average Nucleotide Identity (ANI, %)") +
  theme_tree2() +
  theme(
    legend.position = "right",
    plot.margin = margin(5, 100, 5, 5),
    axis.text.x = element_text(size = 14),
    axis.title.x = element_text(size = 16)
  )
p

library(ggtree)
library(ggnewscale)

# Ensure rownames match tip labels
rownames(tip_data_for_plot) <- tip_data_for_plot$tip_label

p <- ggtree(tree, size = 1) +
  geom_tippoint(aes(color = source), size = 3) +
  geom_tiplab(aes(color = source), size = 4, hjust = -0.1) +
  scale_color_manual(values = source_colors, name = "Source")

# 1) completeness
p <- gheatmap(
  p,
  tip_data_for_plot["completeness"],
  offset = 0.05,
  width = 0.12,
  colnames_position = "top"
) +
  scale_fill_gradient(name = "Completeness", low = "white", high = "darkgreen")


p <- p + ggnewscale::new_scale_fill()

# 2) country
p <- gheatmap(
  p,
  tip_data_for_plot["country"],
  offset = 0.17,
  width = 0.12,
  colnames_position = "top"
) +
  scale_fill_manual(name = "Country", values = country_colors)


p <- p + ggnewscale::new_scale_fill()

# 3) host_species
p <- gheatmap(
  p,
  tip_data_for_plot["host_species"],
  offset = 0.29,
  width = 0.12,
  colnames_position = "top"
) +
  scale_fill_manual(name = "Host species", values = host_colors)

# final x-axis
p <- p +
  scale_x_continuous(labels = function(x) round(100 * (1 - (max_x - x)), 1)) +
  coord_cartesian(xlim = c(0, max_x + 0.005)) +
  labs(x = "Average Nucleotide Identity (ANI, %)") +
  theme_tree2() +
  theme(
    legend.position = "right",
    plot.margin = margin(5, 100, 5, 5),
    axis.text.x = element_text(size = 14),
    axis.title.x = element_text(size = 16)
  )

p

library(ggtree)
library(ggtreeExtra)
library(ggplot2)
library(ggnewscale)

p <- ggtree(tree, layout = "circular") %<+% tip_data_for_plot +
  geom_tippoint(aes(color = host_species), size = 2)

Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
ℹ Did you misspell an argument name?

# Country (discrete)
p <- p +
  geom_fruit(
    geom = geom_tile,
    mapping = aes(y = label, fill = country),
    offset = 0.03,
    pwidth = 0.02,
    width = 0.02
  ) +
  scale_fill_discrete(name = "Country")

p <- p + ggnewscale::new_scale_fill()

#Source (discrete) 
p <- p +
  geom_fruit(
    geom = geom_tile,
    mapping = aes(y = label, fill = source),
    offset = 0.04,
    pwidth = 0.02,
    width = 0.02
  ) +
  scale_fill_discrete(name = "Source")

p <- p + ggnewscale::new_scale_fill()

#Completeness
p <- p +
  geom_fruit(
    geom = geom_tile,
    mapping = aes(y = label, fill = completeness),
    offset = 0.05,
    pwidth = 0.01,
    width = 0.02
  ) +
  scale_fill_viridis_c(name = "Completeness (%)")


p <- p + ggplot2::theme_void()

p

# Get the actual x-range of the tree
tree_data <- fortify(tree)
max_x <- max(tree_data$x[tree_data$isTip])

# Plot
p <- ggtree(tree, size = 1) +  
  geom_vline(xintercept = max_x - 0.005,  # 99.5% ANI (0.005 from tips)
             linetype = "dashed", 
             color = "red", 
             size = 0.8) +
  geom_tippoint(aes(color = continent), size = 3) +
  geom_tiplab(aes(color = continent),
              size = 4,
              hjust = -0.1)

Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 1
ℹ Did you misspell an argument name?

p <- p %<+% tip_data_for_plot +
  scale_x_continuous(
    labels = function(x) round(100 * (1 - (max_x - x)), 1)  # Convert based on distance from tips
  ) +
  coord_cartesian(xlim = c(0, max_x + 0.005)) +
  labs(x = "Average Nucleotide Identity (ANI, %)") +
  theme_tree2() +
  theme(
    legend.position = "right",
    plot.margin = margin(5, 100, 5, 5),
    axis.text.x = element_text(size = 14),
    axis.title.x = element_text(size = 16)
  )
p

5.3 Load pangenome data

PPanggolin

gene_matrix <- read.table(
  "data/gene_presence_absence.Rtab",
  header = TRUE,
  row.names = 1
)

# remove the non-parabacteroides MAG
gene_matrix <- gene_matrix[, colnames(gene_matrix) != "GCA_015060925.1_ASM1506092v1_genomic"]

gene_matrix_transposed <- gene_matrix %>% t()

pca_gene_matrix <- prcomp(gene_matrix_transposed)

plot_pca_gene_matrix <- as_tibble(pca_gene_matrix$x)%>% bind_cols(genome_metadata)

percent_variance <- summary(pca_gene_matrix)$importance["Proportion of Variance",] * 100

ggplot( plot_pca_gene_matrix, aes(x=PC1, y=PC2, col=host_species, label = ID)) + 
  geom_point() + 
  geom_text(hjust=0, vjust=0, size=2) +
  xlab(paste("PC1 ", percent_variance[1], "%"))+ 
  ylab(label = paste("PC2 ", percent_variance[2], "%")) +
  theme_bw()

# Removing outlier
outlier_to_remove <- "GCA_015060925.1" 
gene_matrix_filtered <- gene_matrix[, !colnames(gene_matrix) %in% outlier_to_remove]

# Rerun PCA without outlier
gene_matrix_transposed_filtered <- gene_matrix_filtered %>% t()
pca_gene_matrix_filtered <- prcomp(gene_matrix_transposed_filtered)
plot_pca_filtered <- as_tibble(pca_gene_matrix_filtered$x) %>% 
  bind_cols(genome_metadata %>% filter(!rownames(.) %in% outlier_to_remove))
percent_variance_filtered <- summary(pca_gene_matrix_filtered)$importance["Proportion of Variance",] * 100

plot_pca_filtered <- plot_pca_filtered %>%
  mutate(
    collection_date = as.Date(ncbi_date),  
    date_numeric = as.numeric(ncbi_date)
  )

# Plot colored by date with gradient
ggplot(plot_pca_filtered, aes(x=PC1, y=PC2, color=collection_date)) + 
  geom_point(size=3) + 
  scale_color_viridis_c(option="viridis") +  # or use "plasma", "inferno", etc.
  xlab(paste("PC1 ", percent_variance_filtered[1], "%"))+ 
  ylab(paste("PC2 ", percent_variance_filtered[2], "%")) +
  labs(color="Collection Date") +
  theme_bw()

mapping_from_tip_df <- setNames(tip_df$ID, tip_df$label_clean)

# Check the mapping
cat("First 10 mappings from tip_df:\n")

First 10 mappings from tip_df:

head(mapping_from_tip_df, 10)

 GCA_003464145.1_ASM346414v1_genomic  GCA_008679285.1_ASM867928v1_genomic  GCA_008679655.1_ASM867965v1_genomic 
                   "GCA_003464145.1"                    "GCA_008679285.1"                    "GCA_008679655.1" 
 GCA_008680675.1_ASM868067v1_genomic  GCA_009774835.1_ASM977483v1_genomic GCA_014870645.1_ASM1487064v1_genomic 
                   "GCA_008680675.1"                    "GCA_009774835.1"                    "GCA_014870645.1" 
GCA_020809955.1_ASM2080995v1_genomic GCA_022771305.1_ASM2277130v1_genomic GCA_022774455.1_ASM2277445v1_genomic 
                   "GCA_020809955.1"                    "GCA_022771305.1"                    "GCA_022774455.1" 
GCA_022776965.1_ASM2277696v1_genomic 
                   "GCA_022776965.1"

# How many gene_matrix columns match?
cat("\nGene_matrix columns in tip_df mapping:", 
    sum(colnames(gene_matrix) %in% names(mapping_from_tip_df)), "\n")


Gene_matrix columns in tip_df mapping: 66

# Which gene_matrix columns are NOT in tip_df?
not_in_tipdf <- colnames(gene_matrix)[!colnames(gene_matrix) %in% names(mapping_from_tip_df)]
cat("Not in tip_df (", length(not_in_tipdf), "):\n")

Not in tip_df ( 27 ):

print(head(not_in_tipdf, 20))

 [1] "EHA00531_bin.1"   "EHA00535_bin.6"   "EHA00539_bin.6"   "EHA00726_bin.13"  "EHA00928_bin.90"  "EHA01202_bin.66" 
 [7] "EHA01281_bin.18"  "EHA01439_bin.44"  "EHA01634_bin.14"  "EHA01845_bin.103" "EHA01958_bin.190" "EHA02500_bin.116"
[13] "EHA03161_bin.52"  "EHA03288_bin.14"  "EHA03306_bin.55"  "EHA03349_bin.40"  "EHA03351_bin.43"  "EHA03376_bin.12" 
[19] "EHA03390_bin.20"  "EHA04575_bin.10"

# Apply the mapping from tip_df
new_colnames <- mapping_from_tip_df[colnames(gene_matrix)]

cat("\nNumber of successful mappings:", sum(!is.na(new_colnames)), "\n")


Number of successful mappings: 66

cat("Number of NAs:", sum(is.na(new_colnames)), "\n")

Number of NAs: 27

# Apply new names
colnames(gene_matrix) <- ifelse(
  is.na(new_colnames),
  colnames(gene_matrix),  # Keep original if no mapping
  new_colnames            # Use mapped EHM ID
)

# Now check matching with tree
cat("\nAfter mapping - columns matching tree:", 
    sum(colnames(gene_matrix) %in% tree$tip.label), "\n")


After mapping - columns matching tree: 66

# Common genomes
common_genomes <- intersect(colnames(gene_matrix), tree$tip.label)
cat("Common genomes:", length(common_genomes), "\n")

Common genomes: 66

# Should be 91 now (or close to it)!
gene_matrix_filtered <- gene_matrix[, common_genomes]
gene_matrix_t <- t(gene_matrix_filtered)
gene_matrix_t <- gene_matrix_t[tree$tip.label, ]

cat("\nFinal verification - Perfect match?", 
    all(rownames(gene_matrix_t) == tree$tip.label), "\n")


Final verification - Perfect match? TRUE

hc <- as.hclust(tree)

ann_colors <- list(
  source = source_colors  
)

pheatmap(
  gene_matrix_t,
  cluster_rows = hc,     
  cluster_cols = TRUE,
  show_rownames = TRUE,
  show_colnames = FALSE,
  fontsize = 5,
  color = c("white", "blue")
)

annotation_row <- data.frame(
  source = genome_metadata$source
)
rownames(annotation_row) <- genome_metadata$ID

ann_colors <- list(
  source = source_colors  
)

# keep only rows in the matrix
annotation_row <- annotation_row[rownames(gene_matrix_t), , drop = FALSE]

pheatmap(
  gene_matrix_t,
  cluster_rows = hc,
  cluster_cols = TRUE,
  treeheight_row = 120,
  show_rownames = TRUE,
  show_colnames = FALSE,
  fontsize = 5,
  color = c("white", "blue"),
  annotation_row = annotation_row,
  annotation_colors = ann_colors,
  legend_breaks = c(0, 1),
  legend_labels = c("Absent", "Present")
)

library(ComplexHeatmap)

Cargando paquete requerido: grid

========================================
ComplexHeatmap version 2.20.0
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference

If you use it in published research, please cite either one:
- Gu, Z. Complex Heatmap Visualization. iMeta 2022.
- Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
    genomic data. Bioinformatics 2016.


The new InteractiveComplexHeatmap package can directly export static 
complex heatmaps into an interactive Shiny app with zero effort. Have a try!

This message can be suppressed by:
  suppressPackageStartupMessages(library(ComplexHeatmap))
========================================
! pheatmap() has been masked by ComplexHeatmap::pheatmap(). Most of the arguments
   in the original pheatmap() are identically supported in the new function. You 
   can still use the original function by explicitly calling pheatmap::pheatmap().


Adjuntando el paquete: 'ComplexHeatmap'

The following object is masked from 'package:pheatmap':

    pheatmap

The following object is masked from 'package:R.utils':

    draw

library(circlize)

Warning: package 'circlize' was built under R version 4.4.3

========================================
circlize version 0.4.17
CRAN page: https://cran.r-project.org/package=circlize
Github page: https://github.com/jokergoo/circlize
Documentation: https://jokergoo.github.io/circlize_book/book/

If you use it in published research, please cite:
Gu, Z. circlize implements and enhances circular visualization
  in R. Bioinformatics 2014.

This message can be suppressed by:
  suppressPackageStartupMessages(library(circlize))
========================================


Adjuntando el paquete: 'circlize'

The following object is masked from 'package:ape':

    degree

annotation_row <- data.frame(
  source = genome_metadata$source
)
rownames(annotation_row) <- genome_metadata$ID
annotation_row <- annotation_row[rownames(gene_matrix_t), , drop = FALSE]

# Heatmap colors
col_fun <- colorRamp2(c(0, 1), c("white", "blue"))

# Row annotation
row_ha <- rowAnnotation(
  source = annotation_row$source,
  col = list(source = ann_colors$source)
)

Heatmap(
  gene_matrix_t,
  name = "Gene presence",
  col = col_fun,
  cluster_rows = hc,
  cluster_columns = TRUE,
  show_row_names = TRUE,
  show_column_names = FALSE,
  row_names_gp = gpar(fontsize = 6),
  left_annotation = row_ha
)

`use_raster` is automatically set to TRUE for a matrix with more than 2000 columns You can control `use_raster`
argument by explicitly setting TRUE/FALSE to it.

Set `ht_opt$message = FALSE` to turn off this message.

'magick' package is suggested to install to give better rasterization.

Set `ht_opt$message = FALSE` to turn off this message.

5.4 Exploring ppanggolin outputs

functional_modules <- read_tsv("data/pangolin/functional_modules.tsv")
  gene_families <- read_tsv("data/pangolin/gene_families.tsv", col_names = FALSE)
mean_persistent_duplication <- read_tsv("data/pangolin/mean_persistent_duplication.tsv")
modules_in_genomes <- read_tsv("data/pangolin/modules_in_genomes.tsv")
modules_RGP_lists <- read_tsv("data/pangolin/modules_RGP_lists.tsv")
modules_spots <- read_tsv("data/pangolin/modules_spots.tsv")
modules_summary <- read_tsv("data/pangolin/modules_summary.tsv")
spot_borders <- read_tsv("data/pangolin/spot_borders.tsv")
spots <- read_tsv("data/pangolin/spots.tsv")
summarize_spots <- read_tsv("data/pangolin/summarize_spots.tsv")
genomes_statistics <- read_tsv("data/pangolin/genomes_statistics.tsv", comment= "#")
regions_of_genomic_plasticity <- read_tsv("data/pangolin/regions_of_genomic_plasticity.tsv")

genome_metadata <- genome_metadata %>%
  mutate(
    genome_size = case_when(
      ID == "GCF_001406015.1" & is.na(genome_size) ~ 5073478,
      ID == "GCF_002206325.1" & is.na(genome_size) ~ 4963958,
      TRUE ~ genome_size
    )
  )


genome_metadata <- genome_metadata %>%
  mutate(
    host_class   = if_else(source == "GTDB", "Mammalia", host_class),
    host_order   = if_else(source == "GTDB", "Primates", host_order),
    host_species = if_else(source == "GTDB", "Homo sapiens", host_species)
  )

excluded_mags <- c("GCA_015060925.1")

valid_mags <- genome_metadata %>%
  filter(!ID %in% excluded_mags) %>%
  pull(ID)

genome_annotations <- genome_annotations %>%
  filter(mag_id %in% valid_mags)

genome_metadata <- genome_metadata %>%
  filter(ID %in% valid_mags)


genome_gifts <- genome_gifts[rownames(genome_gifts) %in% valid_mags, ]