Chapter 6 Phylogenetic analysis

6.1 dRep tree

fastani_comparisons <- read_csv("data/mags_metadata/lactococcus_lactis_Ndb.csv")  #Secondary comparison results
Rows: 19600 Columns: 5
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): reference, querry
dbl (3): ani, alignment_coverage, primary_cluster

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
genomes <- unique(c(fastani_comparisons$reference, fastani_comparisons$querry))

# Create empty matrix
ani_matrix <- matrix(0, nrow = length(genomes), ncol = length(genomes))
rownames(ani_matrix) <- genomes
colnames(ani_matrix) <- genomes

# Fill the matrix
for(i in 1:nrow(fastani_comparisons)) {
  ref <- fastani_comparisons$reference[i]
  qry <- fastani_comparisons$querry[i]
  ani_val <- fastani_comparisons$ani[i]
  
  ani_matrix[ref, qry] <- ani_val
  ani_matrix[qry, ref] <- ani_val  # Make symmetric
}

# Set diagonal to 100
diag(ani_matrix) <- 100

# Convert to distance matrix (for tree building)
dist_matrix <- as.dist(1 - ani_matrix)

# Build tree
hc <- hclust(dist_matrix, method = "average") #drep uses hierarchical clustering
tree <- as.phylo(hc)

ggtree(tree) + 
  geom_tiplab(size = 2) +
  theme_tree2()
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'

fastani_path <- "data/mags_metadata/lactococcus_lactis_Ndb.csv"

metadata <- genome_metadata
id_col   <- "ID"   

# Drop any outliers or specific IDs (use IDs without file suffix)
drop_ids <- character(0)  

# Impute missing ANI pairs 
# "max_distance" (most conservative), "min_ani", or a fixed numeric between 0 and 1
impute_mode <- "max_distance"

# helper: strip file extensions so IDs match metadata
clean_label <- function(x) {
  x <- basename(x)
  sub("\\.(fna|fa|fasta)(\\.gz)?$", "", x, ignore.case = TRUE)
}

# Load & normalize fastani table
fastani_raw <- readr::read_csv(fastani_path, show_col_types = FALSE)

fastani <- fastani_raw %>%
  dplyr::rename(
    reference = dplyr::any_of(c("reference", "ref", "Reference")),
    query     = dplyr::any_of(c("query", "querry", "Qry", "Query")),
    ani       = dplyr::any_of(c("ani", "ANI", "ani_1", "ANI_1"))
  ) %>%
  dplyr::select(reference, query, ani) %>%
  dplyr::filter(!is.na(reference), !is.na(query), reference != query)

# Normalize ANI to [0,1] if needed
ani_max <- max(fastani$ani, na.rm = TRUE)
if (is.finite(ani_max) && ani_max > 1) {
  fastani <- fastani %>% dplyr::mutate(ani = ani / 100)
}

# Clean IDs to remove extensions
fastani <- fastani %>%
  dplyr::mutate(
    reference = clean_label(reference),
    query     = clean_label(query)
  ) %>%
  dplyr::filter(reference != query)

# Remove duplicate pairs (keep the max ANI per pair)
fastani <- fastani %>%
  dplyr::group_by(reference, query) %>%
  dplyr::summarise(ani = max(ani), .groups = "drop")

# Optionally drop genomes (IDs without suffix)
if (length(drop_ids) > 0) {
  fastani <- fastani %>%
    dplyr::filter(!reference %in% drop_ids, !query %in% drop_ids)
}

# Build a symmetric ANI matrix with diag = 1 (dedup-safe)
build_ani_matrix <- function(df) {
  genomes <- sort(unique(c(df$reference, df$query)))

  both <- dplyr::bind_rows(
    df %>% dplyr::transmute(reference, query, ani),
    df %>% dplyr::transmute(reference = query, query = reference, ani)
  ) %>%
    dplyr::distinct(reference, query, .keep_all = TRUE)

  ani_mat <- both %>%
    tidyr::complete(reference = genomes, query = genomes) %>%
    tidyr::pivot_wider(
      names_from  = query,
      values_from = ani,
      values_fn   = max  # <-- ensure no duplicate cells cause errors
    ) %>%
    tibble::column_to_rownames("reference") %>%
    as.matrix()

  diag(ani_mat) <- 1

  # Impute missing if any
  if (anyNA(ani_mat)) {
    if (impute_mode == "max_distance") {
      fill_val <- min(ani_mat, na.rm = TRUE)
    } else if (impute_mode == "min_ani") {
      fill_val <- min(ani_mat, na.rm = TRUE)
    } else if (is.numeric(impute_mode) && impute_mode >= 0 && impute_mode <= 1) {
      fill_val <- impute_mode
    } else {
      fill_val <- min(ani_mat, na.rm = TRUE)
    }
    ani_mat[is.na(ani_mat)] <- fill_val
  }

  ani_mat
}

ani_mat <- build_ani_matrix(fastani)

# Build the tree (UPGMA/average as in dRep)
build_tree_from_ani <- function(ani_mat, method = "average") {
  dist_mat <- 1 - ani_mat
  hc <- hclust(as.dist(dist_mat), method = method)
  ape::as.phylo(hc)
}

tree <- build_tree_from_ani(ani_mat)

# De-duplicate metadata by ID and clean ID if needed
metadata_dedup <- metadata %>%
  dplyr::mutate(!!id_col := clean_label(.data[[id_col]])) %>%  # safe if already clean
  dplyr::distinct(.data[[id_col]], .keep_all = TRUE)

# Add metadata to tips (tree labels are already clean IDs)
make_tip_df <- function(tree, metadata_dedup, id_col) {
  stopifnot(id_col %in% names(metadata_dedup))
  tibble::tibble(label = tree$tip.label) %>%
    dplyr::left_join(metadata_dedup, by = setNames(id_col, "label")) %>%
    dplyr::mutate(label_clean = label)  # keep a consistent column if you use it in labels
}

tip_df <- make_tip_df(tree, metadata_dedup, id_col = id_col)

# Helper for ANI axis labels (unchanged)
ani_axis <- function(p, tree, show_threshold = NULL) {
  df <- ggtree::fortify(tree)
  max_x <- max(df$x[df$isTip])
  p <- p +
    scale_x_continuous(
      labels = function(x) round(100 * (1 - (max_x - x)), 1)
    ) +
    coord_cartesian(xlim = c(0, max_x + 0.005)) +
    labs(x = "Average Nucleotide Identity (ANI, %)") +
    theme_tree2()
  if (!is.null(show_threshold)) {
    thr_delta <- 1 - (show_threshold / 100)
    p <- p + geom_vline(xintercept = max_x - thr_delta,
                        linetype = "dashed", color = "red", size = 0.8)
  }
  p
}

# - BASIC PLOT --
plot_tree_basic <- function(tree, tip_df, color_by = NULL, label_tips = TRUE,
                            point_size = 2.5, show_threshold = NULL) {
  # ensure completeness numeric for continuous gradients
  if ("completeness" %in% names(tip_df)) {
    tip_df <- tip_df %>% dplyr::mutate(completeness = as.numeric(completeness))
  }

  p <- ggtree(tree, size = 0.8)
  p <- p %<+% tip_df

  if (!is.null(color_by) && color_by %in% names(tip_df)) {
    p <- p + geom_tippoint(aes(color = !!rlang::sym(color_by)), size = point_size)

    # Palette/scale logic
    if (identical(color_by, "source") && exists("source_colors")) {
      p <- p + scale_color_manual(values = source_colors, name = "Source", drop = FALSE)
    } else if (identical(color_by, "host_order") && exists("host_order_colors")) {
      p <- p + scale_color_manual(values = host_order_colors, name = "Host order", drop = FALSE)
    } else if (is.numeric(tip_df[[color_by]])) {
      p <- p + scale_color_gradient(low = "white", high = "#08306B",
                                    name = paste0(color_by, " (%)"))
    }
  } else {
    p <- p + geom_tippoint(size = point_size)
  }

  if (label_tips) {
    lab_col <- if ("label_clean" %in% names(tip_df)) "label_clean" else "label"
    p <- p + geom_tiplab(size = 3, hjust = -0.1, aes(label = .data[[lab_col]]))
  }

  ani_axis(p, tree, show_threshold = show_threshold)
}

# - MULTI-AESTHETIC PLOT --
plot_tree_multi <- function(tree, tip_df,
                            color_by = NULL,
                            shape_by = NULL,
                            size_by  = NULL,
                            label_tips = TRUE,
                            show_threshold = NULL) {
  if ("completeness" %in% names(tip_df)) {
    tip_df <- tip_df %>% dplyr::mutate(completeness = as.numeric(completeness))
  }

  p <- ggtree(tree, size = 0.8)
  p <- p %<+% tip_df

  aes_map <- aes()
  if (!is.null(color_by) && color_by %in% names(tip_df)) aes_map <- modifyList(aes_map, aes(color = !!rlang::sym(color_by)))
  if (!is.null(shape_by) && shape_by %in% names(tip_df)) aes_map <- modifyList(aes_map, aes(shape = !!rlang::sym(shape_by)))
  if (!is.null(size_by)  && size_by  %in% names(tip_df)) aes_map <- modifyList(aes_map, aes(size  = !!rlang::sym(size_by)))

  p <- p + geom_tippoint(aes_map, stroke = 0.3)

  if (!is.null(color_by) && color_by %in% names(tip_df)) {
    if (identical(color_by, "source") && exists("source_colors")) {
      p <- p + scale_color_manual(values = source_colors, name = "Source", drop = FALSE)
    } else if (identical(color_by, "host_order") && exists("host_order_colors")) {
      p <- p + scale_color_manual(values = host_order_colors, name = "Host order", drop = FALSE)
    } else if (is.numeric(tip_df[[color_by]])) {
      p <- p + scale_color_gradient(low = "white", high = "#08306B",
                                    name = paste0(color_by, " (%)"))
    }
  }

  if (label_tips) {
    lab_col <- if ("label_clean" %in% names(tip_df)) "label_clean" else "label"
    p <- p + geom_tiplab(size = 3, hjust = -0.1, aes(label = .data[[lab_col]]))
  }

  ani_axis(p, tree, show_threshold = show_threshold)
}
p_source <- plot_tree_basic(tree, tip_df, color_by = "source", label_tips = TRUE,
                            show_threshold = 99.5)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'
p_source

p_host <- plot_tree_basic(tree, tip_df, color_by = "host_species",
                          label_tips = TRUE, show_threshold = 99.5)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'
p_host

p_host_order <- plot_tree_basic(tree, tip_df, color_by = "host_order",
                          label_tips = TRUE, show_threshold = 99.5)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'
p_host_order

p_continent <- plot_tree_basic(tree, tip_df, color_by = "continent",
                          label_tips = TRUE, show_threshold = 99.5)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'
p_continent

# Color tips by completeness (0–100; 100 = dark blue, 0 = white)
p_comp <- plot_tree_basic(tree, tip_df, color_by = "completeness",
                          label_tips = TRUE, show_threshold = 99.5)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'
p_comp

p_multi <- plot_tree_multi(
  tree, tip_df,
  color_by = "host_order",
  size_by  = "completeness",
  label_tips = TRUE,
  show_threshold = 99.5
)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'
p_multi
Warning: Removed 90 rows containing missing values or values outside the scale range (`geom_point_g_gtree()`).

ggsave("plots/tree_by_source.png", p_source, width = 10, height = 8, dpi = 300)
ggsave("plots/tree_by_host.png",   p_host,   width = 10, height = 8, dpi = 300)
ggsave("plots/tree_combo.png",     p_combo,  width = 12, height = 9, dpi = 300)
ggsave("plots/tree_full_heatmap.png", p_full, width = 12, height = 10, dpi = 300)