dRep tree
fastani_comparisons <- read_csv("data/mags_metadata/lactococcus_lactis_Ndb.csv") #Secondary comparison results
Rows: 19600 Columns: 5
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): reference, querry
dbl (3): ani, alignment_coverage, primary_cluster
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
genomes <- unique(c(fastani_comparisons$reference, fastani_comparisons$querry))
# Create empty matrix
ani_matrix <- matrix(0, nrow = length(genomes), ncol = length(genomes))
rownames(ani_matrix) <- genomes
colnames(ani_matrix) <- genomes
# Fill the matrix
for(i in 1:nrow(fastani_comparisons)) {
ref <- fastani_comparisons$reference[i]
qry <- fastani_comparisons$querry[i]
ani_val <- fastani_comparisons$ani[i]
ani_matrix[ref, qry] <- ani_val
ani_matrix[qry, ref] <- ani_val # Make symmetric
}
# Set diagonal to 100
diag(ani_matrix) <- 100
# Convert to distance matrix (for tree building)
dist_matrix <- as.dist(1 - ani_matrix)
# Build tree
hc <- hclust(dist_matrix, method = "average") #drep uses hierarchical clustering
tree <- as.phylo(hc)
ggtree(tree) +
geom_tiplab(size = 2) +
theme_tree2()
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'

fastani_path <- "data/mags_metadata/lactococcus_lactis_Ndb.csv"
metadata <- genome_metadata
id_col <- "ID"
# Drop any outliers or specific IDs (use IDs without file suffix)
drop_ids <- character(0)
# Impute missing ANI pairs
# "max_distance" (most conservative), "min_ani", or a fixed numeric between 0 and 1
impute_mode <- "max_distance"
# helper: strip file extensions so IDs match metadata
clean_label <- function(x) {
x <- basename(x)
sub("\\.(fna|fa|fasta)(\\.gz)?$", "", x, ignore.case = TRUE)
}
# Load & normalize fastani table
fastani_raw <- readr::read_csv(fastani_path, show_col_types = FALSE)
fastani <- fastani_raw %>%
dplyr::rename(
reference = dplyr::any_of(c("reference", "ref", "Reference")),
query = dplyr::any_of(c("query", "querry", "Qry", "Query")),
ani = dplyr::any_of(c("ani", "ANI", "ani_1", "ANI_1"))
) %>%
dplyr::select(reference, query, ani) %>%
dplyr::filter(!is.na(reference), !is.na(query), reference != query)
# Normalize ANI to [0,1] if needed
ani_max <- max(fastani$ani, na.rm = TRUE)
if (is.finite(ani_max) && ani_max > 1) {
fastani <- fastani %>% dplyr::mutate(ani = ani / 100)
}
# Clean IDs to remove extensions
fastani <- fastani %>%
dplyr::mutate(
reference = clean_label(reference),
query = clean_label(query)
) %>%
dplyr::filter(reference != query)
# Remove duplicate pairs (keep the max ANI per pair)
fastani <- fastani %>%
dplyr::group_by(reference, query) %>%
dplyr::summarise(ani = max(ani), .groups = "drop")
# Optionally drop genomes (IDs without suffix)
if (length(drop_ids) > 0) {
fastani <- fastani %>%
dplyr::filter(!reference %in% drop_ids, !query %in% drop_ids)
}
# Build a symmetric ANI matrix with diag = 1 (dedup-safe)
build_ani_matrix <- function(df) {
genomes <- sort(unique(c(df$reference, df$query)))
both <- dplyr::bind_rows(
df %>% dplyr::transmute(reference, query, ani),
df %>% dplyr::transmute(reference = query, query = reference, ani)
) %>%
dplyr::distinct(reference, query, .keep_all = TRUE)
ani_mat <- both %>%
tidyr::complete(reference = genomes, query = genomes) %>%
tidyr::pivot_wider(
names_from = query,
values_from = ani,
values_fn = max # <-- ensure no duplicate cells cause errors
) %>%
tibble::column_to_rownames("reference") %>%
as.matrix()
diag(ani_mat) <- 1
# Impute missing if any
if (anyNA(ani_mat)) {
if (impute_mode == "max_distance") {
fill_val <- min(ani_mat, na.rm = TRUE)
} else if (impute_mode == "min_ani") {
fill_val <- min(ani_mat, na.rm = TRUE)
} else if (is.numeric(impute_mode) && impute_mode >= 0 && impute_mode <= 1) {
fill_val <- impute_mode
} else {
fill_val <- min(ani_mat, na.rm = TRUE)
}
ani_mat[is.na(ani_mat)] <- fill_val
}
ani_mat
}
ani_mat <- build_ani_matrix(fastani)
# Build the tree (UPGMA/average as in dRep)
build_tree_from_ani <- function(ani_mat, method = "average") {
dist_mat <- 1 - ani_mat
hc <- hclust(as.dist(dist_mat), method = method)
ape::as.phylo(hc)
}
tree <- build_tree_from_ani(ani_mat)
# De-duplicate metadata by ID and clean ID if needed
metadata_dedup <- metadata %>%
dplyr::mutate(!!id_col := clean_label(.data[[id_col]])) %>% # safe if already clean
dplyr::distinct(.data[[id_col]], .keep_all = TRUE)
# Add metadata to tips (tree labels are already clean IDs)
make_tip_df <- function(tree, metadata_dedup, id_col) {
stopifnot(id_col %in% names(metadata_dedup))
tibble::tibble(label = tree$tip.label) %>%
dplyr::left_join(metadata_dedup, by = setNames(id_col, "label")) %>%
dplyr::mutate(label_clean = label) # keep a consistent column if you use it in labels
}
tip_df <- make_tip_df(tree, metadata_dedup, id_col = id_col)
# Helper for ANI axis labels (unchanged)
ani_axis <- function(p, tree, show_threshold = NULL) {
df <- ggtree::fortify(tree)
max_x <- max(df$x[df$isTip])
p <- p +
scale_x_continuous(
labels = function(x) round(100 * (1 - (max_x - x)), 1)
) +
coord_cartesian(xlim = c(0, max_x + 0.005)) +
labs(x = "Average Nucleotide Identity (ANI, %)") +
theme_tree2()
if (!is.null(show_threshold)) {
thr_delta <- 1 - (show_threshold / 100)
p <- p + geom_vline(xintercept = max_x - thr_delta,
linetype = "dashed", color = "red", size = 0.8)
}
p
}
# - BASIC PLOT --
plot_tree_basic <- function(tree, tip_df, color_by = NULL, label_tips = TRUE,
point_size = 2.5, show_threshold = NULL) {
# ensure completeness numeric for continuous gradients
if ("completeness" %in% names(tip_df)) {
tip_df <- tip_df %>% dplyr::mutate(completeness = as.numeric(completeness))
}
p <- ggtree(tree, size = 0.8)
p <- p %<+% tip_df
if (!is.null(color_by) && color_by %in% names(tip_df)) {
p <- p + geom_tippoint(aes(color = !!rlang::sym(color_by)), size = point_size)
# Palette/scale logic
if (identical(color_by, "source") && exists("source_colors")) {
p <- p + scale_color_manual(values = source_colors, name = "Source", drop = FALSE)
} else if (identical(color_by, "host_order") && exists("host_order_colors")) {
p <- p + scale_color_manual(values = host_order_colors, name = "Host order", drop = FALSE)
} else if (is.numeric(tip_df[[color_by]])) {
p <- p + scale_color_gradient(low = "white", high = "#08306B",
name = paste0(color_by, " (%)"))
}
} else {
p <- p + geom_tippoint(size = point_size)
}
if (label_tips) {
lab_col <- if ("label_clean" %in% names(tip_df)) "label_clean" else "label"
p <- p + geom_tiplab(size = 3, hjust = -0.1, aes(label = .data[[lab_col]]))
}
ani_axis(p, tree, show_threshold = show_threshold)
}
# - MULTI-AESTHETIC PLOT --
plot_tree_multi <- function(tree, tip_df,
color_by = NULL,
shape_by = NULL,
size_by = NULL,
label_tips = TRUE,
show_threshold = NULL) {
if ("completeness" %in% names(tip_df)) {
tip_df <- tip_df %>% dplyr::mutate(completeness = as.numeric(completeness))
}
p <- ggtree(tree, size = 0.8)
p <- p %<+% tip_df
aes_map <- aes()
if (!is.null(color_by) && color_by %in% names(tip_df)) aes_map <- modifyList(aes_map, aes(color = !!rlang::sym(color_by)))
if (!is.null(shape_by) && shape_by %in% names(tip_df)) aes_map <- modifyList(aes_map, aes(shape = !!rlang::sym(shape_by)))
if (!is.null(size_by) && size_by %in% names(tip_df)) aes_map <- modifyList(aes_map, aes(size = !!rlang::sym(size_by)))
p <- p + geom_tippoint(aes_map, stroke = 0.3)
if (!is.null(color_by) && color_by %in% names(tip_df)) {
if (identical(color_by, "source") && exists("source_colors")) {
p <- p + scale_color_manual(values = source_colors, name = "Source", drop = FALSE)
} else if (identical(color_by, "host_order") && exists("host_order_colors")) {
p <- p + scale_color_manual(values = host_order_colors, name = "Host order", drop = FALSE)
} else if (is.numeric(tip_df[[color_by]])) {
p <- p + scale_color_gradient(low = "white", high = "#08306B",
name = paste0(color_by, " (%)"))
}
}
if (label_tips) {
lab_col <- if ("label_clean" %in% names(tip_df)) "label_clean" else "label"
p <- p + geom_tiplab(size = 3, hjust = -0.1, aes(label = .data[[lab_col]]))
}
ani_axis(p, tree, show_threshold = show_threshold)
}
p_source <- plot_tree_basic(tree, tip_df, color_by = "source", label_tips = TRUE,
show_threshold = 99.5)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'

p_host <- plot_tree_basic(tree, tip_df, color_by = "host_species",
label_tips = TRUE, show_threshold = 99.5)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'

p_host_order <- plot_tree_basic(tree, tip_df, color_by = "host_order",
label_tips = TRUE, show_threshold = 99.5)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'

p_continent <- plot_tree_basic(tree, tip_df, color_by = "continent",
label_tips = TRUE, show_threshold = 99.5)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'

# Color tips by completeness (0–100; 100 = dark blue, 0 = white)
p_comp <- plot_tree_basic(tree, tip_df, color_by = "completeness",
label_tips = TRUE, show_threshold = 99.5)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'

p_multi <- plot_tree_multi(
tree, tip_df,
color_by = "host_order",
size_by = "completeness",
label_tips = TRUE,
show_threshold = 99.5
)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'
Warning: Removed 90 rows containing missing values or values outside the scale range (`geom_point_g_gtree()`).

ggsave("plots/tree_by_source.png", p_source, width = 10, height = 8, dpi = 300)
ggsave("plots/tree_by_host.png", p_host, width = 10, height = 8, dpi = 300)
ggsave("plots/tree_combo.png", p_combo, width = 12, height = 9, dpi = 300)
ggsave("plots/tree_full_heatmap.png", p_full, width = 12, height = 10, dpi = 300)