Nº of MAGs with VFs
# AMR presence/absence
vf_presence <- genome_annotations %>%
filter(!is.na(vf)) %>%
dplyr::select(genome, vf) %>%
distinct()
#Add the host_type info
vf_with_host_type <- vf_presence %>%
left_join(
genome_metadata %>% dplyr::select(ID, host_type),
by = c("genome" = "ID")
)
# Count how many MAGs in each vf
vf_mag_counts <- vf_with_host_type %>%
group_by(host_type, vf) %>%
summarise(
n_mags = n(),
.groups = "drop"
)
#Count total MAGs per host_type (except the outlier)
total_mags_per_host_type <- genome_metadata %>%
group_by(host_type) %>%
summarise(
total_mags = n_distinct(ID),
.groups = "drop"
)
#Calculate proportions of MAGs from each host_type in each vf
vf_mag_proportions <- vf_mag_counts %>%
left_join(total_mags_per_host_type, by = "host_type") %>%
mutate(
proportion = n_mags / total_mags,
absent = total_mags - n_mags
)
Statistical testing of MAG proportions (Fisher)
fisher_results <- vf_mag_proportions %>%
dplyr::select(vf, host_type, n_mags, absent) %>%
pivot_wider(
names_from = host_type,
values_from = c(n_mags, absent),
values_fill = 0
) %>%
rowwise() %>%
mutate(
p_value = fisher.test(
matrix(
c(n_mags_animal, absent_animal,
n_mags_human, absent_human),
nrow = 2,
byrow = TRUE
)
)$p.value
) %>%
ungroup() %>%
mutate(p_adj = p.adjust(p_value, method = "BH"))
fisher_results <- fisher_results %>%
mutate(
prop_animal = n_mags_animal / (n_mags_animal + absent_animal),
prop_human = n_mags_human / (n_mags_human + absent_human),
diff_prop = prop_human - prop_animal
)
fisher_results%>% filter(p_adj < 0.05)
# A tibble: 200 × 12
vf n_mags_animal n_mags_human n_mags_NA absent_animal absent_human absent_NA p_value p_adj prop_animal
<chr> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
1 VF0003 41 8 91 43 34 -90 0.00173 0.00397 0.488
2 VF0014 40 8 89 44 34 -88 0.00189 0.00397 0.476
3 VF0028 38 6 57 46 36 -56 0.000651 0.00397 0.452
4 VF0033 41 8 91 43 34 -90 0.00173 0.00397 0.488
5 VF0043 14 1 38 70 41 -37 0.0197 0.0354 0.167
6 VF0044 41 8 91 43 34 -90 0.00173 0.00397 0.488
7 VF0052 41 8 91 43 34 -90 0.00173 0.00397 0.488
8 VF0056 40 8 91 44 34 -90 0.00189 0.00397 0.476
9 VF0062 40 8 91 44 34 -90 0.00189 0.00397 0.476
10 VF0072 41 8 91 43 34 -90 0.00173 0.00397 0.488
# ℹ 190 more rows
# ℹ 2 more variables: prop_human <dbl>, diff_prop <dbl>
Building a virulence factor abundance table
vf_data <- genome_annotations %>%
dplyr::select(genome, vf, vf_type)
vf_abundance <- genome_annotations %>%
dplyr::select(genome, vf) %>%
group_by(genome, vf) %>%
summarise(count = n(), .groups="drop") %>%
tidyr::pivot_wider(
names_from = vf,
values_from = count,
values_fill = 0
)
dim(vf_abundance)
[1] 140 361
vf_cluster <- vf_abundance %>%
dplyr::full_join(genome_metadata, by= join_by(genome == ID))
# make into a matrix
vf_matrix <- vf_abundance %>%
column_to_rownames("genome") %>%
as.matrix()
# Unscaled abundance heatmap
pheatmap(vf_matrix,
color = viridis(100, option = "viridis"),
cluster_rows = FALSE,
cluster_cols = TRUE,
fontsize = 9,
border_color = NA
)

# For scaled heatmap -> remove zero variance columns
col_vars <- apply(vf_matrix, 2, var, na.rm = TRUE)
vf_matrix_filtered <- vf_matrix[, col_vars > 0 & !is.na(col_vars)]
# Scale the filtered matrix
vf_scaled <- scale(vf_matrix_filtered, center = TRUE, scale = TRUE)
# Check for any remaining Inf/NaN values
if(any(!is.finite(vf_scaled))) {
vf_scaled[!is.finite(vf_scaled)] <- 0 # Replace Inf/NaN with 0
}
# Scaled heatmap
pheatmap(vf_scaled,
color = viridis(100, option = "viridis"),
cluster_rows = FALSE,
cluster_cols = TRUE,
fontsize = 5,
border_color = NA
)

Virulence presence/absence heatmap
vf_presence <- vf_abundance %>%
column_to_rownames("genome") %>%
mutate(across(everything(), ~ ifelse(. > 0, 1, 0)))%>%
as.matrix()
pheatmap(
vf_presence,
cluster_rows = FALSE,
cluster_cols = TRUE,
color = c("white", "darkblue"),
legend_breaks = c(0,1),
legend_labels = c("Absent", "Present"),
border_color = NA
)

Virulence presence/absence PERMANOVA
# remove zero-variance KOs
vf_pa_nz <- vf_presence[, colSums(vf_presence) > 0 & colSums(vf_presence) < nrow(vf_presence)]
# Determine the common genomes
common_ids <- base::intersect(rownames(vf_pa_nz), genome_metadata$ID)
# Report what will be kept/dropped
message("# common: ", length(common_ids))
# common: 50
message("# in KEGG only: ", length(setdiff(rownames(amr_pa_nz), genome_metadata$ID)))
# in KEGG only: 0
message("# in metadata only: ", length(setdiff(genome_metadata$ID, rownames(vf_pa_nz))))
# in metadata only: 77
# Subset to the intersection (and keep order identical)
vf_pa_nz <- vf_pa_nz[common_ids, , drop = FALSE]
meta <- genome_metadata %>%
dplyr::filter(ID %in% common_ids) %>%
dplyr::distinct(ID, .keep_all = TRUE) %>%
tibble::column_to_rownames("ID") %>%
.[common_ids, , drop = FALSE]
stopifnot(identical(rownames(meta), rownames(vf_pa_nz)))
# VERY IMPORTANT: enforce same order
meta <- meta[rownames(vf_pa_nz), ]
stopifnot(identical(rownames(meta), rownames(vf_pa_nz)))
# Prepare variables for PERMANOVA
required_vars <- c("genome_size", "completeness", "host_type")
# Coerce types as needed
meta <- meta %>%
dplyr::mutate(
genome_size = as.numeric(genome_size),
completeness = as.numeric(completeness),
host_type = as.factor(host_type)
)
# Align on complete cases (adonis2 drops NAs otherwise)
ok <- stats::complete.cases(meta[, required_vars, drop = FALSE])
vf_pa_nz <- vf_pa_nz[ok, , drop = FALSE]
meta <- meta[ok, , drop = FALSE]
stopifnot(identical(rownames(meta), rownames(vf_pa_nz)))
# Distance, dispersion, PERMANOVA
vf_dist_pa <- vegan::vegdist(vf_pa_nz, method = "jaccard", binary = TRUE)
disp_pa <- vegan::betadisper(vf_dist_pa, meta$host_type)
anova(disp_pa)
Analysis of Variance Table
Response: Distances
Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 0.006957 0.0069572 1.8452 0.1808
Residuals 47 0.177212 0.0037705
vegan::adonis2(
vf_dist_pa ~ genome_size + completeness + host_type,
data = meta,
permutations = 999,
by = "margin"
)
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999
vegan::adonis2(formula = vf_dist_pa ~ genome_size + completeness + host_type, data = meta, permutations = 999, by = "margin")
Df SumOfSqs R2 F Pr(>F)
genome_size 1 0.06177 0.03364 1.7091 0.072 .
completeness 1 0.08513 0.04637 2.3557 0.015 *
host_type 1 0.06348 0.03458 1.7566 0.047 *
Residual 45 1.62621 0.88580
Total 48 1.83586 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Virulence presence/absence PCoA
pcoa_pa <- cmdscale(vf_dist_pa, eig = TRUE, k = 2)
variance_explained <- pcoa_pa$eig / sum(pcoa_pa$eig)
pcoa_df <- data.frame(
ID = rownames(vf_pa_nz),
PC1 = pcoa_pa$points[,1],
PC2 = pcoa_pa$points[,2],
host_type = meta$host_type
) %>%
left_join(metadata_with_cluster, by = "ID")
pcoa_vf_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = host_type.x)) +
geom_point(size = 2) +
scale_color_manual(values = host_type_colors, name = "host_type") +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)
pcoa_vf_pa

ggplot(pcoa_df, aes(PC1, PC2, color = host_order)) +
geom_point(size = 2) +
scale_color_manual(values = host_order_colors, na.value = "grey70")+
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs (colored by host order)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs (colored by completeness)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs (colored by genome_size)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = country)) +
geom_point(size = 2) +
scale_color_manual(values = country_palette, na.value = "grey70")+
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs (colored by country)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs (colored by continent)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

pcoa_vf_pa_cluster <- ggplot(pcoa_df, aes(PC1, PC2, color = cluster)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs (colored by functional cluster)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)
pcoa_vf_pa_cluster
