Nº of MAGs with VFs
# AMR presence/absence
vf_presence <- genome_annotations %>%
filter(mag_id != "GCA_015060925.1") %>% # remove weird MAG
filter(!is.na(vf)) %>%
dplyr::select(mag_id, vf) %>%
distinct()
#Add the source info
vf_with_source <- vf_presence %>%
left_join(
genome_metadata %>% dplyr::select(ID, source),
by = c("mag_id" = "ID")
)
# Count how many MAGs in each vf
vf_mag_counts <- vf_with_source %>%
group_by(source, vf) %>%
summarise(
n_mags = n(),
.groups = "drop"
)
#Count total MAGs per source (except the outlier)
total_mags_per_source <- genome_metadata %>%
filter(ID != "GCA_015060925.1") %>%
group_by(source) %>%
summarise(
total_mags = n_distinct(ID),
.groups = "drop"
)
#Calculate proportions of MAGs from each source in each vf
vf_mag_proportions <- vf_mag_counts %>%
left_join(total_mags_per_source, by = "source") %>%
mutate(
proportion = n_mags / total_mags,
absent = total_mags - n_mags
)
Statistical testing of MAG proportions (Fisher)
fisher_results <- vf_mag_proportions %>%
dplyr::select(vf, source, n_mags, absent) %>%
pivot_wider(
names_from = source,
values_from = c(n_mags, absent),
values_fill = 0
) %>%
rowwise() %>%
mutate(
p_value = fisher.test(
matrix(
c(n_mags_EHI, absent_EHI,
n_mags_GTDB, absent_GTDB),
nrow = 2,
byrow = TRUE
)
)$p.value
) %>%
ungroup() %>%
mutate(p_adj = p.adjust(p_value, method = "BH"))
fisher_results <- fisher_results %>%
mutate(
prop_EHI = n_mags_EHI / (n_mags_EHI + absent_EHI),
prop_GTDB = n_mags_GTDB / (n_mags_GTDB + absent_GTDB),
diff_prop = prop_GTDB - prop_EHI
)
fisher_results%>% filter(p_adj < 0.05)
# A tibble: 1 × 10
vf n_mags_EHI n_mags_GTDB absent_EHI absent_GTDB p_value p_adj prop_EHI prop_GTDB diff_prop
<chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 VF0369 2 39 23 29 0.0000128 0.00443 0.08 0.574 0.494
Building a virulence factor abundance table
vf_data <- genome_annotations %>%
dplyr::select(mag_id, vf, vf_type)
vf_abundance <- genome_annotations %>%
dplyr::select(mag_id, vf) %>%
group_by(mag_id, vf) %>%
summarise(count = n(), .groups="drop") %>%
tidyr::pivot_wider(
names_from = vf,
values_from = count,
values_fill = 0
)
dim(vf_abundance)
[1] 93 347
vf_cluster <- vf_abundance %>%
dplyr::full_join(genome_metadata, by= join_by(mag_id == ID))
# make into a matrix
vf_matrix <- vf_abundance %>%
column_to_rownames("mag_id") %>%
as.matrix()
# Unscaled abundance heatmap
pheatmap(vf_matrix,
color = viridis(100, option = "viridis"),
cluster_rows = FALSE,
cluster_cols = TRUE,
fontsize = 9,
border_color = NA
)

# For scaled heatmap -> remove zero variance columns
col_vars <- apply(vf_matrix, 2, var, na.rm = TRUE)
vf_matrix_filtered <- vf_matrix[, col_vars > 0 & !is.na(col_vars)]
# Scale the filtered matrix
vf_scaled <- scale(vf_matrix_filtered, center = TRUE, scale = TRUE)
# Check for any remaining Inf/NaN values
if(any(!is.finite(vf_scaled))) {
vf_scaled[!is.finite(vf_scaled)] <- 0 # Replace Inf/NaN with 0
}
# Scaled heatmap
pheatmap(vf_scaled,
color = viridis(100, option = "viridis"),
cluster_rows = FALSE,
cluster_cols = TRUE,
fontsize = 5,
border_color = NA
)

Virulence presence/absence heatmap
vf_presence <- vf_abundance %>%
column_to_rownames("mag_id") %>%
mutate(across(everything(), ~ ifelse(. > 0, 1, 0)))%>%
as.matrix()
pheatmap(
vf_presence,
cluster_rows = FALSE,
cluster_cols = TRUE,
color = c("white", "darkblue"),
legend_breaks = c(0,1),
legend_labels = c("Absent", "Present"),
border_color = NA
)

Virulence presence/absence PERMANOVA
# remove zero-variance KOs
vf_pa_nz <- vf_presence[, colSums(vf_presence) > 0 & colSums(vf_presence) < nrow(vf_presence)]
meta <- genome_metadata %>%
filter(ID %in% rownames(vf_pa_nz)) %>%
column_to_rownames("ID")
# VERY IMPORTANT: enforce same order
meta <- meta[rownames(vf_pa_nz), ]
stopifnot(identical(rownames(meta), rownames(vf_pa_nz)))
vf_dist_pa <- vegdist(vf_pa_nz, method = "jaccard", binary = TRUE)
# dispersion check
disp_pa <- betadisper(vf_dist_pa, meta$source)
anova(disp_pa)
Analysis of Variance Table
Response: Distances
Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 0.021944 0.0219441 8.1762 0.005263 **
Residuals 91 0.244235 0.0026839
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA
adonis2(
vf_dist_pa ~ genome_size +completeness+ source,
data = meta,
permutations = 999, by = "margin"
)
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999
adonis2(formula = vf_dist_pa ~ genome_size + completeness + source, data = meta, permutations = 999, by = "margin")
Df SumOfSqs R2 F Pr(>F)
genome_size 1 0.08815 0.03397 3.3817 0.002 **
completeness 1 0.01895 0.00731 0.7272 0.785
source 1 0.05948 0.02292 2.2818 0.006 **
Residual 89 2.31982 0.89413
Total 92 2.59450 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Virulence relative abundance PERMANOVA
vf_counts <- genome_annotations %>%
filter(mag_id != "GCA_015060925.1") %>%
filter(!is.na(vf)) %>%
dplyr::count(mag_id, vf) %>%
pivot_wider(names_from = vf, values_from = n, values_fill = 0) %>%
filter(rowSums(dplyr::select(., -mag_id)) > 0) # Removes MAGs with no vf genes found
# Fixed normalization
vf_rel <- vf_counts %>%
column_to_rownames("mag_id")
vf_rel <- vf_rel / rowSums(vf_rel) # Each row sums to 1
# Remove zero variance
vf_rel_nz <- vf_rel[, apply(vf_rel, 2, sd) > 0]
# Distance matrix from normalized data
vf_dist <- vegdist(vf_rel_nz, method = "bray") # or euclidean
# Add metadata
vf_rel_nz_meta <- vf_rel_nz %>%
rownames_to_column("ID") %>%
left_join(genome_metadata, by = "ID")
#Beta dispersion test
dispersion <- betadisper(vf_dist, vf_rel_nz_meta$source)
anova(dispersion)
Analysis of Variance Table
Response: Distances
Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 0.007148 0.0071483 16.317 0.0001117 ***
Residuals 91 0.039866 0.0004381
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA test
permanova_result <- adonis2(vf_dist ~ source,
data = vf_rel_nz_meta,
permutations = 999)
print(permanova_result)
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999
adonis2(formula = vf_dist ~ source, data = vf_rel_nz_meta, permutations = 999)
Df SumOfSqs R2 F Pr(>F)
Model 1 0.05742 0.08119 8.0412 0.001 ***
Residual 91 0.64983 0.91881
Total 92 0.70725 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA test accounting for genome size
adonis2(
vf_dist ~ genome_size + source,
data = vf_rel_nz_meta,
permutations = 999,
by = "margin"
)
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999
adonis2(formula = vf_dist ~ genome_size + source, data = vf_rel_nz_meta, permutations = 999, by = "margin")
Df SumOfSqs R2 F Pr(>F)
genome_size 1 0.10339 0.14619 17.0294 0.001 ***
source 1 0.00984 0.01392 1.6211 0.079 .
Residual 90 0.54644 0.77262
Total 92 0.70725 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Virulence presence/absence PCoA
pcoa_pa <- cmdscale(vf_dist_pa, eig = TRUE, k = 2)
variance_explained <- pcoa_pa$eig / sum(pcoa_pa$eig)
pcoa_df <- data.frame(
ID = rownames(vf_pa_nz),
PC1 = pcoa_pa$points[,1],
PC2 = pcoa_pa$points[,2],
source = meta$source
) %>%
left_join(metadata_with_cluster, by = "ID")
pcoa_vf_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = source.x)) +
geom_point(size = 2) +
scale_color_manual(values = source_colors, name = "Source") +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)
pcoa_vf_pa

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs (colored by completeness)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs (colored by genome_size)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = country)) +
geom_point(size = 2) +
scale_color_manual(values = country_palette, na.value = "grey70")+
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs (colored by country)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs (colored by continent)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

pcoa_vf_pa_cluster <- ggplot(pcoa_df, aes(PC1, PC2, color = cluster)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs (colored by functional cluster)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)
pcoa_vf_pa_cluster

Virulence relative abundance PCoA
pcoa_res <- cmdscale(vf_dist, k = 2, eig = TRUE)
# Calculate variance explained from PCoA eigenvalues
variance_explained <- pcoa_res$eig / sum(pcoa_res$eig)
pcoa_df <- data.frame(
PC1 = pcoa_res$points[,1],
PC2 = pcoa_res$points[,2],
ID = rownames(vf_rel_nz)
) %>%
left_join(metadata_with_cluster, by = "ID")
ggplot(pcoa_df, aes(PC1, PC2, color = source)) +
scale_color_manual(values = source_colors) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = cluster)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs colored by GIFT cluster",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs colored by GIFT cluster",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = country)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs colored by GIFT country",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of VF annotations across MAGs colored by GIFT continent",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ALDEX2
# Prepare count matrix for ALDEx2 (raw counts, not normalized)
vf_counts_aldex <- vf_counts %>%
column_to_rownames("mag_id")
# Transpose for ALDEx2 (features as rows, samples as columns)
vf_counts_t <- t(vf_counts_aldex)
# Remove features with zero counts across all samples
vf_counts_t <- vf_counts_t[rowSums(vf_counts_t) > 0, ]
# Create conditions vector matching column order
conditions <- genome_metadata %>%
filter(ID %in% colnames(vf_counts_t)) %>%
arrange(match(ID, colnames(vf_counts_t))) %>%
pull(source)
# Run ALDEx2
set.seed(123)
aldex_vf <- aldex.clr(vf_counts_t, conditions, mc.samples = 128, denom = "all")
conditions vector supplied
operating in serial mode
computing center with all features
aldex_vf_test <- aldex.ttest(aldex_vf, paired.test = FALSE)
aldex_vf_effect <- aldex.effect(aldex_vf, CI = TRUE)
operating in serial mode
sanity check complete
rab.all complete
rab.win complete
rab of samples complete
within sample difference calculated
between group difference calculated
group summaries calculated
unpaired effect size calculated
summarizing output
# Combine results
aldex_vf_results <- data.frame(aldex_vf_test, aldex_vf_effect) %>%
rownames_to_column("vf_gene") %>%
arrange(wi.eBH)
# Significant vf genes (FDR < 0.05)
significant_vf <- aldex_vf_results %>%
filter(wi.eBH < 0.05) %>%
mutate(enriched_in = ifelse(effect > 0,
levels(factor(conditions))[2],
levels(factor(conditions))[1]))
cat("\nSignificant vf genes:", nrow(significant_vf), "\n")
Significant vf genes: 0
[1] vf_gene we.ep we.eBH wi.ep wi.eBH rab.all rab.win.EHI rab.win.GTDB diff.btw diff.win
[11] effect effect.low effect.high overlap enriched_in
<0 rows> (o 0- extensión row.names)
# Visualization
ggplot(aldex_vf_results, aes(x = effect, y = -log10(wi.eBH))) +
geom_point(aes(color = wi.eBH < 0.05), alpha = 0.6) +
scale_color_manual(values = c("grey", "red")) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
theme_minimal() +
labs(
title = "ALDEx2 results for vf genes",
x = "Effect size",
y = "-log10(BH-adjusted p-value)",
color = "Significant"
)
