Nº of MAGs with AMRs
# AMR presence/absence
amr_presence <- genome_annotations %>%
filter(mag_id != "GCA_015060925.1") %>% # remove weird MAG
filter(resistance_type == "AMR") %>%
dplyr::select(mag_id, resistance_target) %>%
distinct()
#Add the source info
amr_with_source <- amr_presence %>%
left_join(
genome_metadata %>% dplyr::select(ID, source),
by = c("mag_id" = "ID")
)
# Count how many MAGs in each AMR
amr_mag_counts <- amr_with_source %>%
group_by(source, resistance_target) %>%
summarise(
n_mags = n(),
.groups = "drop"
)
#Count total MAGs per source (except the outlier)
total_mags_per_source <- genome_metadata %>%
filter(ID != "GCA_015060925.1") %>%
group_by(source) %>%
summarise(
total_mags = n_distinct(ID),
.groups = "drop"
)
#Calculate proportions of MAGs from each source in each AMR
amr_mag_proportions <- amr_mag_counts %>%
left_join(total_mags_per_source, by = "source") %>%
mutate(
proportion = n_mags / total_mags,
absent = total_mags - n_mags
)
Statistical testing of MAG proportions
fisher_results <- amr_mag_proportions %>%
dplyr::select(resistance_target, source, n_mags, absent) %>%
pivot_wider(
names_from = source,
values_from = c(n_mags, absent),
values_fill = 0
) %>%
rowwise() %>%
mutate(
p_value = fisher.test(
matrix(
c(n_mags_EHI, absent_EHI,
n_mags_GTDB, absent_GTDB),
nrow = 2,
byrow = TRUE
)
)$p.value
) %>%
ungroup() %>%
mutate(p_adj = p.adjust(p_value, method = "BH"))
fisher_results <- fisher_results %>%
mutate(
prop_EHI = n_mags_EHI / (n_mags_EHI + absent_EHI),
prop_GTDB = n_mags_GTDB / (n_mags_GTDB + absent_GTDB),
diff_prop = prop_GTDB - prop_EHI
)
fisher_results%>% filter(p_adj < 0.05)
# A tibble: 0 × 10
# ℹ 10 variables: resistance_target <chr>, n_mags_EHI <int>, n_mags_GTDB <int>, absent_EHI <int>, absent_GTDB <int>, p_value <dbl>,
# p_adj <dbl>, prop_EHI <dbl>, prop_GTDB <dbl>, diff_prop <dbl>
Building an AMR abundance table
# select the AMR annotations
amr_abundance <- genome_annotations %>%
filter(resistance_type == "AMR") %>%
group_by(mag_id, resistance_target) %>%
summarise(count = n(), .groups="drop") %>%
tidyr::pivot_wider(
names_from = resistance_target,
values_from = count,
values_fill = 0
)
amr_cluster <- amr_abundance %>%
dplyr::full_join(metadata_with_cluster, by= join_by(mag_id == ID))
amr_cluster %>%
dplyr::select("BETA-LACTAM", "MACROLIDE", mag_id, cluster) %>%
group_by( cluster) %>%
summarise(mean_beta_lactam = mean(`BETA-LACTAM`, na.rm = TRUE),
mean_macrolide = mean(MACROLIDE, na.rm = TRUE))
# A tibble: 4 × 3
cluster mean_beta_lactam mean_macrolide
<fct> <dbl> <dbl>
1 1 9.64 14.9
2 2 10.6 16.1
3 3 9.5 14.9
4 4 10.4 14.4
amr_matrix <- amr_abundance %>%
column_to_rownames("mag_id") %>%
as.matrix()
# Abundance heatmap
min_val <- min(amr_matrix, na.rm = TRUE)
max_val <- max(amr_matrix, na.rm = TRUE)
colors <- viridis(100, option = "viridis")
breaks <- seq(min_val, max_val, length.out = 101)
pheatmap( amr_matrix,
color = colors,
cluster_rows = FALSE,
cluster_cols = TRUE,
fontsize = 9,
border_color = NA
)

#Scaled abundance heatmap
amr_scaled <- scale(amr_matrix, center = TRUE, scale = TRUE)
min_val <- min(amr_scaled, na.rm = TRUE)
max_val <- max(amr_scaled, na.rm = TRUE)
colors <- viridis(100, option = "viridis")
breaks <- seq(min_val, max_val, length.out = 101)
pheatmap( amr_scaled,
color = colors,
cluster_rows = FALSE,
cluster_cols = TRUE,
fontsize = 9,
border_color = NA
)

amr_presence <- amr_abundance %>%
column_to_rownames("mag_id") %>%
mutate(across(everything(), ~ ifelse(. > 0, 1, 0)))%>%
as.matrix()
pheatmap(
amr_presence,
cluster_rows = TRUE,
cluster_cols = TRUE,
color = c("white", "darkblue"), # white = absent, black = present
legend_breaks = c(0,1),
legend_labels = c("Absent", "Present"),
border_color = NA
)

pheatmap(
amr_presence,
cluster_rows = FALSE,
cluster_cols = TRUE,
color = c("white", "darkblue"), # white = absent, black = present
legend_breaks = c(0,1),
legend_labels = c("Absent", "Present"),
border_color = NA
)

PERMANOVA relative abundance
amr_counts <- genome_annotations %>%
filter(mag_id != "GCA_015060925.1") %>%
filter(resistance_type == "AMR") %>%
dplyr::count(mag_id, resistance_target) %>%
pivot_wider(names_from = resistance_target, values_from = n, values_fill = 0) %>%
filter(rowSums(dplyr::select(., -mag_id)) > 0) # Removes MAGs with no AMR genes found
# Fixed normalization
amr_rel <- amr_counts %>%
column_to_rownames("mag_id")
amr_rel <- amr_rel / rowSums(amr_rel) # Each row sums to 1
# Remove zero variance
amr_rel_nz <- amr_rel[, apply(amr_rel, 2, sd) > 0]
# Distance matrix from normalized data
amr_dist <- vegdist(amr_rel_nz, method = "bray") # or euclidean
# Add metadata
amr_rel_nz_meta <- amr_rel_nz %>%
rownames_to_column("ID") %>%
left_join(genome_metadata, by = "ID")
#Beta dispersion test
dispersion <- betadisper(amr_dist, amr_rel_nz_meta$source)
anova(dispersion)
Analysis of Variance Table
Response: Distances
Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 0.001863 0.00186304 13.53 0.0003971 ***
Residuals 91 0.012530 0.00013769
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA test
permanova_result <- adonis2(amr_dist ~ source,
data = amr_rel_nz_meta,
permutations = 999)
print(permanova_result)
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999
adonis2(formula = amr_dist ~ source, data = amr_rel_nz_meta, permutations = 999)
Df SumOfSqs R2 F Pr(>F)
Model 1 0.010991 0.07336 7.2039 0.001 ***
Residual 91 0.138842 0.92664
Total 92 0.149834 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA test accounting for genome size
adonis2(
amr_dist ~ genome_size + source,
data = amr_rel_nz_meta,
permutations = 999,
by = "margin"
)
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999
adonis2(formula = amr_dist ~ genome_size + source, data = amr_rel_nz_meta, permutations = 999, by = "margin")
Df SumOfSqs R2 F Pr(>F)
genome_size 1 0.022563 0.15058 17.4633 0.001 ***
source 1 0.002794 0.01865 2.1629 0.041 *
Residual 90 0.116280 0.77606
Total 92 0.149834 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
PERMANOVA presence/absence
# remove zero-variance KOs
amr_pa_nz <- amr_presence[, colSums(amr_presence) > 0 & colSums(amr_presence) < nrow(amr_presence)]
meta <- genome_metadata %>%
filter(ID %in% rownames(amr_pa_nz)) %>%
column_to_rownames("ID")
# VERY IMPORTANT: enforce same order
meta <- meta[rownames(amr_pa_nz), ]
stopifnot(identical(rownames(meta), rownames(amr_pa_nz)))
amr_dist_pa <- vegdist(amr_pa_nz, method = "jaccard", binary = TRUE)
# dispersion check
disp_pa <- betadisper(amr_dist_pa, meta$source)
Warning in betadisper(amr_dist_pa, meta$source): some squared distances are negative and changed to zero
Analysis of Variance Table
Response: Distances
Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 0.0223 0.022297 1.9946 0.1613
Residuals 91 1.0173 0.011179
# PERMANOVA
adonis2(
amr_dist_pa ~ genome_size +completeness+ source,
data = meta,
permutations = 999, by = "margin"
)
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999
adonis2(formula = amr_dist_pa ~ genome_size + completeness + source, data = meta, permutations = 999, by = "margin")
Df SumOfSqs R2 F Pr(>F)
genome_size 1 0.1634 0.03011 2.8778 0.023 *
completeness 1 0.0738 0.01359 1.2988 0.262
source 1 0.0662 0.01220 1.1655 0.332
Residual 89 5.0549 0.93132
Total 92 5.4277 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
PCoA presence/absence
pcoa_pa <- cmdscale(amr_dist_pa, eig = TRUE, k = 2)
variance_explained <- pcoa_pa$eig / sum(pcoa_pa$eig)
pcoa_df <- data.frame(
ID = rownames(amr_pa_nz),
PC1 = pcoa_pa$points[,1],
PC2 = pcoa_pa$points[,2],
source = meta$source
) %>%
left_join(metadata_with_cluster, by = "ID")
pcoa_amr_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = source.x)) +
geom_point(size = 2) +
scale_color_manual(values = source_colors, name = "Source") +
theme_minimal() +
labs(
title = "PCoA of AMR annotations across MAGs",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)
pcoa_amr_pa

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of AMR annotations across MAGs (colored by completeness)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of AMR annotations across MAGs (colored by genome_size)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = country)) +
geom_point(size = 2) +
scale_color_manual(values = country_palette, na.value = "grey70")+
theme_minimal() +
labs(
title = "PCoA of AMR annotations across MAGs (colored by country)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of AMR annotations across MAGs (colored by continent)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = cluster)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of AMR annotations across MAGs (colored by functional cluster)",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

PCoA relative abundance
pcoa_res <- cmdscale(amr_dist, k = 2, eig = TRUE)
# Calculate variance explained from PCoA eigenvalues
variance_explained <- pcoa_res$eig / sum(pcoa_res$eig)
pcoa_df <- data.frame(
PC1 = pcoa_res$points[,1],
PC2 = pcoa_res$points[,2],
ID = rownames(amr_rel_nz)
) %>%
left_join(metadata_with_cluster, by = "ID")
ggplot(pcoa_df, aes(PC1, PC2, color = source)) +
geom_point(size = 2) +
scale_color_manual(values = source_colors) +
theme_minimal() +
labs(
title = "PCoA of AMR annotations across MAGs",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = cluster)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of AMR annotations across MAGs colored by GIFT clustering",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of AMR annotations across MAGs",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of AMR annotations across MAGs",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of AMR annotations across MAGs",
x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
)

cat("MAGs with AMR genes:", nrow(amr_rel_nz), "\n")
MAGs with AMR genes: 93
cat("Total AMR genes detected:", ncol(amr_rel_nz), "\n")
Total AMR genes detected: 47
DESEq2
library(DESeq2)
# Prepare the count matrix (AMR_Targets as rows, MAGs as columns)
# Ensure only integer columns are kept
counts_mtx <- amr_counts %>%
column_to_rownames("mag_id") %>%
t()
# Prepare metadata (Must match column names of counts_mtx)
metadata <- genome_metadata %>%
filter(ID %in% colnames(counts_mtx)) %>%
column_to_rownames("ID")
# Ensure order matches exactly
metadata <- metadata[colnames(counts_mtx), , drop = FALSE]
#Create DESeq2 object
dds <- DESeqDataSetFromMatrix(countData = counts_mtx,
colData = metadata,
design = ~ source)
Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are characters, converting to factors
#Run initial steps manually instead of using DESeq()
dds <- estimateSizeFactors(dds)
dds <- estimateDispersionsGeneEst(dds)
dispersions(dds) <- mcols(dds)$dispGeneEst # Force the use of gene-wise estimates
# Run the Wald test
dds <- nbinomWaldTest(dds)
# save results (Wildlife vs Human)
res <- results(dds, contrast=c("source", "EHI", "GTDB"), alpha=0.05)
summary(res)
out of 47 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 0, 0%
LFC < 0 (down) : 1, 2.1%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 0)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
res_df <-res %>% as.data.frame %>% rownames_to_column("AMR_Target") %>% as_tibble
res_df <- filter(res_df, padj<0.05)
Testing proportions (Fisher)
amr_binary <- amr_counts %>%
left_join(genome_metadata %>% dplyr::select(ID, source), by = c("mag_id" = "ID")) %>%
# Convert all numeric columns to 0 or 1
mutate(across(where(is.numeric), ~ ifelse(.x > 0, 1, 0)))
#Function to run Fischer test for each gene
run_fisher <- function(gene_name, df) {
tab <- table(df$source, df[[gene_name]])
# Only run if the table is actually 2x2
if(ncol(tab) == 2) {
test <- fisher.test(tab)
return(tidy(test) %>% mutate(AMR_Target = gene_name))
} else {
return(NULL)
}
}
# Apply the function to all AMR columns
amr_names <- colnames(amr_counts)[-1] # everything except mag_id
fisher_results <- map_df(amr_names, ~ run_fisher(.x, amr_binary))
# Adjust for multiple testing (FDR)
fisher_results <- fisher_results %>%
mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
#filter(p.adj < 0.05) %>%
arrange(p.adj)
print(fisher_results)
# A tibble: 25 × 8
estimate p.value conf.low conf.high method alternative AMR_Target p.adj
<dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl>
1 0 0.00433 0 0.516 Fisher's Exact Test for Count Data two.sided OLEANDOMYCIN 0.108
2 6.13 0.0428 0.814 72.5 Fisher's Exact Test for Count Data two.sided AMINOGLYCOSIDE 0.309
3 0.348 0.0495 0.108 1.13 Fisher's Exact Test for Count Data two.sided AZITHROMYCIN/CLINDAMYCIN/ERYTHROMYCIN/STR… 0.309
4 7.88 0.0350 1.10 347. Fisher's Exact Test for Count Data two.sided CLINDAMYCIN/MACROLIDE/STREPTOGRAMIN 0.309
5 0.425 0.104 0.137 1.34 Fisher's Exact Test for Count Data two.sided CEPHALOSPORIN 0.522
6 5.69 0.175 0.284 348. Fisher's Exact Test for Count Data two.sided BLEOMYCIN 0.632
7 Inf 0.269 0.0697 Inf Fisher's Exact Test for Count Data two.sided TRIMETHOPRIM 0.632
8 0.555 0.278 0.178 1.80 Fisher's Exact Test for Count Data two.sided AMIKACIN/KANAMYCIN/TOBRAMYCIN 0.632
9 3.62 0.278 0.457 167. Fisher's Exact Test for Count Data two.sided CLARITHROMYCIN 0.632
10 3.62 0.278 0.457 167. Fisher's Exact Test for Count Data two.sided STREPTOMYCIN 0.632
# ℹ 15 more rows
ALDEx2
library(ALDEx2)
# Prepare count matrix for ALDEx2 (raw counts, not normalized)
amr_counts_aldex <- amr_counts %>%
column_to_rownames("mag_id")
# Transpose for ALDEx2 (features as rows, samples as columns)
amr_counts_t <- t(amr_counts_aldex)
# Remove features with zero counts across all samples
amr_counts_t <- amr_counts_t[rowSums(amr_counts_t) > 0, ]
# Create conditions vector matching column order
conditions <- genome_metadata %>%
filter(ID %in% colnames(amr_counts_t)) %>%
arrange(match(ID, colnames(amr_counts_t))) %>%
pull(source)
# Run ALDEx2
set.seed(123)
aldex_amr <- aldex.clr(amr_counts_t, conditions, mc.samples = 128, denom = "all")
conditions vector supplied
operating in serial mode
computing center with all features
aldex_amr_test <- aldex.ttest(aldex_amr, paired.test = FALSE)
aldex_amr_effect <- aldex.effect(aldex_amr, CI = TRUE)
operating in serial mode
sanity check complete
rab.all complete
rab.win complete
rab of samples complete
within sample difference calculated
between group difference calculated
group summaries calculated
unpaired effect size calculated
summarizing output
# Combine results
aldex_amr_results <- data.frame(aldex_amr_test, aldex_amr_effect) %>%
rownames_to_column("AMR_gene") %>%
arrange(wi.eBH)
# Significant AMR genes (FDR < 0.05)
significant_amr <- aldex_amr_results %>%
filter(wi.eBH < 0.05) %>%
mutate(enriched_in = ifelse(effect > 0,
levels(factor(conditions))[2],
levels(factor(conditions))[1]))
cat("\nSignificant AMR genes:", nrow(significant_amr), "\n")
Significant AMR genes: 0
[1] AMR_gene we.ep we.eBH wi.ep wi.eBH rab.all rab.win.EHI rab.win.GTDB diff.btw diff.win
[11] effect effect.low effect.high overlap enriched_in
<0 rows> (o 0- extensión row.names)
# Visualization
ggplot(aldex_amr_results, aes(x = effect, y = -log10(wi.eBH))) +
geom_point(aes(color = wi.eBH < 0.05), alpha = 0.6) +
scale_color_manual(values = c("grey", "red")) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
theme_minimal() +
labs(
title = "ALDEx2 results for AMR genes",
x = "Effect size",
y = "-log10(BH-adjusted p-value)",
color = "Significant"
)
