Chapter 7 Antibiotic resistance analysis

7.0.1 Nº of MAGs with AMRs

# AMR presence/absence
amr_presence <- genome_annotations %>%
  filter(mag_id != "GCA_015060925.1") %>%   # remove weird MAG
  filter(resistance_type == "AMR") %>%
  dplyr::select(mag_id, resistance_target) %>%
  distinct() 

#Add the source info
amr_with_source <- amr_presence %>%
  left_join(
    genome_metadata %>% dplyr::select(ID, source),
    by = c("mag_id" = "ID")
  )

# Count how many MAGs in each AMR
amr_mag_counts <- amr_with_source %>%
  group_by(source, resistance_target) %>%
  summarise(
    n_mags = n(),
    .groups = "drop"
  )

#Count total MAGs per source (except the outlier)
total_mags_per_source <- genome_metadata %>%
  filter(ID != "GCA_015060925.1") %>%
  group_by(source) %>%
  summarise(
    total_mags = n_distinct(ID),
    .groups = "drop"
  )

#Calculate proportions of MAGs from each source in each AMR
amr_mag_proportions <- amr_mag_counts %>%
  left_join(total_mags_per_source, by = "source") %>%
  mutate(
    proportion = n_mags / total_mags,
    absent = total_mags - n_mags
  )

7.0.1.1 Statistical testing of MAG proportions

fisher_results <- amr_mag_proportions %>%
  dplyr::select(resistance_target, source, n_mags, absent) %>%
  pivot_wider(
    names_from = source,
    values_from = c(n_mags, absent),
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(
    p_value = fisher.test(
      matrix(
        c(n_mags_EHI,  absent_EHI,
          n_mags_GTDB, absent_GTDB),
        nrow = 2,
        byrow = TRUE
      )
    )$p.value
  ) %>%
  ungroup() %>%
  mutate(p_adj = p.adjust(p_value, method = "BH"))

fisher_results <- fisher_results %>%
  mutate(
    prop_EHI  = n_mags_EHI  / (n_mags_EHI  + absent_EHI),
    prop_GTDB = n_mags_GTDB / (n_mags_GTDB + absent_GTDB),
    diff_prop = prop_GTDB - prop_EHI
  )

fisher_results%>% filter(p_adj < 0.05)

# A tibble: 0 × 10
# ℹ 10 variables: resistance_target <chr>, n_mags_EHI <int>, n_mags_GTDB <int>, absent_EHI <int>, absent_GTDB <int>, p_value <dbl>,
#   p_adj <dbl>, prop_EHI <dbl>, prop_GTDB <dbl>, diff_prop <dbl>

7.1 Building an AMR abundance table

# select the AMR annotations
amr_abundance <- genome_annotations %>%
  filter(resistance_type == "AMR") %>%
  group_by(mag_id, resistance_target) %>%
  summarise(count = n(), .groups="drop") %>%
  tidyr::pivot_wider(
    names_from = resistance_target,
    values_from = count,
    values_fill = 0
  )

amr_cluster <- amr_abundance %>%
  dplyr::full_join(metadata_with_cluster, by= join_by(mag_id == ID))

amr_cluster %>%
  dplyr::select("BETA-LACTAM", "MACROLIDE", mag_id, cluster) %>%
  group_by( cluster) %>%
  summarise(mean_beta_lactam = mean(`BETA-LACTAM`, na.rm = TRUE),
            mean_macrolide = mean(MACROLIDE, na.rm = TRUE))

# A tibble: 4 × 3
  cluster mean_beta_lactam mean_macrolide
  <fct>              <dbl>          <dbl>
1 1                   9.64           14.9
2 2                  10.6            16.1
3 3                   9.5            14.9
4 4                  10.4            14.4

amr_matrix <- amr_abundance %>%
  column_to_rownames("mag_id") %>%
  as.matrix()

# Abundance heatmap
min_val <- min(amr_matrix, na.rm = TRUE)
max_val <- max(amr_matrix, na.rm = TRUE)

colors <- viridis(100, option = "viridis")

breaks <- seq(min_val, max_val, length.out = 101)   

pheatmap( amr_matrix,
  color = colors,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  fontsize = 9,
  border_color = NA
)

#Scaled abundance heatmap
amr_scaled <- scale(amr_matrix, center = TRUE, scale = TRUE)

min_val <- min(amr_scaled, na.rm = TRUE)
max_val <- max(amr_scaled, na.rm = TRUE)

colors <- viridis(100, option = "viridis")

breaks <- seq(min_val, max_val, length.out = 101)   



pheatmap( amr_scaled,
  color = colors,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  fontsize = 9,
  border_color = NA
)

amr_presence <- amr_abundance %>%
  column_to_rownames("mag_id") %>%   
  mutate(across(everything(), ~ ifelse(. > 0, 1, 0)))%>%
  as.matrix()


pheatmap(
  amr_presence,
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  color = c("white", "darkblue"),     # white = absent, black = present
  legend_breaks = c(0,1),
  legend_labels = c("Absent", "Present"),
  border_color = NA
)

pheatmap(
  amr_presence,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  color = c("white", "darkblue"),     # white = absent, black = present
  legend_breaks = c(0,1),
  legend_labels = c("Absent", "Present"),
  border_color = NA
)

7.2 PERMANOVA relative abundance

amr_counts <- genome_annotations %>%
  filter(mag_id != "GCA_015060925.1") %>%
  filter(resistance_type == "AMR") %>%
  dplyr::count(mag_id, resistance_target) %>%            
  pivot_wider(names_from = resistance_target, values_from = n, values_fill = 0) %>%
  filter(rowSums(dplyr::select(., -mag_id)) > 0) # Removes MAGs with no AMR genes found

# Fixed normalization
amr_rel <- amr_counts %>%
  column_to_rownames("mag_id")
amr_rel <- amr_rel / rowSums(amr_rel)  # Each row sums to 1

# Remove zero variance
amr_rel_nz <- amr_rel[, apply(amr_rel, 2, sd) > 0]

# Distance matrix from normalized data
amr_dist <- vegdist(amr_rel_nz, method = "bray")  # or euclidean

# Add metadata
amr_rel_nz_meta <- amr_rel_nz %>%
  rownames_to_column("ID") %>%
  left_join(genome_metadata, by = "ID")

#Beta dispersion test
dispersion <- betadisper(amr_dist, amr_rel_nz_meta$source)
anova(dispersion)

Analysis of Variance Table

Response: Distances
          Df   Sum Sq    Mean Sq F value    Pr(>F)    
Groups     1 0.001863 0.00186304   13.53 0.0003971 ***
Residuals 91 0.012530 0.00013769                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# PERMANOVA test
permanova_result <- adonis2(amr_dist ~ source, 
                            data = amr_rel_nz_meta, 
                            permutations = 999)
print(permanova_result)

Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999

adonis2(formula = amr_dist ~ source, data = amr_rel_nz_meta, permutations = 999)
         Df SumOfSqs      R2      F Pr(>F)    
Model     1 0.010991 0.07336 7.2039  0.001 ***
Residual 91 0.138842 0.92664                  
Total    92 0.149834 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# PERMANOVA test accounting for genome size
adonis2(
  amr_dist ~ genome_size + source,
  data = amr_rel_nz_meta,
  permutations = 999,
  by = "margin"
)

Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999

adonis2(formula = amr_dist ~ genome_size + source, data = amr_rel_nz_meta, permutations = 999, by = "margin")
            Df SumOfSqs      R2       F Pr(>F)    
genome_size  1 0.022563 0.15058 17.4633  0.001 ***
source       1 0.002794 0.01865  2.1629  0.041 *  
Residual    90 0.116280 0.77606                   
Total       92 0.149834 1.00000                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

7.3 PERMANOVA presence/absence

# remove zero-variance KOs
amr_pa_nz <- amr_presence[, colSums(amr_presence) > 0 & colSums(amr_presence) < nrow(amr_presence)]

meta <- genome_metadata %>%
  filter(ID %in% rownames(amr_pa_nz)) %>%
  column_to_rownames("ID")

# VERY IMPORTANT: enforce same order
meta <- meta[rownames(amr_pa_nz), ]

stopifnot(identical(rownames(meta), rownames(amr_pa_nz)))


amr_dist_pa <- vegdist(amr_pa_nz, method = "jaccard", binary = TRUE)

# dispersion check
disp_pa <- betadisper(amr_dist_pa, meta$source)

Warning in betadisper(amr_dist_pa, meta$source): some squared distances are negative and changed to zero

anova(disp_pa)

Analysis of Variance Table

Response: Distances
          Df Sum Sq  Mean Sq F value Pr(>F)
Groups     1 0.0223 0.022297  1.9946 0.1613
Residuals 91 1.0173 0.011179

# PERMANOVA
adonis2(
  amr_dist_pa ~ genome_size +completeness+ source,
  data = meta,
  permutations = 999, by = "margin"
)

Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999

adonis2(formula = amr_dist_pa ~ genome_size + completeness + source, data = meta, permutations = 999, by = "margin")
             Df SumOfSqs      R2      F Pr(>F)  
genome_size   1   0.1634 0.03011 2.8778  0.023 *
completeness  1   0.0738 0.01359 1.2988  0.262  
source        1   0.0662 0.01220 1.1655  0.332  
Residual     89   5.0549 0.93132                
Total        92   5.4277 1.00000                
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

7.4 PCoA presence/absence

pcoa_pa <- cmdscale(amr_dist_pa, eig = TRUE, k = 2)

variance_explained <- pcoa_pa$eig / sum(pcoa_pa$eig)


pcoa_df <- data.frame(
  ID = rownames(amr_pa_nz),
  PC1 = pcoa_pa$points[,1],
  PC2 = pcoa_pa$points[,2],
  source = meta$source
) %>%
  left_join(metadata_with_cluster, by = "ID")


pcoa_amr_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = source.x)) +
  geom_point(size = 2) +
  scale_color_manual(values = source_colors, name = "Source") +
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

pcoa_amr_pa

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
  geom_point(size = 2) +

  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs (colored by completeness)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
  geom_point(size = 2) +

  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs (colored by genome_size)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = country)) +
  geom_point(size = 2) +
  scale_color_manual(values = country_palette, na.value = "grey70")+
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs (colored by country)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs (colored by continent)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = cluster)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs (colored by functional cluster)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

7.5 PCoA relative abundance

pcoa_res <- cmdscale(amr_dist, k = 2, eig = TRUE)

# Calculate variance explained from PCoA eigenvalues
variance_explained <- pcoa_res$eig / sum(pcoa_res$eig)

pcoa_df <- data.frame(
  PC1 = pcoa_res$points[,1], 
  PC2 = pcoa_res$points[,2], 
  ID = rownames(amr_rel_nz)
) %>%
  left_join(metadata_with_cluster, by = "ID")


ggplot(pcoa_df, aes(PC1, PC2, color = source)) +
  geom_point(size = 2) +
  scale_color_manual(values = source_colors) +
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = cluster)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs colored by GIFT clustering",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

cat("MAGs with AMR genes:", nrow(amr_rel_nz), "\n")

MAGs with AMR genes: 93

cat("Total AMR genes detected:", ncol(amr_rel_nz), "\n")

Total AMR genes detected: 47

7.6 DESEq2

library(DESeq2)

# Prepare the count matrix (AMR_Targets as rows, MAGs as columns)
# Ensure only integer columns are kept
counts_mtx <- amr_counts %>%
  column_to_rownames("mag_id") %>%
  t() 

# Prepare metadata (Must match column names of counts_mtx)
metadata <- genome_metadata %>%
  filter(ID %in% colnames(counts_mtx)) %>%
  column_to_rownames("ID")

# Ensure order matches exactly
metadata <- metadata[colnames(counts_mtx), , drop = FALSE]

#Create DESeq2 object
dds <- DESeqDataSetFromMatrix(countData = counts_mtx,
                              colData = metadata,
                              design = ~ source)

Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are characters, converting to factors

#Run initial steps manually instead of using DESeq()
dds <- estimateSizeFactors(dds)
dds <- estimateDispersionsGeneEst(dds)
dispersions(dds) <- mcols(dds)$dispGeneEst # Force the use of gene-wise estimates

# Run the Wald test
dds <- nbinomWaldTest(dds)

# save results (Wildlife vs Human)
res <- results(dds, contrast=c("source", "EHI", "GTDB"), alpha=0.05)


summary(res)


out of 47 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 0, 0%
LFC < 0 (down)     : 1, 2.1%
outliers [1]       : 0, 0%
low counts [2]     : 0, 0%
(mean count < 0)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

res_df <-res %>% as.data.frame %>% rownames_to_column("AMR_Target") %>% as_tibble

res_df <- filter(res_df, padj<0.05)

7.7 Testing proportions (Fisher)

amr_binary <- amr_counts %>%
  left_join(genome_metadata %>% dplyr::select(ID, source), by = c("mag_id" = "ID")) %>%
  # Convert all numeric columns to 0 or 1
  mutate(across(where(is.numeric), ~ ifelse(.x > 0, 1, 0)))

#Function to run Fischer test for each gene
run_fisher <- function(gene_name, df) {
  tab <- table(df$source, df[[gene_name]])
  
  # Only run if the table is actually 2x2 
  if(ncol(tab) == 2) {
    test <- fisher.test(tab)
    return(tidy(test) %>% mutate(AMR_Target = gene_name))
  } else {
    return(NULL)
  }
}

# Apply the function to all AMR columns
amr_names <- colnames(amr_counts)[-1] # everything except mag_id

fisher_results <- map_df(amr_names, ~ run_fisher(.x, amr_binary))

# Adjust for multiple testing (FDR)
fisher_results <- fisher_results %>%
  mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
  #filter(p.adj < 0.05) %>%
  arrange(p.adj)

print(fisher_results)

# A tibble: 25 × 8
   estimate p.value conf.low conf.high method                             alternative AMR_Target                                 p.adj
      <dbl>   <dbl>    <dbl>     <dbl> <chr>                              <chr>       <chr>                                      <dbl>
 1    0     0.00433   0          0.516 Fisher's Exact Test for Count Data two.sided   OLEANDOMYCIN                               0.108
 2    6.13  0.0428    0.814     72.5   Fisher's Exact Test for Count Data two.sided   AMINOGLYCOSIDE                             0.309
 3    0.348 0.0495    0.108      1.13  Fisher's Exact Test for Count Data two.sided   AZITHROMYCIN/CLINDAMYCIN/ERYTHROMYCIN/STR… 0.309
 4    7.88  0.0350    1.10     347.    Fisher's Exact Test for Count Data two.sided   CLINDAMYCIN/MACROLIDE/STREPTOGRAMIN        0.309
 5    0.425 0.104     0.137      1.34  Fisher's Exact Test for Count Data two.sided   CEPHALOSPORIN                              0.522
 6    5.69  0.175     0.284    348.    Fisher's Exact Test for Count Data two.sided   BLEOMYCIN                                  0.632
 7  Inf     0.269     0.0697   Inf     Fisher's Exact Test for Count Data two.sided   TRIMETHOPRIM                               0.632
 8    0.555 0.278     0.178      1.80  Fisher's Exact Test for Count Data two.sided   AMIKACIN/KANAMYCIN/TOBRAMYCIN              0.632
 9    3.62  0.278     0.457    167.    Fisher's Exact Test for Count Data two.sided   CLARITHROMYCIN                             0.632
10    3.62  0.278     0.457    167.    Fisher's Exact Test for Count Data two.sided   STREPTOMYCIN                               0.632
# ℹ 15 more rows

7.8 ALDEx2

library(ALDEx2)
# Prepare count matrix for ALDEx2 (raw counts, not normalized)
amr_counts_aldex <- amr_counts %>%
  column_to_rownames("mag_id")

# Transpose for ALDEx2 (features as rows, samples as columns)
amr_counts_t <- t(amr_counts_aldex)

# Remove features with zero counts across all samples
amr_counts_t <- amr_counts_t[rowSums(amr_counts_t) > 0, ]

# Create conditions vector matching column order
conditions <- genome_metadata %>%
  filter(ID %in% colnames(amr_counts_t)) %>%
  arrange(match(ID, colnames(amr_counts_t))) %>%
  pull(source)

# Run ALDEx2
set.seed(123)
aldex_amr <- aldex.clr(amr_counts_t, conditions, mc.samples = 128, denom = "all")

conditions vector supplied

operating in serial mode

computing center with all features

aldex_amr_test <- aldex.ttest(aldex_amr, paired.test = FALSE)
aldex_amr_effect <- aldex.effect(aldex_amr, CI = TRUE)

operating in serial mode

sanity check complete

rab.all  complete

rab.win  complete

rab of samples complete

within sample difference calculated

between group difference calculated

group summaries calculated

unpaired effect size calculated

summarizing output

# Combine results
aldex_amr_results <- data.frame(aldex_amr_test, aldex_amr_effect) %>%
  rownames_to_column("AMR_gene") %>%
  arrange(wi.eBH)

# Significant AMR genes (FDR < 0.05)
significant_amr <- aldex_amr_results %>%
  filter(wi.eBH < 0.05) %>%
  mutate(enriched_in = ifelse(effect > 0, 
                               levels(factor(conditions))[2],
                               levels(factor(conditions))[1]))

cat("\nSignificant AMR genes:", nrow(significant_amr), "\n")


Significant AMR genes: 0

print(significant_amr)

 [1] AMR_gene     we.ep        we.eBH       wi.ep        wi.eBH       rab.all      rab.win.EHI  rab.win.GTDB diff.btw     diff.win    
[11] effect       effect.low   effect.high  overlap      enriched_in 
<0 rows> (o 0- extensión row.names)

# Visualization
ggplot(aldex_amr_results, aes(x = effect, y = -log10(wi.eBH))) +
  geom_point(aes(color = wi.eBH < 0.05), alpha = 0.6) +
  scale_color_manual(values = c("grey", "red")) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  theme_minimal() +
  labs(
    title = "ALDEx2 results for AMR genes",
    x = "Effect size",
    y = "-log10(BH-adjusted p-value)",
    color = "Significant"
  )