Chapter 8 Antibiotic resistance analysis

8.0.1 Nº of MAGs with AMRs

# AMR presence/absence
amr_presence <- genome_annotations %>%
  filter(resistance_type == "AMR") %>%
  dplyr::select(genome, resistance_target) %>%
  distinct() 

#Add the host_type info
amr_with_host_type <- amr_presence %>%
  left_join(
    genome_metadata %>% dplyr::select(ID, host_type),
    by = c("genome" = "ID")
  )

# Count how many MAGs in each AMR
amr_mag_counts <- amr_with_host_type %>%
  group_by(host_type, resistance_target) %>%
  summarise(
    n_mags = n(),
    .groups = "drop"
  )

#Count total MAGs per host_type 
total_mags_per_host_type <- genome_metadata %>%
  group_by(host_type) %>%
  summarise(
    total_mags = n_distinct(ID),
    .groups = "drop"
  )

#Calculate proportions of MAGs from each host_type in each AMR
amr_mag_proportions <- amr_mag_counts %>%
  left_join(total_mags_per_host_type, by = "host_type") %>%
  mutate(
    proportion = n_mags / total_mags,
    absent = total_mags - n_mags
  )

8.0.1.1 Statistical testing of MAG proportions

fisher_results <- amr_mag_proportions %>%
  dplyr::select(resistance_target, host_type, n_mags, absent) %>%
  pivot_wider(
    names_from = host_type,
    values_from = c(n_mags, absent),
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(
    p_value = fisher.test(
      matrix(
        c(n_mags_animal,  absent_animal,
          n_mags_human, absent_human),
        nrow = 2,
        byrow = TRUE
      )
    )$p.value
  ) %>%
  ungroup() %>%
  mutate(p_adj = p.adjust(p_value, method = "BH"))

fisher_results <- fisher_results %>%
  mutate(
    prop_animal  = n_mags_animal  / (n_mags_animal  + absent_animal),
    prop_human = n_mags_human / (n_mags_human + absent_human),
    diff_prop = prop_human - prop_animal
  )
fisher_results%>% filter(p_adj < 0.05) 
# A tibble: 32 × 12
   resistance_target   n_mags_animal n_mags_human n_mags_NA absent_animal absent_human absent_NA p_value   p_adj
   <chr>                       <int>        <int>     <int>         <int>        <int>     <int>   <dbl>   <dbl>
 1 AMIKACIN/KANAMYCIN…            41            8        91            43           34       -90 0.00173 0.00266
 2 AMINOGLYCOSIDE                 41            8        91            43           34       -90 0.00173 0.00266
 3 AVILAMYCIN                     41            8        91            43           34       -90 0.00173 0.00266
 4 AZITHROMYCIN/ERYTH…            17            1        43            67           41       -42 0.00610 0.00820
 5 BETA-LACTAM                    41            8        91            43           34       -90 0.00173 0.00266
 6 BLEOMYCIN                      39            8        90            45           34       -89 0.00324 0.00449
 7 CHLORAMPHENICOL                41            8        91            43           34       -90 0.00173 0.00266
 8 CHLORAMPHENICOL/FL…            41            8        91            43           34       -90 0.00173 0.00266
 9 CLINDAMYCIN/LINCOM…            41            8        91            43           34       -90 0.00173 0.00266
10 FLORFENICOL/OXAZOL…            41            8        91            43           34       -90 0.00173 0.00266
# ℹ 22 more rows
# ℹ 3 more variables: prop_animal <dbl>, prop_human <dbl>, diff_prop <dbl>

8.1 Building an AMR abundance table

# select the AMR annotations
amr_abundance <- genome_annotations %>%
  filter(resistance_type == "AMR") %>%
  group_by(genome, resistance_target) %>%
  summarise(count = n(), .groups="drop") %>%
  tidyr::pivot_wider(
    names_from = resistance_target,
    values_from = count,
    values_fill = 0
  )
amr_cluster <- amr_abundance %>%
  dplyr::full_join(metadata_with_cluster, by= join_by(genome == ID))
amr_cluster %>%
  dplyr::select("BETA-LACTAM", "MACROLIDE", genome, cluster) %>%
  group_by( cluster) %>%
  summarise(mean_beta_lactam = mean(`BETA-LACTAM`, na.rm = TRUE),
            mean_macrolide = mean(MACROLIDE, na.rm = TRUE))
# A tibble: 3 × 3
  cluster mean_beta_lactam mean_macrolide
  <fct>              <dbl>          <dbl>
1 1                   6.08           11.4
2 2                   6.08           10.9
3 <NA>                6.13           10.3
amr_matrix <- amr_abundance %>%
  column_to_rownames("genome") %>%
  as.matrix()

# Abundance heatmap
min_val <- min(amr_matrix, na.rm = TRUE)
max_val <- max(amr_matrix, na.rm = TRUE)

colors <- viridis(100, option = "viridis")

breaks <- seq(min_val, max_val, length.out = 101)   

pheatmap( amr_matrix,
  color = colors,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  fontsize = 9,
  border_color = NA
)

amr_presence <- amr_abundance %>%
  column_to_rownames("genome") %>%   
  mutate(across(everything(), ~ ifelse(. > 0, 1, 0)))%>%
  as.matrix()


pheatmap(
  amr_presence,
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  color = c("white", "darkblue"),     # white = absent, black = present
  legend_breaks = c(0,1),
  legend_labels = c("Absent", "Present"),
  border_color = NA
)

pheatmap(
  amr_presence,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  color = c("white", "darkblue"),     # white = absent, black = present
  legend_breaks = c(0,1),
  legend_labels = c("Absent", "Present"),
  border_color = NA
)

8.2 PERMANOVA presence/absence

# Remove zero-variance AMR features
amr_pa_nz <- amr_presence[, colSums(amr_presence) > 0 & colSums(amr_presence) < nrow(amr_presence), drop = FALSE]

# Determine the common genomes/IDs between AMR matrix and metadata
common_ids <- base::intersect(rownames(amr_pa_nz), genome_metadata$ID)

# Report what will be kept/dropped
message("# common: ", length(common_ids))
# common: 50
message("# in AMR only: ", length(setdiff(rownames(amr_pa_nz), genome_metadata$ID)))
# in AMR only: 90
message("# in metadata only: ", length(setdiff(genome_metadata$ID, rownames(amr_pa_nz))))
# in metadata only: 77
# Subset AMR to the intersection (and keep the identical order)
amr_pa_nz <- amr_pa_nz[common_ids, , drop = FALSE]

# Build aligned metadata (unique IDs, same order as amr_pa_nz)
meta <- genome_metadata %>%
  dplyr::filter(ID %in% common_ids) %>%
  dplyr::distinct(ID, .keep_all = TRUE) %>%
  tibble::column_to_rownames("ID") %>%
  .[common_ids, , drop = FALSE]

stopifnot(identical(rownames(meta), rownames(amr_pa_nz)))

# Prepare variables for PERMANOVA
required_vars <- c("genome_size", "completeness", "host_type")

# Coerce types as needed
meta <- meta %>%
  dplyr::mutate(
    genome_size  = as.numeric(genome_size),
    completeness = as.numeric(completeness),
    host_type    = as.factor(host_type)
  )

# Align on complete cases for the required variables
ok <- stats::complete.cases(meta[, required_vars, drop = FALSE])
amr_pa_nz <- amr_pa_nz[ok, , drop = FALSE]
meta      <- meta[ok, , drop = FALSE]

stopifnot(identical(rownames(meta), rownames(amr_pa_nz)))

# Distance, dispersion, PERMANOVA
amr_dist_pa <- vegan::vegdist(amr_pa_nz, method = "jaccard", binary = TRUE)

disp_pa <- vegan::betadisper(amr_dist_pa, meta$host_type)
Warning in vegan::betadisper(amr_dist_pa, meta$host_type): some squared distances are negative and changed to
zero
anova(disp_pa)
Analysis of Variance Table

Response: Distances
          Df   Sum Sq   Mean Sq F value Pr(>F)
Groups     1 0.000057 0.0000566  0.0091 0.9242
Residuals 47 0.290868 0.0061887               
vegan::adonis2(
  amr_dist_pa ~ genome_size + completeness + host_type,
  data = meta,
  permutations = 999,
  by = "margin"
)
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999

vegan::adonis2(formula = amr_dist_pa ~ genome_size + completeness + host_type, data = meta, permutations = 999, by = "margin")
             Df SumOfSqs      R2      F Pr(>F)  
genome_size   1  0.01571 0.01467 0.7202  0.533  
completeness  1  0.01155 0.01079 0.5296  0.644  
host_type     1  0.06544 0.06113 3.0001  0.035 *
Residual     45  0.98148 0.91690                
Total        48  1.07044 1.00000                
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

8.3 PCoA presence/absence

pcoa_pa <- cmdscale(amr_dist_pa, eig = TRUE, k = 2)

variance_explained <- pcoa_pa$eig / sum(pcoa_pa$eig)


pcoa_df <- data.frame(
  ID = rownames(amr_pa_nz),
  PC1 = pcoa_pa$points[,1],
  PC2 = pcoa_pa$points[,2],
  host_type = meta$host_type
) %>%
  left_join(metadata_with_cluster, by = "ID")


pcoa_amr_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = host_type.x)) +
  geom_point(size = 2) +
  scale_color_manual(values = host_type_colors, name = "host_type") +
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

pcoa_amr_pa

ggplot(pcoa_df, aes(PC1, PC2, color = host_order)) +
  geom_point(size = 2) +
  scale_color_manual(values = host_order_colors, name = "Host Order")+
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs (colored by host order)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
  geom_point(size = 2) +

  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs (colored by completeness)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
  geom_point(size = 2) +

  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs (colored by genome_size)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = country)) +
  geom_point(size = 2) +
  scale_color_manual(values = country_palette, na.value = "grey70")+
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs (colored by country)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs (colored by continent)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = cluster)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of AMR annotations across MAGs (colored by functional cluster)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

8.4 Testing proportions (Fisher)

amr_counts <- genome_annotations %>%
  filter(resistance_type == "AMR") %>%
  dplyr::count(genome, resistance_target) %>%            
  pivot_wider(names_from = resistance_target, values_from = n, values_fill = 0) %>%
  filter(rowSums(dplyr::select(., -genome)) > 0) # Removes MAGs with no AMR genes found


amr_binary <- amr_counts %>%
  left_join(genome_metadata %>% dplyr::select(ID, host_type), by = c("genome" = "ID")) %>%
  # Convert all numeric columns to 0 or 1
  mutate(across(where(is.numeric), ~ ifelse(.x > 0, 1, 0)))

#Function to run Fischer test for each gene
run_fisher <- function(gene_name, df) {
  tab <- table(df$host_type, df[[gene_name]])
  
  # Only run if the table is actually 2x2 
  if(ncol(tab) == 2) {
    test <- fisher.test(tab)
    return(tidy(test) %>% mutate(AMR_Target = gene_name))
  } else {
    return(NULL)
  }
}

# Apply the function to all AMR columns
amr_names <- colnames(amr_counts)[-1] # everything except mag_id

fisher_results <- map_df(amr_names, ~ run_fisher(.x, amr_binary))

# Adjust for multiple testing (FDR)
fisher_results <- fisher_results %>%
  mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
  filter(p.adj < 0.05) %>%
  arrange(p.adj)

print(fisher_results)
# A tibble: 0 × 8
# ℹ 8 variables: estimate <dbl>, p.value <dbl>, conf.low <dbl>, conf.high <dbl>, method <chr>,
#   alternative <chr>, AMR_Target <chr>, p.adj <dbl>