Chapter 9 Virulence factors analysis

9.0.1 Nº of MAGs with VFs

# AMR presence/absence
vf_presence <- genome_annotations %>%
  filter(!is.na(vf)) %>%
  dplyr::select(genome, vf) %>%
  distinct() 

#Add the host_type info
vf_with_host_type <- vf_presence %>%
  left_join(
    genome_metadata %>% dplyr::select(ID, host_type),
    by = c("genome" = "ID")
  )

# Count how many MAGs in each vf
vf_mag_counts <- vf_with_host_type %>%
  group_by(host_type, vf) %>%
  summarise(
    n_mags = n(),
    .groups = "drop"
  )

#Count total MAGs per host_type (except the outlier)
total_mags_per_host_type <- genome_metadata %>%
  group_by(host_type) %>%
  summarise(
    total_mags = n_distinct(ID),
    .groups = "drop"
  )

#Calculate proportions of MAGs from each host_type in each vf
vf_mag_proportions <- vf_mag_counts %>%
  left_join(total_mags_per_host_type, by = "host_type") %>%
  mutate(
    proportion = n_mags / total_mags,
    absent = total_mags - n_mags
  )

9.0.1.1 Statistical testing of MAG proportions (Fisher)

fisher_results <- vf_mag_proportions %>%
  dplyr::select(vf, host_type, n_mags, absent) %>%
  pivot_wider(
    names_from = host_type,
    values_from = c(n_mags, absent),
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(
    p_value = fisher.test(
      matrix(
        c(n_mags_animal,  absent_animal,
          n_mags_human, absent_human),
        nrow = 2,
        byrow = TRUE
      )
    )$p.value
  ) %>%
  ungroup() %>%
  mutate(p_adj = p.adjust(p_value, method = "BH"))

fisher_results <- fisher_results %>%
  mutate(
    prop_animal  = n_mags_animal  / (n_mags_animal  + absent_animal),
    prop_human = n_mags_human / (n_mags_human + absent_human),
    diff_prop = prop_human - prop_animal
  )

fisher_results%>% filter(p_adj < 0.05)

# A tibble: 200 × 12
   vf     n_mags_animal n_mags_human n_mags_NA absent_animal absent_human absent_NA  p_value   p_adj prop_animal
   <chr>          <int>        <int>     <int>         <int>        <int>     <int>    <dbl>   <dbl>       <dbl>
 1 VF0003            41            8        91            43           34       -90 0.00173  0.00397       0.488
 2 VF0014            40            8        89            44           34       -88 0.00189  0.00397       0.476
 3 VF0028            38            6        57            46           36       -56 0.000651 0.00397       0.452
 4 VF0033            41            8        91            43           34       -90 0.00173  0.00397       0.488
 5 VF0043            14            1        38            70           41       -37 0.0197   0.0354        0.167
 6 VF0044            41            8        91            43           34       -90 0.00173  0.00397       0.488
 7 VF0052            41            8        91            43           34       -90 0.00173  0.00397       0.488
 8 VF0056            40            8        91            44           34       -90 0.00189  0.00397       0.476
 9 VF0062            40            8        91            44           34       -90 0.00189  0.00397       0.476
10 VF0072            41            8        91            43           34       -90 0.00173  0.00397       0.488
# ℹ 190 more rows
# ℹ 2 more variables: prop_human <dbl>, diff_prop <dbl>

9.1 Building a virulence factor abundance table

vf_data <- genome_annotations %>%
  dplyr::select(genome, vf, vf_type)

vf_abundance <- genome_annotations %>%
  dplyr::select(genome, vf) %>%
  group_by(genome, vf) %>%
  summarise(count = n(), .groups="drop") %>%
  tidyr::pivot_wider(
    names_from = vf,
    values_from = count,
    values_fill = 0
  )

dim(vf_abundance)

[1] 140 361

vf_cluster <- vf_abundance %>%
  dplyr::full_join(genome_metadata, by= join_by(genome == ID))

# make into a matrix
vf_matrix <- vf_abundance %>%
  column_to_rownames("genome") %>%
  as.matrix()

# Unscaled abundance heatmap
pheatmap(vf_matrix,
  color = viridis(100, option = "viridis"),
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  fontsize = 9,
  border_color = NA
)

# For scaled heatmap -> remove zero variance columns 
col_vars <- apply(vf_matrix, 2, var, na.rm = TRUE)
vf_matrix_filtered <- vf_matrix[, col_vars > 0 & !is.na(col_vars)]

# Scale the filtered matrix
vf_scaled <- scale(vf_matrix_filtered, center = TRUE, scale = TRUE)

# Check for any remaining Inf/NaN values
if(any(!is.finite(vf_scaled))) {
  vf_scaled[!is.finite(vf_scaled)] <- 0  # Replace Inf/NaN with 0
}

# Scaled heatmap
pheatmap(vf_scaled,
  color = viridis(100, option = "viridis"),
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  fontsize = 5,
  border_color = NA
)

9.2 Virulence presence/absence heatmap

vf_presence <- vf_abundance %>%
  column_to_rownames("genome") %>%   
  mutate(across(everything(), ~ ifelse(. > 0, 1, 0)))%>%
  as.matrix()


pheatmap(
  vf_presence,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  color = c("white", "darkblue"),     
  legend_breaks = c(0,1),
  legend_labels = c("Absent", "Present"),
  border_color = NA
)

9.3 Virulence presence/absence PERMANOVA

# remove zero-variance KOs
vf_pa_nz <- vf_presence[, colSums(vf_presence) > 0 & colSums(vf_presence) < nrow(vf_presence)]

# Determine the common genomes
common_ids <- base::intersect(rownames(vf_pa_nz), genome_metadata$ID)

# Report what will be kept/dropped
message("# common: ", length(common_ids))

# common: 50

message("# in KEGG only: ", length(setdiff(rownames(amr_pa_nz), genome_metadata$ID)))

# in KEGG only: 0

message("# in metadata only: ", length(setdiff(genome_metadata$ID, rownames(vf_pa_nz))))

# in metadata only: 77

# Subset to the intersection (and keep order identical)
vf_pa_nz <- vf_pa_nz[common_ids, , drop = FALSE]

meta <- genome_metadata %>%
  dplyr::filter(ID %in% common_ids) %>%
  dplyr::distinct(ID, .keep_all = TRUE) %>%
  tibble::column_to_rownames("ID") %>%
  .[common_ids, , drop = FALSE]

stopifnot(identical(rownames(meta), rownames(vf_pa_nz)))

# VERY IMPORTANT: enforce same order
meta <- meta[rownames(vf_pa_nz), ]

stopifnot(identical(rownames(meta), rownames(vf_pa_nz)))


# Prepare variables for PERMANOVA
required_vars <- c("genome_size", "completeness", "host_type")

# Coerce types as needed
meta <- meta %>%
  dplyr::mutate(
    genome_size  = as.numeric(genome_size),
    completeness = as.numeric(completeness),
    host_type    = as.factor(host_type)
  )

# Align on complete cases (adonis2 drops NAs otherwise)
ok <- stats::complete.cases(meta[, required_vars, drop = FALSE])
vf_pa_nz <- vf_pa_nz[ok, , drop = FALSE]
meta       <- meta[ok, , drop = FALSE]
stopifnot(identical(rownames(meta), rownames(vf_pa_nz)))

# Distance, dispersion, PERMANOVA
vf_dist_pa <- vegan::vegdist(vf_pa_nz, method = "jaccard", binary = TRUE)

disp_pa <- vegan::betadisper(vf_dist_pa, meta$host_type)
anova(disp_pa)

Analysis of Variance Table

Response: Distances
          Df   Sum Sq   Mean Sq F value Pr(>F)
Groups     1 0.006957 0.0069572  1.8452 0.1808
Residuals 47 0.177212 0.0037705

vegan::adonis2(
  vf_dist_pa ~ genome_size + completeness + host_type,
  data = meta,
  permutations = 999,
  by = "margin"
)

Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999

vegan::adonis2(formula = vf_dist_pa ~ genome_size + completeness + host_type, data = meta, permutations = 999, by = "margin")
             Df SumOfSqs      R2      F Pr(>F)  
genome_size   1  0.06177 0.03364 1.7091  0.072 .
completeness  1  0.08513 0.04637 2.3557  0.015 *
host_type     1  0.06348 0.03458 1.7566  0.047 *
Residual     45  1.62621 0.88580                
Total        48  1.83586 1.00000                
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

9.4 Virulence presence/absence PCoA

pcoa_pa <- cmdscale(vf_dist_pa, eig = TRUE, k = 2)

variance_explained <- pcoa_pa$eig / sum(pcoa_pa$eig)


pcoa_df <- data.frame(
  ID = rownames(vf_pa_nz),
  PC1 = pcoa_pa$points[,1],
  PC2 = pcoa_pa$points[,2],
  host_type = meta$host_type
) %>%
  left_join(metadata_with_cluster, by = "ID")


pcoa_vf_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = host_type.x)) +
  geom_point(size = 2) +
  scale_color_manual(values = host_type_colors, name = "host_type") +
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

pcoa_vf_pa

ggplot(pcoa_df, aes(PC1, PC2, color = host_order)) +
  geom_point(size = 2) +
  scale_color_manual(values = host_order_colors, na.value = "grey70")+
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs (colored by host order)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
  geom_point(size = 2) +

  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs (colored by completeness)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
  geom_point(size = 2) +

  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs (colored by genome_size)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = country)) +
  geom_point(size = 2) +
  scale_color_manual(values = country_palette, na.value = "grey70")+
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs (colored by country)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs (colored by continent)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

pcoa_vf_pa_cluster <- ggplot(pcoa_df, aes(PC1, PC2, color = cluster)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs (colored by functional cluster)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

pcoa_vf_pa_cluster