Chapter 8 Virulence factors analysis

8.0.1 Nº of MAGs with VFs

# AMR presence/absence
vf_presence <- genome_annotations %>%
  filter(mag_id != "GCA_015060925.1") %>%   # remove weird MAG
  filter(!is.na(vf)) %>%
  dplyr::select(mag_id, vf) %>%
  distinct() 

#Add the source info
vf_with_source <- vf_presence %>%
  left_join(
    genome_metadata %>% dplyr::select(ID, source),
    by = c("mag_id" = "ID")
  )

# Count how many MAGs in each vf
vf_mag_counts <- vf_with_source %>%
  group_by(source, vf) %>%
  summarise(
    n_mags = n(),
    .groups = "drop"
  )

#Count total MAGs per source (except the outlier)
total_mags_per_source <- genome_metadata %>%
  filter(ID != "GCA_015060925.1") %>%
  group_by(source) %>%
  summarise(
    total_mags = n_distinct(ID),
    .groups = "drop"
  )

#Calculate proportions of MAGs from each source in each vf
vf_mag_proportions <- vf_mag_counts %>%
  left_join(total_mags_per_source, by = "source") %>%
  mutate(
    proportion = n_mags / total_mags,
    absent = total_mags - n_mags
  )

8.0.1.1 Statistical testing of MAG proportions (Fisher)

fisher_results <- vf_mag_proportions %>%
  dplyr::select(vf, source, n_mags, absent) %>%
  pivot_wider(
    names_from = source,
    values_from = c(n_mags, absent),
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(
    p_value = fisher.test(
      matrix(
        c(n_mags_EHI,  absent_EHI,
          n_mags_GTDB, absent_GTDB),
        nrow = 2,
        byrow = TRUE
      )
    )$p.value
  ) %>%
  ungroup() %>%
  mutate(p_adj = p.adjust(p_value, method = "BH"))

fisher_results <- fisher_results %>%
  mutate(
    prop_EHI  = n_mags_EHI  / (n_mags_EHI  + absent_EHI),
    prop_GTDB = n_mags_GTDB / (n_mags_GTDB + absent_GTDB),
    diff_prop = prop_GTDB - prop_EHI
  )
fisher_results%>% filter(p_adj < 0.05) 
# A tibble: 1 × 10
  vf     n_mags_EHI n_mags_GTDB absent_EHI absent_GTDB   p_value   p_adj prop_EHI prop_GTDB diff_prop
  <chr>       <int>       <int>      <int>       <int>     <dbl>   <dbl>    <dbl>     <dbl>     <dbl>
1 VF0369          2          39         23          29 0.0000128 0.00443     0.08     0.574     0.494

8.1 Building a virulence factor abundance table

vf_data <- genome_annotations %>%
  dplyr::select(mag_id, vf, vf_type) 
vf_abundance <- genome_annotations %>%
  dplyr::select(mag_id, vf) %>%
  group_by(mag_id, vf) %>%
  summarise(count = n(), .groups="drop") %>%
  tidyr::pivot_wider(
    names_from = vf,
    values_from = count,
    values_fill = 0
  )

dim(vf_abundance)
[1]  93 347
vf_cluster <- vf_abundance %>%
  dplyr::full_join(genome_metadata, by= join_by(mag_id == ID))
# make into a matrix
vf_matrix <- vf_abundance %>%
  column_to_rownames("mag_id") %>%
  as.matrix()

# Unscaled abundance heatmap
pheatmap(vf_matrix,
  color = viridis(100, option = "viridis"),
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  fontsize = 9,
  border_color = NA
)

# For scaled heatmap -> remove zero variance columns 
col_vars <- apply(vf_matrix, 2, var, na.rm = TRUE)
vf_matrix_filtered <- vf_matrix[, col_vars > 0 & !is.na(col_vars)]

# Scale the filtered matrix
vf_scaled <- scale(vf_matrix_filtered, center = TRUE, scale = TRUE)

# Check for any remaining Inf/NaN values
if(any(!is.finite(vf_scaled))) {
  vf_scaled[!is.finite(vf_scaled)] <- 0  # Replace Inf/NaN with 0
}

# Scaled heatmap
pheatmap(vf_scaled,
  color = viridis(100, option = "viridis"),
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  fontsize = 5,
  border_color = NA
)

8.2 Virulence presence/absence heatmap

vf_presence <- vf_abundance %>%
  column_to_rownames("mag_id") %>%   
  mutate(across(everything(), ~ ifelse(. > 0, 1, 0)))%>%
  as.matrix()


pheatmap(
  vf_presence,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  color = c("white", "darkblue"),     
  legend_breaks = c(0,1),
  legend_labels = c("Absent", "Present"),
  border_color = NA
)

8.3 Virulence presence/absence PERMANOVA

# remove zero-variance KOs
vf_pa_nz <- vf_presence[, colSums(vf_presence) > 0 & colSums(vf_presence) < nrow(vf_presence)]

meta <- genome_metadata %>%
  filter(ID %in% rownames(vf_pa_nz)) %>%
  column_to_rownames("ID")

# VERY IMPORTANT: enforce same order
meta <- meta[rownames(vf_pa_nz), ]

stopifnot(identical(rownames(meta), rownames(vf_pa_nz)))


vf_dist_pa <- vegdist(vf_pa_nz, method = "jaccard", binary = TRUE)

# dispersion check
disp_pa <- betadisper(vf_dist_pa, meta$source)
anova(disp_pa)
Analysis of Variance Table

Response: Distances
          Df   Sum Sq   Mean Sq F value   Pr(>F)   
Groups     1 0.021944 0.0219441  8.1762 0.005263 **
Residuals 91 0.244235 0.0026839                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA
adonis2(
  vf_dist_pa ~ genome_size +completeness+ source,
  data = meta,
  permutations = 999, by = "margin"
)
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999

adonis2(formula = vf_dist_pa ~ genome_size + completeness + source, data = meta, permutations = 999, by = "margin")
             Df SumOfSqs      R2      F Pr(>F)   
genome_size   1  0.08815 0.03397 3.3817  0.002 **
completeness  1  0.01895 0.00731 0.7272  0.785   
source        1  0.05948 0.02292 2.2818  0.006 **
Residual     89  2.31982 0.89413                 
Total        92  2.59450 1.00000                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

8.4 Virulence relative abundance PERMANOVA

vf_counts <- genome_annotations %>%
  filter(mag_id != "GCA_015060925.1") %>%
  filter(!is.na(vf)) %>%
  dplyr::count(mag_id, vf) %>%            
  pivot_wider(names_from = vf, values_from = n, values_fill = 0) %>%
  filter(rowSums(dplyr::select(., -mag_id)) > 0) # Removes MAGs with no vf genes found

# Fixed normalization
vf_rel <- vf_counts %>%
  column_to_rownames("mag_id")
vf_rel <- vf_rel / rowSums(vf_rel)  # Each row sums to 1

# Remove zero variance
vf_rel_nz <- vf_rel[, apply(vf_rel, 2, sd) > 0]

# Distance matrix from normalized data
vf_dist <- vegdist(vf_rel_nz, method = "bray")  # or euclidean

# Add metadata
vf_rel_nz_meta <- vf_rel_nz %>%
  rownames_to_column("ID") %>%
  left_join(genome_metadata, by = "ID")

#Beta dispersion test
dispersion <- betadisper(vf_dist, vf_rel_nz_meta$source)
anova(dispersion) 
Analysis of Variance Table

Response: Distances
          Df   Sum Sq   Mean Sq F value    Pr(>F)    
Groups     1 0.007148 0.0071483  16.317 0.0001117 ***
Residuals 91 0.039866 0.0004381                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA test
permanova_result <- adonis2(vf_dist ~ source, 
                            data = vf_rel_nz_meta, 
                            permutations = 999)
print(permanova_result)
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999

adonis2(formula = vf_dist ~ source, data = vf_rel_nz_meta, permutations = 999)
         Df SumOfSqs      R2      F Pr(>F)    
Model     1  0.05742 0.08119 8.0412  0.001 ***
Residual 91  0.64983 0.91881                  
Total    92  0.70725 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA test accounting for genome size
adonis2(
  vf_dist ~ genome_size + source,
  data = vf_rel_nz_meta,
  permutations = 999,
  by = "margin"
)
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999

adonis2(formula = vf_dist ~ genome_size + source, data = vf_rel_nz_meta, permutations = 999, by = "margin")
            Df SumOfSqs      R2       F Pr(>F)    
genome_size  1  0.10339 0.14619 17.0294  0.001 ***
source       1  0.00984 0.01392  1.6211  0.079 .  
Residual    90  0.54644 0.77262                   
Total       92  0.70725 1.00000                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

8.5 Virulence presence/absence PCoA

pcoa_pa <- cmdscale(vf_dist_pa, eig = TRUE, k = 2)

variance_explained <- pcoa_pa$eig / sum(pcoa_pa$eig)


pcoa_df <- data.frame(
  ID = rownames(vf_pa_nz),
  PC1 = pcoa_pa$points[,1],
  PC2 = pcoa_pa$points[,2],
  source = meta$source
) %>%
  left_join(metadata_with_cluster, by = "ID")


pcoa_vf_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = source.x)) +
  geom_point(size = 2) +
  scale_color_manual(values = source_colors, name = "Source") +
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

pcoa_vf_pa

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
  geom_point(size = 2) +

  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs (colored by completeness)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
  geom_point(size = 2) +

  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs (colored by genome_size)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = country)) +
  geom_point(size = 2) +
  scale_color_manual(values = country_palette, na.value = "grey70")+
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs (colored by country)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs (colored by continent)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

pcoa_vf_pa_cluster <- ggplot(pcoa_df, aes(PC1, PC2, color = cluster)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs (colored by functional cluster)",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

pcoa_vf_pa_cluster

8.6 Virulence relative abundance PCoA

pcoa_res <- cmdscale(vf_dist, k = 2, eig = TRUE)

# Calculate variance explained from PCoA eigenvalues
variance_explained <- pcoa_res$eig / sum(pcoa_res$eig)

pcoa_df <- data.frame(
  PC1 = pcoa_res$points[,1], 
  PC2 = pcoa_res$points[,2], 
  ID = rownames(vf_rel_nz)
) %>%
  left_join(metadata_with_cluster, by = "ID")


ggplot(pcoa_df, aes(PC1, PC2, color = source)) +
  scale_color_manual(values = source_colors) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = cluster)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs colored by GIFT cluster",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs colored by GIFT cluster",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = country)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs colored by GIFT country",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of VF annotations across MAGs colored by GIFT continent",
    x = paste0("PCoA1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2] * 100, 1), "%)")
  )

8.7 ALDEX2

# Prepare count matrix for ALDEx2 (raw counts, not normalized)
vf_counts_aldex <- vf_counts %>%
  column_to_rownames("mag_id")

# Transpose for ALDEx2 (features as rows, samples as columns)
vf_counts_t <- t(vf_counts_aldex)

# Remove features with zero counts across all samples
vf_counts_t <- vf_counts_t[rowSums(vf_counts_t) > 0, ]

# Create conditions vector matching column order
conditions <- genome_metadata %>%
  filter(ID %in% colnames(vf_counts_t)) %>%
  arrange(match(ID, colnames(vf_counts_t))) %>%
  pull(source)

# Run ALDEx2
set.seed(123)
aldex_vf <- aldex.clr(vf_counts_t, conditions, mc.samples = 128, denom = "all")
conditions vector supplied
operating in serial mode
computing center with all features
aldex_vf_test <- aldex.ttest(aldex_vf, paired.test = FALSE)
aldex_vf_effect <- aldex.effect(aldex_vf, CI = TRUE)
operating in serial mode
sanity check complete
rab.all  complete
rab.win  complete
rab of samples complete
within sample difference calculated
between group difference calculated
group summaries calculated
unpaired effect size calculated
summarizing output
# Combine results
aldex_vf_results <- data.frame(aldex_vf_test, aldex_vf_effect) %>%
  rownames_to_column("vf_gene") %>%
  arrange(wi.eBH)

# Significant vf genes (FDR < 0.05)
significant_vf <- aldex_vf_results %>%
  filter(wi.eBH < 0.05) %>%
  mutate(enriched_in = ifelse(effect > 0, 
                               levels(factor(conditions))[2],
                               levels(factor(conditions))[1]))

cat("\nSignificant vf genes:", nrow(significant_vf), "\n")

Significant vf genes: 0 
print(significant_vf)
 [1] vf_gene      we.ep        we.eBH       wi.ep        wi.eBH       rab.all      rab.win.EHI  rab.win.GTDB diff.btw     diff.win    
[11] effect       effect.low   effect.high  overlap      enriched_in 
<0 rows> (o 0- extensión row.names)
# Visualization
ggplot(aldex_vf_results, aes(x = effect, y = -log10(wi.eBH))) +
  geom_point(aes(color = wi.eBH < 0.05), alpha = 0.6) +
  scale_color_manual(values = c("grey", "red")) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  theme_minimal() +
  labs(
    title = "ALDEx2 results for vf genes",
    x = "Effect size",
    y = "-log10(BH-adjusted p-value)",
    color = "Significant"
  )