Chapter 3 EHI MAGs exploration

3.0.1 EHI MAGs

load("data/data.Rdata")
ehi_mags <- read_csv("data/ehi_mags.csv")
Rows: 8846 Columns: 52
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (36): ID, mag_name, link_to_assembly, AB_batch, eha_number, GTDB_version, GTDB_release, domain, phylum, class, order, family, ...
dbl (15): fastani_ani, closest_placement_ani, closest_placement_af, completeness, contamination, size, N50, coding_density, contig...
lgl  (1): annotated

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(ehi_mags)
 [1] "ID"                        "mag_name"                  "link_to_assembly"          "AB_batch"                 
 [5] "eha_number"                "GTDB_version"              "GTDB_release"              "domain"                   
 [9] "phylum"                    "class"                     "order"                     "family"                   
[13] "genus"                     "species"                   "fastani_ani"               "closest_placement_ani"    
[17] "closest_placement_af"      "completeness"              "contamination"             "size"                     
[21] "GC"                        "N50"                       "coding_density"            "contigs"                  
[25] "number_genes"              "number_unannotated_genes"  "DM_batch"                  "host_species"             
[29] "host_order"                "host_class"                "MAG_url"                   "anno_url"                 
[33] "kegg_url"                  "annotated"                 "taxonomy_level"            "assembly_type"            
[37] "sample_type"               "gbk_url"                   "kegg_hits"                 "Capture ID"               
[41] "percent_unannotated_genes" "percent_unannotated_kegg"  "dereplicated"              "metagenomic source"       
[45] "completeness software"     "binning software"          "assembly quality"          "binning parameters"       
[49] "taxonomic identity marker" "isolation_source"          "assembly software"         "Submission"               

Check contamination and completeness

genome_biplot <- ehi_mags %>%
  dplyr::select(c(phylum,completeness,contamination,size)) %>%
  ggplot(aes(x=completeness,y=contamination,size=size,color=phylum)) +
              geom_point(alpha=0.7) +
                    xlim(c(70,100)) +
                    ylim(c(10,0)) +
                    scale_color_manual(values=phylum_colors) +
                    labs(y= "Contamination", x = "Completeness") +
                    theme_classic() +
                    theme(legend.position = "none")

#Generate contamination boxplot
genome_contamination <- ehi_mags %>%
            ggplot(aes(y=contamination)) +
                    ylim(c(10,0)) +
                    geom_boxplot(colour = "#999999", fill="#cccccc") +
                    theme_classic() +
                    theme(legend.position = "none",
                    axis.line = element_blank(),
                    axis.title = element_blank(),
                    axis.text=element_blank(),
                    axis.ticks=element_blank(),
                        plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)

genome_completeness <- ehi_mags %>%
        ggplot(aes(x=completeness)) +
                xlim(c(70,100)) +
                geom_boxplot(colour = "#999999", fill="#cccccc") +
                theme_classic() +
                theme(legend.position = "none",
                    axis.line = element_blank(),
                    axis.title = element_blank(),
                    axis.text=element_blank(),
                    axis.ticks=element_blank(),
                    plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)

#Render composite figure
#pdf("figures/completeness_contamination.pdf",width=10, height=5)
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
        layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3)))
Warning: Removed 1721 rows containing non-finite outside the scale range (`stat_boxplot()`).
Warning: No shared levels found between `names(values)` of the manual scale and the data's colour values.
No shared levels found between `names(values)` of the manual scale and the data's colour values.
Warning: Removed 1721 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 1 row containing non-finite outside the scale range (`stat_boxplot()`).

Filter by completeness and contamination

ehi_mags_filtered <- ehi_mags %>%
  filter(completeness > 90, contamination < 2.5)

3.0.2 Host classes

species_hostclass_counts <- ehi_mags %>%
  filter(!is.na(species), species != "") %>% 
  filter(!is.na(host_class), host_class != "") %>% 
  group_by(species) %>%
  summarise(
    n_mags = n(),
    n_host_classes = n_distinct(host_class),
    host_classes = paste(sort(unique(host_class)), collapse = ", "),
    phylum = paste(unique(phylum), collapse = ", ")
  ) %>%
  arrange(desc(n_host_classes))


species_selection <- species_hostclass_counts %>%
  filter(n_host_classes >= 3)

species_selection
# A tibble: 34 × 5
   species                         n_mags n_host_classes host_classes                       phylum             
   <chr>                            <int>          <int> <chr>                              <chr>              
 1 s__Hafnia alvei                     61              4 Amphibia, Aves, Mammalia, Reptilia p__Pseudomonadota  
 2 s__Pseudomonas aeruginosa           99              4 Amphibia, Aves, Mammalia, Reptilia p__Pseudomonadota  
 3 s__Aeromonas encheleia              18              3 Amphibia, Mammalia, Reptilia       p__Pseudomonadota  
 4 s__Bacteroides caccae               13              3 Aves, Mammalia, Reptilia           p__Bacteroidota    
 5 s__Bacteroides fragilis             20              3 Aves, Mammalia, Reptilia           p__Bacteroidota    
 6 s__Bacteroides nordii               23              3 Aves, Mammalia, Reptilia           p__Bacteroidota    
 7 s__Bacteroides thetaiotaomicron     42              3 Aves, Mammalia, Reptilia           p__Bacteroidota    
 8 s__Bacteroides uniformis            84              3 Aves, Mammalia, Reptilia           p__Bacteroidota    
 9 s__Bacteroides xylanisolvens        27              3 Aves, Mammalia, Reptilia           p__Bacteroidota    
10 s__Bilophila wadsworthia            12              3 Aves, Mammalia, Reptilia           p__Desulfobacterota
# ℹ 24 more rows
species_selection %>% group_by(phylum)%>%
  summarize(n_species = n())
# A tibble: 5 × 2
  phylum              n_species
  <chr>                   <int>
1 p__Bacillota                5
2 p__Bacillota_A              2
3 p__Bacteroidota             8
4 p__Desulfobacterota         1
5 p__Pseudomonadota          18

3.0.3 Host classes- after filtering

species_hostclass_counts_f <- ehi_mags_filtered %>%
  dplyr::filter(!is.na(species), species != "") %>% 
  dplyr::filter(!is.na(host_class), host_class != "") %>% 
  group_by(species) %>%
  summarise(
    n_mags = n(),
    n_host_classes = n_distinct(host_class),
    host_classes = paste(sort(unique(host_class)), collapse = ", "),
    phylum = paste(unique(phylum), collapse = ", ")
  ) %>%
  arrange(desc(n_host_classes))


species_selection_f <- species_hostclass_counts_f %>%
  dplyr::filter(n_host_classes >= 3, n_mags > 10) %>%
  arrange(desc(n_mags))

species_selection_f
# A tibble: 10 × 5
   species                       n_mags n_host_classes host_classes                       phylum           
   <chr>                          <int>          <int> <chr>                              <chr>            
 1 s__Escherichia coli               64              3 Aves, Mammalia, Reptilia           p__Pseudomonadota
 2 s__Pseudomonas aeruginosa         55              4 Amphibia, Aves, Mammalia, Reptilia p__Pseudomonadota
 3 s__Lactococcus lactis             38              3 Aves, Mammalia, Reptilia           p__Bacillota     
 4 s__Hafnia paralvei                25              3 Amphibia, Mammalia, Reptilia       p__Pseudomonadota
 5 s__Parabacteroides distasonis     25              3 Aves, Mammalia, Reptilia           p__Bacteroidota  
 6 s__Enterococcus faecalis          21              3 Aves, Mammalia, Reptilia           p__Bacillota     
 7 s__Bacteroides uniformis          19              3 Aves, Mammalia, Reptilia           p__Bacteroidota  
 8 s__Phocaeicola vulgatus           19              3 Aves, Mammalia, Reptilia           p__Bacteroidota  
 9 s__Citrobacter braakii            16              3 Amphibia, Mammalia, Reptilia       p__Pseudomonadota
10 s__Moellerella wisconsensis       13              3 Amphibia, Aves, Mammalia           p__Pseudomonadota
species_selection_list<- species_selection_f %>% pull(species)
species_selection_f %>% group_by(phylum)%>%
  summarize(n_species = n())
# A tibble: 3 × 2
  phylum            n_species
  <chr>                 <int>
1 p__Bacillota              2
2 p__Bacteroidota           3
3 p__Pseudomonadota         5

3.0.4 Count host classes per species

species_hostclass_counts <- ehi_mags_filtered %>%
  dplyr::filter(!is.na(species), species != "",
         !is.na(host_class), host_class != "") %>%
  group_by(species) %>%
  summarise(
    n_host_classes = n_distinct(host_class)) %>%
  arrange(desc(n_host_classes))


counts_species_per_class<- species_hostclass_counts %>% 
  group_by(n_host_classes)%>%
  summarize(n_species = n())

ggplot(counts_species_per_class, aes(x = n_host_classes, y = n_species)) +
  geom_col() +
  geom_text(
    aes(label = n_species),
    vjust = -0.3,            
    size = 4
  ) +
  labs(
    title = "Number of Species per Number of Host Classes",
    x = "Number of Host Classes",
    y = "Number of Species"
  ) +
  theme_bw()

mags_with_counts <- ehi_mags_filtered %>%
  left_join(species_hostclass_counts, by = "species")


#filter for mags with >2 host classes
mags_host3 <- mags_with_counts %>%
  dplyr::filter(n_host_classes >= 3)

n_mags <- mags_host3 %>% 
  group_by(species) %>%
  summarise(n_mags = n(), .groups = "drop")


#Species level:
species_host3 <- mags_host3 %>%
  group_by(species)%>%
  summarise(phylum = dplyr::first(phylum),
            order = dplyr::first(order),
            family = dplyr::first(family),
            genus = dplyr::first(genus),
            host_classes = paste(sort(unique(host_class)), collapse = ", "),
            host_species = paste(sort(unique(host_species)), collapse = ", "))

We have several MAGs per species, so here we select the “best MAG” (highest completeness and lowest contamination), to have statistics based on the best of each.

best_mag_per_species <- mags_host3 %>%
  group_by(species) %>%
  arrange(desc(completeness), contamination) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  transmute(
    species,
    best_mag_name = mag_name,
    best_MAG_link = link_to_assembly,
    best_genome_size = size,
    best_completeness = completeness,
    best_contamination = contamination,
    best_N50 = N50,
    best_host_classes = host_class,  
    best_assembly_type = assembly_type, 
    best_phylum =str_remove_all(phylum, "p__")) 
ggplot(best_mag_per_species, aes(x= species, y = best_genome_size, fill = best_phylum))+
  scale_color_manual(values=phylum_colors) +
  scale_fill_manual(values=phylum_colors) +
  geom_col()+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 90))
Warning: No shared levels found between `names(values)` of the manual scale and the data's colour values.

Filtering for Parabacteroides distasonis (make into a loop for all the relevant species later)

p_dist <- ehi_mags %>%
  filter(completeness >90,
         contamination < 2.5,
         species == "s__Parabacteroides distasonis")

write_tsv(p_dist, "data/mags_metadata/parabacteroides_distasonis_metadata.tsv")

p_dist_index <- p_dist %>%
  dplyr::select(ID, MAG_url)

write_tsv(p_dist_index, "data/mags_metadata/parabacteroides_distasonis_index.tsv")
#Loop through each species in species_selection list
for (i in 1:nrow(species_selection_f)) {
  current_species <- species_selection_f$species[i]

  #Filter for completeness and contamination
  #get metadata
  species_data <- ehi_mags %>%
    filter(completeness > 90,
           contamination < 2.5,
           species == current_species)
  
  #get index
  species_index <- species_data  %>%
  dplyr::select(ID, MAG_url)

  #Filename from species name
  filename <- current_species %>%
    str_replace("s__", "") %>%
    str_replace_all(" ", "_") %>%
    tolower()

  #export to tsv
  write_tsv(species_data,
            paste0("data/mags_metadata/", filename, "_metadata.tsv"))
  
  write_tsv(species_index,
            paste0("data/mags_metadata/", filename, "_index.tsv"))
}
### First only with two:
all_mags <- ehi_mags %>%
  filter(completeness > 90,
         contamination < 2.5,
         species %in% c("s__Parabacteroides distasonis", "s__Phocaeicola vulgatus")) %>%
  mutate(species = str_remove(species, "^s__") %>% str_to_lower() %>%
           str_replace(" ", "_"),
    out_path = paste0("data/", species, "/mags/", basename(MAG_url))
  ) %>%
  dplyr::select(species, ID, MAG_url, out_path)
write_tsv(all_mags, "all_mags_index.tsv")