Chapter 3 EHI MAGs exploration

3.0.1 EHI MAGs

load("data/data.Rdata")
library(tidyverse)
ehi_mags <- read_csv("data/ehi_metadata_location.csv")
Rows: 8857 Columns: 54
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (38): ID, mag_name, link_to_assembly, AB_batch, eha_number, GTDB_version, GTDB_release, domain, phylum, ...
dbl (15): fastani_ani, closest_placement_ani, closest_placement_af, completeness, contamination, size, N50, ...
lgl  (1): annotated

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(ehi_mags)
 [1] "ID"                        "mag_name"                  "link_to_assembly"         
 [4] "AB_batch"                  "eha_number"                "GTDB_version"             
 [7] "GTDB_release"              "domain"                    "phylum"                   
[10] "class"                     "order"                     "family"                   
[13] "genus"                     "species"                   "fastani_ani"              
[16] "closest_placement_ani"     "closest_placement_af"      "completeness"             
[19] "contamination"             "size"                      "GC"                       
[22] "N50"                       "coding_density"            "contigs"                  
[25] "number_genes"              "number_unannotated_genes"  "DM_batch"                 
[28] "host_species"              "host_order"                "host_class"               
[31] "locality"                  "country"                   "MAG_url"                  
[34] "anno_url"                  "kegg_url"                  "annotated"                
[37] "taxonomy_level"            "assembly_type"             "sample_type"              
[40] "gbk_url"                   "kegg_hits"                 "Capture ID"               
[43] "percent_unannotated_genes" "percent_unannotated_kegg"  "dereplicated"             
[46] "metagenomic source"        "completeness software"     "binning software"         
[49] "assembly quality"          "binning parameters"        "taxonomic identity marker"
[52] "isolation_source"          "assembly software"         "Submission"               

Check contamination and completeness

genome_biplot <- ehi_mags %>%
  dplyr::select(c(phylum,completeness,contamination,size)) %>%
  ggplot(aes(x=completeness,y=contamination,size=size,color=phylum)) +
              geom_point(alpha=0.7) +
                    xlim(c(70,100)) +
                    ylim(c(10,0)) +
                    scale_color_manual(values=phylum_colors) +
                    labs(y= "Contamination", x = "Completeness") +
                    theme_classic() +
                    theme(legend.position = "none")

#Generate contamination boxplot
genome_contamination <- ehi_mags %>%
            ggplot(aes(y=contamination)) +
                    ylim(c(10,0)) +
                    geom_boxplot(colour = "#999999", fill="#cccccc") +
                    theme_classic() +
                    theme(legend.position = "none",
                    axis.line = element_blank(),
                    axis.title = element_blank(),
                    axis.text=element_blank(),
                    axis.ticks=element_blank(),
                        plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)

genome_completeness <- ehi_mags %>%
        ggplot(aes(x=completeness)) +
                xlim(c(70,100)) +
                geom_boxplot(colour = "#999999", fill="#cccccc") +
                theme_classic() +
                theme(legend.position = "none",
                    axis.line = element_blank(),
                    axis.title = element_blank(),
                    axis.text=element_blank(),
                    axis.ticks=element_blank(),
                    plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)

#Render composite figure
#pdf("figures/completeness_contamination.pdf",width=10, height=5)
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
        layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3)))
Warning: Removed 1725 rows containing non-finite outside the scale range (`stat_boxplot()`).
Warning: No shared levels found between `names(values)` of the manual scale and the data's colour values.
No shared levels found between `names(values)` of the manual scale and the data's colour values.
Warning: Removed 1725 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 1 row containing non-finite outside the scale range (`stat_boxplot()`).

Filter by completeness and contamination

ehi_mags_filtered <- ehi_mags %>%
  filter(completeness > 90, contamination < 2.5)

3.0.2 Host classes

species_hostclass_counts <- ehi_mags_filtered %>%
  filter(!is.na(species), species != "") %>% 
  filter(!is.na(host_class), host_class != "") %>% 
  group_by(species) %>%
  summarise(
    n_mags = n(),
    n_host_classes = n_distinct(host_class),
    n_host_species = n_distinct(host_species),
    host_classes = paste(sort(unique(host_class)), collapse = ", "),
    host_species = paste(sort(unique(host_species)), collapse = ", "),
    phylum = paste(unique(phylum), collapse = ", ")
  ) %>%
  arrange(desc(n_host_classes))


species_selection_3 <- species_hostclass_counts %>%
  filter(n_host_classes >= 3)

species_selection_2 <- species_hostclass_counts %>%
  filter(n_host_classes >= 2)

species_selection_3 %>% arrange(-n_mags)
# A tibble: 14 × 7
   species                       n_mags n_host_classes n_host_species host_classes           host_species phylum
   <chr>                          <int>          <int>          <int> <chr>                  <chr>        <chr> 
 1 s__Escherichia coli               64              3             20 Aves, Mammalia, Repti… Apodemus fl… p__Ps…
 2 s__Pseudomonas aeruginosa         55              4             25 Amphibia, Aves, Mamma… Chloroceryl… p__Ps…
 3 s__Lactococcus lactis             38              3             13 Aves, Mammalia, Repti… Barbastella… p__Ba…
 4 s__Hafnia paralvei                25              3             10 Amphibia, Mammalia, R… Anguis frag… p__Ps…
 5 s__Parabacteroides distasonis     25              3             10 Aves, Mammalia, Repti… Canis famil… p__Ba…
 6 s__Enterococcus faecalis          21              3             11 Aves, Mammalia, Repti… Dasyurus ge… p__Ba…
 7 s__Bacteroides uniformis          19              3              5 Aves, Mammalia, Repti… Canis famil… p__Ba…
 8 s__Phocaeicola vulgatus           19              3              7 Aves, Mammalia, Repti… Canis famil… p__Ba…
 9 s__Citrobacter braakii            16              3              9 Amphibia, Mammalia, R… Chalcides s… p__Ps…
10 s__Moellerella wisconsensis       13              3              5 Amphibia, Aves, Mamma… Geospizopsi… p__Ps…
11 s__Enterococcus_B pernyi           8              3              5 Aves, Mammalia, Repti… Miniopterus… p__Ba…
12 s__Buttiauxella gaviniae           7              3              3 Amphibia, Mammalia, R… Lissotriton… p__Ps…
13 s__Morganella morganii_A           7              3              4 Amphibia, Mammalia, R… Dasyurus ge… p__Ps…
14 s__Aeromonas encheleia             4              3              3 Amphibia, Mammalia, R… Lissotriton… p__Ps…
species_selection_2 %>% arrange(-n_mags)
# A tibble: 63 × 7
   species                        n_mags n_host_classes n_host_species host_classes          host_species phylum
   <chr>                           <int>          <int>          <int> <chr>                 <chr>        <chr> 
 1 s__Escherichia coli                64              3             20 Aves, Mammalia, Rept… Apodemus fl… p__Ps…
 2 s__Pseudomonas aeruginosa          55              4             25 Amphibia, Aves, Mamm… Chloroceryl… p__Ps…
 3 s__Helicobacter_C marmotae         48              2              3 Aves, Mammalia        Falco eleon… p__Ca…
 4 s__Lactococcus lactis              38              3             13 Aves, Mammalia, Rept… Barbastella… p__Ba…
 5 s__Hafnia paralvei                 25              3             10 Amphibia, Mammalia, … Anguis frag… p__Ps…
 6 s__Parabacteroides distasonis      25              3             10 Aves, Mammalia, Rept… Canis famil… p__Ba…
 7 s__Enterococcus faecalis           21              3             11 Aves, Mammalia, Rept… Dasyurus ge… p__Ba…
 8 s__Bacteroides uniformis           19              3              5 Aves, Mammalia, Rept… Canis famil… p__Ba…
 9 s__Phocaeicola vulgatus            19              3              7 Aves, Mammalia, Rept… Canis famil… p__Ba…
10 s__Parabacteroides goldsteinii     17              2              8 Mammalia, Reptilia    Lepus europ… p__Ba…
# ℹ 53 more rows
species_selection_2 %>% group_by(phylum)%>%
  summarize(n_species = n())
# A tibble: 7 × 2
  phylum               n_species
  <chr>                    <int>
1 p__Bacillota                13
2 p__Bacillota_A               8
3 p__Bacteroidota             14
4 p__Campylobacterota          1
5 p__Fusobacteriota            1
6 p__Pseudomonadota           25
7 p__Verrucomicrobiota         1
species_selection_3 %>% group_by(phylum)%>%
  summarize(n_species = n())
# A tibble: 3 × 2
  phylum            n_species
  <chr>                 <int>
1 p__Bacillota              3
2 p__Bacteroidota           3
3 p__Pseudomonadota         8

3.0.3 Host classes- after filtering

species_selection_f <- species_selection_2 %>%
  dplyr::filter(n_mags > 10) %>%
  arrange(desc(n_mags))

species_selection_f
# A tibble: 22 × 7
   species                        n_mags n_host_classes n_host_species host_classes          host_species phylum
   <chr>                           <int>          <int>          <int> <chr>                 <chr>        <chr> 
 1 s__Escherichia coli                64              3             20 Aves, Mammalia, Rept… Apodemus fl… p__Ps…
 2 s__Pseudomonas aeruginosa          55              4             25 Amphibia, Aves, Mamm… Chloroceryl… p__Ps…
 3 s__Helicobacter_C marmotae         48              2              3 Aves, Mammalia        Falco eleon… p__Ca…
 4 s__Lactococcus lactis              38              3             13 Aves, Mammalia, Rept… Barbastella… p__Ba…
 5 s__Hafnia paralvei                 25              3             10 Amphibia, Mammalia, … Anguis frag… p__Ps…
 6 s__Parabacteroides distasonis      25              3             10 Aves, Mammalia, Rept… Canis famil… p__Ba…
 7 s__Enterococcus faecalis           21              3             11 Aves, Mammalia, Rept… Dasyurus ge… p__Ba…
 8 s__Bacteroides uniformis           19              3              5 Aves, Mammalia, Rept… Canis famil… p__Ba…
 9 s__Phocaeicola vulgatus            19              3              7 Aves, Mammalia, Rept… Canis famil… p__Ba…
10 s__Parabacteroides goldsteinii     17              2              8 Mammalia, Reptilia    Lepus europ… p__Ba…
# ℹ 12 more rows
species_selection_list<- species_selection_f %>% pull(species)
species_selection_f %>% group_by(phylum)%>%
  summarize(n_species = n())
# A tibble: 7 × 2
  phylum               n_species
  <chr>                    <int>
1 p__Bacillota                 4
2 p__Bacillota_A               2
3 p__Bacteroidota              4
4 p__Campylobacterota          1
5 p__Fusobacteriota            1
6 p__Pseudomonadota            9
7 p__Verrucomicrobiota         1

3.0.4 Count host classes per species

counts_species_per_class<- species_selection_f %>% 
  group_by(n_host_classes)%>%
  summarize(n_species = n())

ggplot(counts_species_per_class, aes(x = n_host_classes, y = n_species)) +
  geom_col() +
  geom_text(
    aes(label = n_species),
    vjust = -0.3,            
    size = 4
  ) +
  labs(
    title = "Number of Species per Number of Host Classes",
    x = "Number of Host Classes",
    y = "Number of Species"
  ) +
  theme_bw()

mags_with_counts <- ehi_mags_filtered %>%
  left_join(species_hostclass_counts, by = "species")


#filter for mags with >1 host class
mags_host2 <- mags_with_counts %>%
  dplyr::filter(n_host_classes >= 2)

n_mags <- mags_host2 %>% 
  group_by(species) %>%
  summarise(n_mags = n(), .groups = "drop")

We have several MAGs per species, so here we select the “best MAG” (highest completeness and lowest contamination), to have statistics based on the best of each.

best_mag_per_species <- mags_host2 %>%
  filter(species %in% species_selection_list) %>%
  group_by(species) %>%
  arrange(desc(completeness), contamination) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  transmute(
    species,
    best_mag_name = mag_name,
    best_MAG_link = link_to_assembly,
    best_genome_size = size,
    best_completeness = completeness,
    best_contamination = contamination,
    best_N50 = N50,
    best_host_classes = host_class,  
    best_assembly_type = assembly_type, 
    best_phylum =str_remove_all(phylum.x, "p__")) 
ggplot(best_mag_per_species, aes(x= species, y = best_genome_size, fill = best_phylum))+
  scale_color_manual(values=phylum_colors) +
  scale_fill_manual(values=phylum_colors) +
  geom_col()+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 90))
Warning: No shared levels found between `names(values)` of the manual scale and the data's colour values.

Filtering for Parabacteroides distasonis (make into a loop for all the relevant species later)

p_dist <- ehi_mags %>%
  filter(completeness >90,
         contamination < 2.5,
         species == "s__Parabacteroides distasonis")

write_tsv(p_dist, "data/mags_metadata/parabacteroides_distasonis_metadata.tsv")

p_dist_index <- p_dist %>%
  dplyr::select(ID, MAG_url)

write_tsv(p_dist_index, "data/mags_metadata/parabacteroides_distasonis_index.tsv")
species_selection_f <- c(
  "lactococcus_lactis",
  "hafnia_paralvei",
  "enterococcus_faecalis",
  "enterococcus_hirae",
  "bacteroides_uniformis",
  "phocaeicola_vulgatus",
  "parabacteroides_goldsteinii",
  "akkermansia_muciniphila",
  "bacteroides_fragilis",
  "providencia_rettgeri",
  "citrobacter_braakii"
)


ehi_mags_clean <- ehi_mags %>%
  mutate(
    species_clean = species %>% 
      str_replace("^s__", "") %>% 
      str_to_lower() %>% 
      str_replace_all(" ", "_")
  )


for (current_species in species_selection_f) {

  message("Processing ", current_species)

  species_data <- ehi_mags_clean %>%
    filter(
      completeness > 90,
      contamination < 2.5,
      species_clean == current_species
    )

  species_index <- species_data %>%
    dplyr::select(ID, MAG_url)

  filename <- current_species %>%
    str_replace("s__", "") %>%
    str_replace_all(" ", "_") %>%
    tolower()

  write_tsv(
    species_data,
    paste0("data/mags_metadata/", filename, "_metadata.tsv")
  )

  write_tsv(
    species_index,
    paste0("data/mags_metadata/", filename, "_index.tsv")
  )
}
### First only with two:
all_mags <- ehi_mags %>%
  filter(completeness > 90,
         contamination < 2.5,
         species %in% c("s__Parabacteroides distasonis", "s__Phocaeicola vulgatus")) %>%
  mutate(species = str_remove(species, "^s__") %>% str_to_lower() %>%
           str_replace(" ", "_"),
    out_path = paste0("data/", species, "/mags/", basename(MAG_url))
  ) %>%
  dplyr::select(species, ID, MAG_url, out_path)
write_tsv(all_mags, "all_mags_index.tsv")