Chapter 3 EHI MAGs exploration
3.0.1 EHI MAGs
Rows: 8857 Columns: 54
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (38): ID, mag_name, link_to_assembly, AB_batch, eha_number, GTDB_version, GTDB_release, domain, phylum, ...
dbl (15): fastani_ani, closest_placement_ani, closest_placement_af, completeness, contamination, size, N50, ...
lgl (1): annotated
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1] "ID" "mag_name" "link_to_assembly"
[4] "AB_batch" "eha_number" "GTDB_version"
[7] "GTDB_release" "domain" "phylum"
[10] "class" "order" "family"
[13] "genus" "species" "fastani_ani"
[16] "closest_placement_ani" "closest_placement_af" "completeness"
[19] "contamination" "size" "GC"
[22] "N50" "coding_density" "contigs"
[25] "number_genes" "number_unannotated_genes" "DM_batch"
[28] "host_species" "host_order" "host_class"
[31] "locality" "country" "MAG_url"
[34] "anno_url" "kegg_url" "annotated"
[37] "taxonomy_level" "assembly_type" "sample_type"
[40] "gbk_url" "kegg_hits" "Capture ID"
[43] "percent_unannotated_genes" "percent_unannotated_kegg" "dereplicated"
[46] "metagenomic source" "completeness software" "binning software"
[49] "assembly quality" "binning parameters" "taxonomic identity marker"
[52] "isolation_source" "assembly software" "Submission"
Check contamination and completeness
genome_biplot <- ehi_mags %>%
dplyr::select(c(phylum,completeness,contamination,size)) %>%
ggplot(aes(x=completeness,y=contamination,size=size,color=phylum)) +
geom_point(alpha=0.7) +
xlim(c(70,100)) +
ylim(c(10,0)) +
scale_color_manual(values=phylum_colors) +
labs(y= "Contamination", x = "Completeness") +
theme_classic() +
theme(legend.position = "none")
#Generate contamination boxplot
genome_contamination <- ehi_mags %>%
ggplot(aes(y=contamination)) +
ylim(c(10,0)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_classic() +
theme(legend.position = "none",
axis.line = element_blank(),
axis.title = element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)
genome_completeness <- ehi_mags %>%
ggplot(aes(x=completeness)) +
xlim(c(70,100)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_classic() +
theme(legend.position = "none",
axis.line = element_blank(),
axis.title = element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)
#Render composite figure
#pdf("figures/completeness_contamination.pdf",width=10, height=5)
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3)))Warning: Removed 1725 rows containing non-finite outside the scale range (`stat_boxplot()`).
Warning: No shared levels found between `names(values)` of the manual scale and the data's colour values.
No shared levels found between `names(values)` of the manual scale and the data's colour values.
Warning: Removed 1725 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 1 row containing non-finite outside the scale range (`stat_boxplot()`).
Filter by completeness and contamination
3.0.2 Host classes
species_hostclass_counts <- ehi_mags_filtered %>%
filter(!is.na(species), species != "") %>%
filter(!is.na(host_class), host_class != "") %>%
group_by(species) %>%
summarise(
n_mags = n(),
n_host_classes = n_distinct(host_class),
n_host_species = n_distinct(host_species),
host_classes = paste(sort(unique(host_class)), collapse = ", "),
host_species = paste(sort(unique(host_species)), collapse = ", "),
phylum = paste(unique(phylum), collapse = ", ")
) %>%
arrange(desc(n_host_classes))
species_selection_3 <- species_hostclass_counts %>%
filter(n_host_classes >= 3)
species_selection_2 <- species_hostclass_counts %>%
filter(n_host_classes >= 2)
species_selection_3 %>% arrange(-n_mags)# A tibble: 14 × 7
species n_mags n_host_classes n_host_species host_classes host_species phylum
<chr> <int> <int> <int> <chr> <chr> <chr>
1 s__Escherichia coli 64 3 20 Aves, Mammalia, Repti… Apodemus fl… p__Ps…
2 s__Pseudomonas aeruginosa 55 4 25 Amphibia, Aves, Mamma… Chloroceryl… p__Ps…
3 s__Lactococcus lactis 38 3 13 Aves, Mammalia, Repti… Barbastella… p__Ba…
4 s__Hafnia paralvei 25 3 10 Amphibia, Mammalia, R… Anguis frag… p__Ps…
5 s__Parabacteroides distasonis 25 3 10 Aves, Mammalia, Repti… Canis famil… p__Ba…
6 s__Enterococcus faecalis 21 3 11 Aves, Mammalia, Repti… Dasyurus ge… p__Ba…
7 s__Bacteroides uniformis 19 3 5 Aves, Mammalia, Repti… Canis famil… p__Ba…
8 s__Phocaeicola vulgatus 19 3 7 Aves, Mammalia, Repti… Canis famil… p__Ba…
9 s__Citrobacter braakii 16 3 9 Amphibia, Mammalia, R… Chalcides s… p__Ps…
10 s__Moellerella wisconsensis 13 3 5 Amphibia, Aves, Mamma… Geospizopsi… p__Ps…
11 s__Enterococcus_B pernyi 8 3 5 Aves, Mammalia, Repti… Miniopterus… p__Ba…
12 s__Buttiauxella gaviniae 7 3 3 Amphibia, Mammalia, R… Lissotriton… p__Ps…
13 s__Morganella morganii_A 7 3 4 Amphibia, Mammalia, R… Dasyurus ge… p__Ps…
14 s__Aeromonas encheleia 4 3 3 Amphibia, Mammalia, R… Lissotriton… p__Ps…
# A tibble: 63 × 7
species n_mags n_host_classes n_host_species host_classes host_species phylum
<chr> <int> <int> <int> <chr> <chr> <chr>
1 s__Escherichia coli 64 3 20 Aves, Mammalia, Rept… Apodemus fl… p__Ps…
2 s__Pseudomonas aeruginosa 55 4 25 Amphibia, Aves, Mamm… Chloroceryl… p__Ps…
3 s__Helicobacter_C marmotae 48 2 3 Aves, Mammalia Falco eleon… p__Ca…
4 s__Lactococcus lactis 38 3 13 Aves, Mammalia, Rept… Barbastella… p__Ba…
5 s__Hafnia paralvei 25 3 10 Amphibia, Mammalia, … Anguis frag… p__Ps…
6 s__Parabacteroides distasonis 25 3 10 Aves, Mammalia, Rept… Canis famil… p__Ba…
7 s__Enterococcus faecalis 21 3 11 Aves, Mammalia, Rept… Dasyurus ge… p__Ba…
8 s__Bacteroides uniformis 19 3 5 Aves, Mammalia, Rept… Canis famil… p__Ba…
9 s__Phocaeicola vulgatus 19 3 7 Aves, Mammalia, Rept… Canis famil… p__Ba…
10 s__Parabacteroides goldsteinii 17 2 8 Mammalia, Reptilia Lepus europ… p__Ba…
# ℹ 53 more rows
# A tibble: 7 × 2
phylum n_species
<chr> <int>
1 p__Bacillota 13
2 p__Bacillota_A 8
3 p__Bacteroidota 14
4 p__Campylobacterota 1
5 p__Fusobacteriota 1
6 p__Pseudomonadota 25
7 p__Verrucomicrobiota 1
# A tibble: 3 × 2
phylum n_species
<chr> <int>
1 p__Bacillota 3
2 p__Bacteroidota 3
3 p__Pseudomonadota 8
3.0.3 Host classes- after filtering
species_selection_f <- species_selection_2 %>%
dplyr::filter(n_mags > 10) %>%
arrange(desc(n_mags))
species_selection_f# A tibble: 22 × 7
species n_mags n_host_classes n_host_species host_classes host_species phylum
<chr> <int> <int> <int> <chr> <chr> <chr>
1 s__Escherichia coli 64 3 20 Aves, Mammalia, Rept… Apodemus fl… p__Ps…
2 s__Pseudomonas aeruginosa 55 4 25 Amphibia, Aves, Mamm… Chloroceryl… p__Ps…
3 s__Helicobacter_C marmotae 48 2 3 Aves, Mammalia Falco eleon… p__Ca…
4 s__Lactococcus lactis 38 3 13 Aves, Mammalia, Rept… Barbastella… p__Ba…
5 s__Hafnia paralvei 25 3 10 Amphibia, Mammalia, … Anguis frag… p__Ps…
6 s__Parabacteroides distasonis 25 3 10 Aves, Mammalia, Rept… Canis famil… p__Ba…
7 s__Enterococcus faecalis 21 3 11 Aves, Mammalia, Rept… Dasyurus ge… p__Ba…
8 s__Bacteroides uniformis 19 3 5 Aves, Mammalia, Rept… Canis famil… p__Ba…
9 s__Phocaeicola vulgatus 19 3 7 Aves, Mammalia, Rept… Canis famil… p__Ba…
10 s__Parabacteroides goldsteinii 17 2 8 Mammalia, Reptilia Lepus europ… p__Ba…
# ℹ 12 more rows
# A tibble: 7 × 2
phylum n_species
<chr> <int>
1 p__Bacillota 4
2 p__Bacillota_A 2
3 p__Bacteroidota 4
4 p__Campylobacterota 1
5 p__Fusobacteriota 1
6 p__Pseudomonadota 9
7 p__Verrucomicrobiota 1
3.0.4 Count host classes per species
counts_species_per_class<- species_selection_f %>%
group_by(n_host_classes)%>%
summarize(n_species = n())
ggplot(counts_species_per_class, aes(x = n_host_classes, y = n_species)) +
geom_col() +
geom_text(
aes(label = n_species),
vjust = -0.3,
size = 4
) +
labs(
title = "Number of Species per Number of Host Classes",
x = "Number of Host Classes",
y = "Number of Species"
) +
theme_bw()mags_with_counts <- ehi_mags_filtered %>%
left_join(species_hostclass_counts, by = "species")
#filter for mags with >1 host class
mags_host2 <- mags_with_counts %>%
dplyr::filter(n_host_classes >= 2)
n_mags <- mags_host2 %>%
group_by(species) %>%
summarise(n_mags = n(), .groups = "drop")We have several MAGs per species, so here we select the “best MAG” (highest completeness and lowest contamination), to have statistics based on the best of each.
best_mag_per_species <- mags_host2 %>%
filter(species %in% species_selection_list) %>%
group_by(species) %>%
arrange(desc(completeness), contamination) %>%
slice_head(n = 1) %>%
ungroup() %>%
transmute(
species,
best_mag_name = mag_name,
best_MAG_link = link_to_assembly,
best_genome_size = size,
best_completeness = completeness,
best_contamination = contamination,
best_N50 = N50,
best_host_classes = host_class,
best_assembly_type = assembly_type,
best_phylum =str_remove_all(phylum.x, "p__")) ggplot(best_mag_per_species, aes(x= species, y = best_genome_size, fill = best_phylum))+
scale_color_manual(values=phylum_colors) +
scale_fill_manual(values=phylum_colors) +
geom_col()+
theme_bw()+
theme(axis.text.x = element_text(angle = 90))Warning: No shared levels found between `names(values)` of the manual scale and the data's colour values.
Filtering for Parabacteroides distasonis (make into a loop for all the relevant species later)
p_dist <- ehi_mags %>%
filter(completeness >90,
contamination < 2.5,
species == "s__Parabacteroides distasonis")
write_tsv(p_dist, "data/mags_metadata/parabacteroides_distasonis_metadata.tsv")
p_dist_index <- p_dist %>%
dplyr::select(ID, MAG_url)
write_tsv(p_dist_index, "data/mags_metadata/parabacteroides_distasonis_index.tsv")species_selection_f <- c(
"lactococcus_lactis",
"hafnia_paralvei",
"enterococcus_faecalis",
"enterococcus_hirae",
"bacteroides_uniformis",
"phocaeicola_vulgatus",
"parabacteroides_goldsteinii",
"akkermansia_muciniphila",
"bacteroides_fragilis",
"providencia_rettgeri",
"citrobacter_braakii"
)
ehi_mags_clean <- ehi_mags %>%
mutate(
species_clean = species %>%
str_replace("^s__", "") %>%
str_to_lower() %>%
str_replace_all(" ", "_")
)
for (current_species in species_selection_f) {
message("Processing ", current_species)
species_data <- ehi_mags_clean %>%
filter(
completeness > 90,
contamination < 2.5,
species_clean == current_species
)
species_index <- species_data %>%
dplyr::select(ID, MAG_url)
filename <- current_species %>%
str_replace("s__", "") %>%
str_replace_all(" ", "_") %>%
tolower()
write_tsv(
species_data,
paste0("data/mags_metadata/", filename, "_metadata.tsv")
)
write_tsv(
species_index,
paste0("data/mags_metadata/", filename, "_index.tsv")
)
}### First only with two:
all_mags <- ehi_mags %>%
filter(completeness > 90,
contamination < 2.5,
species %in% c("s__Parabacteroides distasonis", "s__Phocaeicola vulgatus")) %>%
mutate(species = str_remove(species, "^s__") %>% str_to_lower() %>%
str_replace(" ", "_"),
out_path = paste0("data/", species, "/mags/", basename(MAG_url))
) %>%
dplyr::select(species, ID, MAG_url, out_path)
write_tsv(all_mags, "all_mags_index.tsv")