Chapter 3 EHI MAGs exploration
3.0.1 EHI MAGs
Rows: 8846 Columns: 52
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (36): ID, mag_name, link_to_assembly, AB_batch, eha_number, GTDB_version, GTDB_release, domain, phylum, class, order, family, ...
dbl (15): fastani_ani, closest_placement_ani, closest_placement_af, completeness, contamination, size, N50, coding_density, contig...
lgl (1): annotated
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1] "ID" "mag_name" "link_to_assembly" "AB_batch"
[5] "eha_number" "GTDB_version" "GTDB_release" "domain"
[9] "phylum" "class" "order" "family"
[13] "genus" "species" "fastani_ani" "closest_placement_ani"
[17] "closest_placement_af" "completeness" "contamination" "size"
[21] "GC" "N50" "coding_density" "contigs"
[25] "number_genes" "number_unannotated_genes" "DM_batch" "host_species"
[29] "host_order" "host_class" "MAG_url" "anno_url"
[33] "kegg_url" "annotated" "taxonomy_level" "assembly_type"
[37] "sample_type" "gbk_url" "kegg_hits" "Capture ID"
[41] "percent_unannotated_genes" "percent_unannotated_kegg" "dereplicated" "metagenomic source"
[45] "completeness software" "binning software" "assembly quality" "binning parameters"
[49] "taxonomic identity marker" "isolation_source" "assembly software" "Submission"
Check contamination and completeness
genome_biplot <- ehi_mags %>%
dplyr::select(c(phylum,completeness,contamination,size)) %>%
ggplot(aes(x=completeness,y=contamination,size=size,color=phylum)) +
geom_point(alpha=0.7) +
xlim(c(70,100)) +
ylim(c(10,0)) +
scale_color_manual(values=phylum_colors) +
labs(y= "Contamination", x = "Completeness") +
theme_classic() +
theme(legend.position = "none")
#Generate contamination boxplot
genome_contamination <- ehi_mags %>%
ggplot(aes(y=contamination)) +
ylim(c(10,0)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_classic() +
theme(legend.position = "none",
axis.line = element_blank(),
axis.title = element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)
genome_completeness <- ehi_mags %>%
ggplot(aes(x=completeness)) +
xlim(c(70,100)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_classic() +
theme(legend.position = "none",
axis.line = element_blank(),
axis.title = element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)
#Render composite figure
#pdf("figures/completeness_contamination.pdf",width=10, height=5)
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3)))Warning: Removed 1721 rows containing non-finite outside the scale range (`stat_boxplot()`).
Warning: No shared levels found between `names(values)` of the manual scale and the data's colour values.
No shared levels found between `names(values)` of the manual scale and the data's colour values.
Warning: Removed 1721 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 1 row containing non-finite outside the scale range (`stat_boxplot()`).
Filter by completeness and contamination
3.0.2 Host classes
species_hostclass_counts <- ehi_mags %>%
filter(!is.na(species), species != "") %>%
filter(!is.na(host_class), host_class != "") %>%
group_by(species) %>%
summarise(
n_mags = n(),
n_host_classes = n_distinct(host_class),
host_classes = paste(sort(unique(host_class)), collapse = ", "),
phylum = paste(unique(phylum), collapse = ", ")
) %>%
arrange(desc(n_host_classes))
species_selection <- species_hostclass_counts %>%
filter(n_host_classes >= 3)
species_selection# A tibble: 34 × 5
species n_mags n_host_classes host_classes phylum
<chr> <int> <int> <chr> <chr>
1 s__Hafnia alvei 61 4 Amphibia, Aves, Mammalia, Reptilia p__Pseudomonadota
2 s__Pseudomonas aeruginosa 99 4 Amphibia, Aves, Mammalia, Reptilia p__Pseudomonadota
3 s__Aeromonas encheleia 18 3 Amphibia, Mammalia, Reptilia p__Pseudomonadota
4 s__Bacteroides caccae 13 3 Aves, Mammalia, Reptilia p__Bacteroidota
5 s__Bacteroides fragilis 20 3 Aves, Mammalia, Reptilia p__Bacteroidota
6 s__Bacteroides nordii 23 3 Aves, Mammalia, Reptilia p__Bacteroidota
7 s__Bacteroides thetaiotaomicron 42 3 Aves, Mammalia, Reptilia p__Bacteroidota
8 s__Bacteroides uniformis 84 3 Aves, Mammalia, Reptilia p__Bacteroidota
9 s__Bacteroides xylanisolvens 27 3 Aves, Mammalia, Reptilia p__Bacteroidota
10 s__Bilophila wadsworthia 12 3 Aves, Mammalia, Reptilia p__Desulfobacterota
# ℹ 24 more rows
# A tibble: 5 × 2
phylum n_species
<chr> <int>
1 p__Bacillota 5
2 p__Bacillota_A 2
3 p__Bacteroidota 8
4 p__Desulfobacterota 1
5 p__Pseudomonadota 18
3.0.3 Host classes- after filtering
species_hostclass_counts_f <- ehi_mags_filtered %>%
dplyr::filter(!is.na(species), species != "") %>%
dplyr::filter(!is.na(host_class), host_class != "") %>%
group_by(species) %>%
summarise(
n_mags = n(),
n_host_classes = n_distinct(host_class),
host_classes = paste(sort(unique(host_class)), collapse = ", "),
phylum = paste(unique(phylum), collapse = ", ")
) %>%
arrange(desc(n_host_classes))
species_selection_f <- species_hostclass_counts_f %>%
dplyr::filter(n_host_classes >= 3, n_mags > 10) %>%
arrange(desc(n_mags))
species_selection_f# A tibble: 10 × 5
species n_mags n_host_classes host_classes phylum
<chr> <int> <int> <chr> <chr>
1 s__Escherichia coli 64 3 Aves, Mammalia, Reptilia p__Pseudomonadota
2 s__Pseudomonas aeruginosa 55 4 Amphibia, Aves, Mammalia, Reptilia p__Pseudomonadota
3 s__Lactococcus lactis 38 3 Aves, Mammalia, Reptilia p__Bacillota
4 s__Hafnia paralvei 25 3 Amphibia, Mammalia, Reptilia p__Pseudomonadota
5 s__Parabacteroides distasonis 25 3 Aves, Mammalia, Reptilia p__Bacteroidota
6 s__Enterococcus faecalis 21 3 Aves, Mammalia, Reptilia p__Bacillota
7 s__Bacteroides uniformis 19 3 Aves, Mammalia, Reptilia p__Bacteroidota
8 s__Phocaeicola vulgatus 19 3 Aves, Mammalia, Reptilia p__Bacteroidota
9 s__Citrobacter braakii 16 3 Amphibia, Mammalia, Reptilia p__Pseudomonadota
10 s__Moellerella wisconsensis 13 3 Amphibia, Aves, Mammalia p__Pseudomonadota
# A tibble: 3 × 2
phylum n_species
<chr> <int>
1 p__Bacillota 2
2 p__Bacteroidota 3
3 p__Pseudomonadota 5
3.0.4 Count host classes per species
species_hostclass_counts <- ehi_mags_filtered %>%
dplyr::filter(!is.na(species), species != "",
!is.na(host_class), host_class != "") %>%
group_by(species) %>%
summarise(
n_host_classes = n_distinct(host_class)) %>%
arrange(desc(n_host_classes))
counts_species_per_class<- species_hostclass_counts %>%
group_by(n_host_classes)%>%
summarize(n_species = n())
ggplot(counts_species_per_class, aes(x = n_host_classes, y = n_species)) +
geom_col() +
geom_text(
aes(label = n_species),
vjust = -0.3,
size = 4
) +
labs(
title = "Number of Species per Number of Host Classes",
x = "Number of Host Classes",
y = "Number of Species"
) +
theme_bw()mags_with_counts <- ehi_mags_filtered %>%
left_join(species_hostclass_counts, by = "species")
#filter for mags with >2 host classes
mags_host3 <- mags_with_counts %>%
dplyr::filter(n_host_classes >= 3)
n_mags <- mags_host3 %>%
group_by(species) %>%
summarise(n_mags = n(), .groups = "drop")
#Species level:
species_host3 <- mags_host3 %>%
group_by(species)%>%
summarise(phylum = dplyr::first(phylum),
order = dplyr::first(order),
family = dplyr::first(family),
genus = dplyr::first(genus),
host_classes = paste(sort(unique(host_class)), collapse = ", "),
host_species = paste(sort(unique(host_species)), collapse = ", "))We have several MAGs per species, so here we select the “best MAG” (highest completeness and lowest contamination), to have statistics based on the best of each.
best_mag_per_species <- mags_host3 %>%
group_by(species) %>%
arrange(desc(completeness), contamination) %>%
slice_head(n = 1) %>%
ungroup() %>%
transmute(
species,
best_mag_name = mag_name,
best_MAG_link = link_to_assembly,
best_genome_size = size,
best_completeness = completeness,
best_contamination = contamination,
best_N50 = N50,
best_host_classes = host_class,
best_assembly_type = assembly_type,
best_phylum =str_remove_all(phylum, "p__")) ggplot(best_mag_per_species, aes(x= species, y = best_genome_size, fill = best_phylum))+
scale_color_manual(values=phylum_colors) +
scale_fill_manual(values=phylum_colors) +
geom_col()+
theme_bw()+
theme(axis.text.x = element_text(angle = 90))Warning: No shared levels found between `names(values)` of the manual scale and the data's colour values.
Filtering for Parabacteroides distasonis (make into a loop for all the relevant species later)
p_dist <- ehi_mags %>%
filter(completeness >90,
contamination < 2.5,
species == "s__Parabacteroides distasonis")
write_tsv(p_dist, "data/mags_metadata/parabacteroides_distasonis_metadata.tsv")
p_dist_index <- p_dist %>%
dplyr::select(ID, MAG_url)
write_tsv(p_dist_index, "data/mags_metadata/parabacteroides_distasonis_index.tsv")#Loop through each species in species_selection list
for (i in 1:nrow(species_selection_f)) {
current_species <- species_selection_f$species[i]
#Filter for completeness and contamination
#get metadata
species_data <- ehi_mags %>%
filter(completeness > 90,
contamination < 2.5,
species == current_species)
#get index
species_index <- species_data %>%
dplyr::select(ID, MAG_url)
#Filename from species name
filename <- current_species %>%
str_replace("s__", "") %>%
str_replace_all(" ", "_") %>%
tolower()
#export to tsv
write_tsv(species_data,
paste0("data/mags_metadata/", filename, "_metadata.tsv"))
write_tsv(species_index,
paste0("data/mags_metadata/", filename, "_index.tsv"))
}### First only with two:
all_mags <- ehi_mags %>%
filter(completeness > 90,
contamination < 2.5,
species %in% c("s__Parabacteroides distasonis", "s__Phocaeicola vulgatus")) %>%
mutate(species = str_remove(species, "^s__") %>% str_to_lower() %>%
str_replace(" ", "_"),
out_path = paste0("data/", species, "/mags/", basename(MAG_url))
) %>%
dplyr::select(species, ID, MAG_url, out_path)
write_tsv(all_mags, "all_mags_index.tsv")