Chapter 4 Data statistics
Total number of MAGs
# A tibble: 2 × 2
source n
<chr> <int>
1 EHI 25
2 GTDB 68
4.0.1 Mean completeness and contamination
genome_metadata %>%
summarise(
mean_c = mean(completeness, na.rm = TRUE) %>% round(2),
sd_c = sd(completeness, na.rm = TRUE) %>% round(2),
mean_con = mean(contamination, na.rm = TRUE) %>% round(2),
sd_con = sd(contamination, na.rm = TRUE) %>% round(2)
) %>%
unite("Completeness", mean_c, sd_c, sep = " ± ") %>%
unite("Contamination", mean_con, sd_con, sep = " ± ") %>%
tt()| Completeness | Contamination |
|---|---|
| 97.77 ± 2.6 | 1.17 ± 0.63 |
#Generate quality biplot
genome_biplot <- genome_metadata %>%
dplyr::select(c(ID,completeness,contamination, source)) %>%
ggplot(aes(x=completeness,y=contamination, color = source)) +
scale_color_manual(values = source_colors, name = "Source")+
geom_point(alpha=0.7, size = 4) +
xlim(c(90,100)) +
ylim(c(2.5,0)) +
labs(y= "Contamination", x = "Completeness") +
theme_classic() +
theme(legend.position = "left",
axis.text.y = element_text(size=12),
axis.text.x = element_text(size=12),
axis.title.x = element_text(size = 16, face = "bold"),
axis.title.y = element_text(size = 16, face = "bold"))
#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
ggplot(aes(y=contamination)) +
ylim(c(2.5,0)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_classic() +
theme(legend.position = "none",
axis.line = element_blank(),
axis.title = element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)
#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
ggplot(aes(x=completeness)) +
xlim(c(90,100)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_classic() +
theme(legend.position = "none",
axis.line = element_blank(),
axis.title = element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)
#Render composite figure
#pdf("figures/completeness_contamination.pdf",width=10, height=5)
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3)))ggplot(genome_metadata, aes(x= ID, y = genome_size, fill = source))+
scale_fill_manual(values = source_colors)+
geom_col()+
theme_classic()ggplot(genome_metadata, aes(x= source, y = genome_size, fill = source))+
scale_fill_manual(values = source_colors)+
geom_violin()+
geom_point()+
theme_classic()+
labs(y = "Genome Size", x = "Source")# A tibble: 1 × 25
ID species completeness contamination genome_size GC N50 contigs host_species host_order host_class assembly_type
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
1 EHM020917 s__Parabacter… 90.3 1.73 3472909 45.6 11869 414 Strix aluco Strigifor… Aves Coassembly
# ℹ 13 more variables: isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>, country <chr>, extra_locality <chr>,
# source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>, mimag_medium_quality <lgl>, gtdb_representative <lgl>,
# ncbi_date <date>, continent <chr>
ggplot(genome_metadata, aes(x= ID, y = contamination, fill = source))+
scale_fill_manual(values = source_colors)+
geom_col()+
theme_classic()ggplot(genome_metadata, aes(x= source, y = contamination, fill = source))+
scale_fill_manual(values = source_colors)+
geom_boxplot()+
theme_classic()+
labs(y = "Contamination", x = "Source")ggplot(genome_metadata, aes(x= ID, y = completeness, fill = source))+
scale_fill_manual(values = source_colors)+
geom_col()+
theme_classic()ggplot(genome_metadata, aes(x= source, y = completeness, fill = source))+
scale_fill_manual(values = source_colors)+
geom_violin()+
geom_point()+
theme_classic()+
labs(y = "Completeness", x = "Source")genome_metadata_long <- genome_metadata %>%
mutate(genome_size_mb = genome_size/10^6)%>%
pivot_longer(
cols = c(genome_size_mb, contamination, completeness),
names_to = "metric",
values_to = "value"
) %>%
mutate(
metric = factor(metric,
levels = c("genome_size_mb","contamination" , "completeness"),
labels = c("Genome Size (Mb)", "Contamination (%)" , "Completeness (%)"))
)
faceted_plot <- ggplot(genome_metadata_long,
aes(x = source, y = value, fill = source)) +
scale_fill_manual(values = source_colors) +
geom_violin() +
geom_point() +
facet_wrap(~ metric, scales = "free_y", ncol = 2) +
theme_classic() +
theme(
strip.background = element_rect(fill = "white", color = "black"),
strip.text = element_text(face = "bold", size = 11)
) +
labs(y = "Value", x = "Source", fill = "Source")
print(faceted_plot)# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 1094. 0.0352 Wilcoxon rank sum test with continuity correction two.sided
# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 640 0.0600 Wilcoxon rank sum test with continuity correction two.sided
# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 348 0.0000139 Wilcoxon rank sum test with continuity correction two.sided
# Combine the summaries
ehi_stats <- genome_metadata %>%
filter(source == "EHI") %>%
mutate(genome_size_mb = genome_size/1000000) %>%
summarise(
m_gs = round(mean(genome_size_mb), 2),
sd_gs = round(sd(genome_size_mb), 2),
m_cont = round(mean(contamination), 2),
sd_cont = round(sd(contamination), 2),
m_comp = round(mean(completeness), 2),
sd_comp = round(sd(completeness), 2)
) %>%
unite("Mean genome size",m_gs, sd_gs, sep = " ± ", remove = TRUE) %>%
unite("Mean completeness",m_comp, sd_comp, sep = " ± ", remove = TRUE) %>%
unite("Mean contamination",m_cont, sd_cont, sep = " ± ", remove = TRUE) %>%
mutate(Source = "EHI") %>%
dplyr::select(Source, everything())
gtdb_stats <- genome_metadata %>%
filter(source == "GTDB") %>%
filter(!is.na(genome_size)) %>%
mutate(genome_size_mb = genome_size/1000000) %>%
summarise(
m_gs = round(mean(genome_size_mb), 2),
sd_gs = round(sd(genome_size_mb), 2),
m_cont = round(mean(contamination), 2),
sd_cont = round(sd(contamination), 2),
m_comp = round(mean(completeness), 2),
sd_comp = round(sd(completeness), 2)
) %>%
unite("Mean genome size",m_gs, sd_gs, sep = " ± ", remove = TRUE) %>%
unite("Mean completeness",m_comp, sd_comp, sep = " ± ", remove = TRUE) %>%
unite("Mean contamination",m_cont, sd_cont, sep = " ± ", remove = TRUE) %>%
mutate(Source = "GTDB") %>%
dplyr::select(Source, everything())
# Combine into one table
summary_table <- bind_rows(ehi_stats, gtdb_stats)
summary_table# A tibble: 2 × 4
Source `Mean genome size` `Mean contamination` `Mean completeness`
<chr> <chr> <chr> <chr>
1 EHI 4.36 ± 0.45 1.41 ± 0.63 96 ± 3.4
2 GTDB 4.84 ± 0.36 1.08 ± 0.61 98.42 ± 1.89
4.0.2 Country
library(rnaturalearth)
library(rnaturalearthdata)
library(sf)
world <- ne_countries(scale = "medium", returnclass = "sf")
mag_locations <- genome_metadata %>%
filter(!is.na(country)) %>%
left_join(
world %>%
dplyr::select(name, geometry) %>%
st_centroid() %>%
st_coordinates() %>%
as_tibble() %>%
bind_cols(world %>% dplyr::select(name)) ,
by = c("country" = "name")
)Warning: st_centroid assumes attributes are constant over geometries
# A tibble: 91 × 28
ID species completeness contamination genome_size GC N50 contigs host_species host_order host_class assembly_type
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
1 EHM016228 s__Parabact… 97.3 0.92 4815605 45 32654 224 Strix aluco Strigifor… Aves Individual
2 EHM016318 s__Parabact… 93.6 1.81 4085923 45.3 7435 685 Strix aluco Strigifor… Aves Individual
3 EHM016582 s__Parabact… 92.2 1.17 3706610 45.7 7507 618 Strix aluco Strigifor… Aves Individual
4 EHM016991 s__Parabact… 99.0 0.38 4748219 44.9 117248 76 Podarcis pi… Squamata Reptilia Individual
5 EHM020917 s__Parabact… 90.3 1.73 3472909 45.6 11869 414 Strix aluco Strigifor… Aves Coassembly
6 EHM023585 s__Parabact… 92.8 2.18 3889519 45.6 8649 607 Sciurus vul… Rodentia Mammalia Coassembly
7 EHM023781 s__Parabact… 93.5 1.92 4159834 45.6 16836 369 Podarcis ga… Squamata Reptilia Coassembly
8 EHM027637 s__Parabact… 93.5 2.05 4160022 45.3 12792 526 Cathartes a… Accipitri… Aves Coassembly
9 EHM028668 s__Parabact… 100.0 0.87 4684324 45 73796 107 Timon lepid… Squamata Reptilia Individual
10 EHM029564 s__Parabact… 95.9 1.5 4476906 45 41913 152 Timon lepid… Squamata Reptilia Coassembly
# ℹ 81 more rows
# ℹ 16 more variables: isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>, country <chr>, extra_locality <chr>,
# source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>, mimag_medium_quality <lgl>, gtdb_representative <lgl>,
# ncbi_date <date>, continent <chr>, X <dbl>, Y <dbl>, geometry <MULTIPOLYGON [°]>
ggplot(world) +
geom_sf(fill = "gray95", color = "gray80") +
geom_point(
data = mag_locations,
aes(x = X, y = Y, color = source),
size = 2.5,
alpha = 0.5
) +
scale_color_manual(values = source_colors) +
theme_minimal() +
labs(color = "Source")Warning: Removed 34 rows containing missing values or values outside the scale range (`geom_point()`).
plot_df <- genome_metadata %>%
dplyr::filter(!is.na(host_species), !is.na(host_class))
species_plot <- ggplot(plot_df, aes(x = host_species, fill = source)) +
geom_bar(position = "stack") +
facet_wrap(~ host_class, scales = "free_x") +
scale_fill_manual(values = source_colors, name = "Source") +
theme_bw(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
strip.text = element_text(size = 14, face = "bold")
) +
labs(
x = "Host species",
y = "Number of MAGs"
)
ggsave("./plots/species_plot.png", species_plot, dpi = 300, units = "cm", width = 18, height = 10)plot_df <- genome_metadata %>%
dplyr::filter(!is.na(country))%>%
dplyr::filter(country != "none")
library(stringr)
dist_plot <- ggplot(plot_df, aes(x = country, fill = source)) +
geom_bar(position = "stack") +
facet_grid(
~ continent,
scales = "free_x",
space = "free_x",
labeller = labeller(
continent = function(x) str_wrap(x, width = 10)
)
) +
scale_fill_manual(values = source_colors, name = "Source") +
theme_bw(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
strip.text = element_text(size = 12, face = "bold")
) +
labs(
x = "Country",
y = "Number of MAGs"
)
ggsave("./plots/dist_plot.png", dist_plot, dpi = 300, units = "cm", width = 30, height = 12)