Chapter 5 Data Statistics
Total number of MAGs
# A tibble: 3 × 2
source n
<chr> <int>
1 EHI 38
2 GTDB 78
3 NCBI 13
# A tibble: 25 × 2
host_species n
<chr> <int>
1 Actinopterygii (Fish) 3
2 Barbastella Barbastellus 1
3 Bombyx Mori 1
4 Cabbage Kimchi 1
5 Canis Lupus 4
6 Cathartes Aura 4
7 Drosophila Melanogaster 1
8 Grasshopper 1
9 Hipposideros Ruber 2
10 Homo sapiens 42
# ℹ 15 more rows
# A tibble: 13 × 2
host_order n
<chr> <int>
1 Accipitriformes 4
2 Blattodea 1
3 Carnivora 4
4 Chiroptera 26
5 Cichliformes 1
6 Diptera 1
7 Lepidoptera 1
8 Orthoptera 1
9 Primates 43
10 Rodentia 36
11 Salmoniformes 1
12 Squamata 5
13 <NA> 5
# A tibble: 6 × 2
host_class n
<chr> <int>
1 Actinopterygii 5
2 Aves 4
3 Insecta 4
4 Mammalia 109
5 Reptilia 5
6 <NA> 2
5.0.2 Mean completeness and contamination
genome_metadata %>%
summarise(
mean_c = mean(completeness, na.rm = TRUE) %>% round(2),
sd_c = sd(completeness, na.rm = TRUE) %>% round(2),
mean_con = mean(contamination, na.rm = TRUE) %>% round(2),
sd_con = sd(contamination, na.rm = TRUE) %>% round(2)
) %>%
unite("Completeness", mean_c, sd_c, sep = " ± ") %>%
unite("Contamination", mean_con, sd_con, sep = " ± ") %>%
tt()| Completeness | Contamination |
|---|---|
| 99.12 ± 2.05 | 0.64 ± 0.61 |
#Generate quality biplot
genome_biplot <- genome_metadata %>%
dplyr::select(c(ID,completeness,contamination, host_order)) %>%
ggplot(aes(x=completeness,y=contamination, color = host_order)) +
scale_color_manual(values = host_order_colors, name = "Host Order")+
geom_point(alpha=0.7, size = 4) +
xlim(c(90,100)) +
ylim(c(2.5,0)) +
labs(y= "Contamination", x = "Completeness") +
theme_classic() +
theme(legend.position = "left",
axis.text.y = element_text(size=12),
axis.text.x = element_text(size=12),
axis.title.x = element_text(size = 16, face = "bold"),
axis.title.y = element_text(size = 16, face = "bold"))
#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
ggplot(aes(y=contamination)) +
ylim(c(2.5,0)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_classic() +
theme(legend.position = "none",
axis.line = element_blank(),
axis.title = element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)
#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
ggplot(aes(x=completeness)) +
xlim(c(90,100)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_classic() +
theme(legend.position = "none",
axis.line = element_blank(),
axis.title = element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)
#Render composite figure
#pdf("figures/completeness_contamination.pdf",width=10, height=5)
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3)))ggplot(genome_metadata, aes(x= ID, y = genome_size, fill = host_order))+
scale_fill_manual(values = host_order_colors)+
geom_col()+
theme_classic()ggplot(genome_metadata, aes(x= source, y = genome_size, fill = source))+
scale_fill_manual(values = source_colors)+
geom_violin()+
geom_point()+
theme_classic()+
labs(y = "Genome Size", x = "Source")genome_metadata %>%
filter(!is.na(host_class)) %>%
ggplot(aes(x= host_type, y = genome_size, fill = host_type))+
scale_fill_manual(values = host_type_colors)+
geom_violin()+
geom_point()+
theme_classic()+
labs(y = "Genome Size", x = "Host Type")Warning: Groups with fewer than two datapoints have been dropped.
ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
# A tibble: 1 × 45
ID source species gtdb_taxonomy host_species host_order host_class isolation_source host host_summary
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <lgl>
1 EHM066683 EHI s__Lact… <NA> Plecotus Au… Chiroptera Mammalia host-associated <NA> NA
# ℹ 35 more variables: country <chr>, locality <chr>, completeness <dbl>, contamination <dbl>,
# genome_size <dbl>, GC <dbl>, N50 <dbl>, contigs <dbl>, collection_date <chr>, ncbi_biosample <chr>,
# mag_name <chr>, eha_number <chr>, gtdb_representative <lgl>, mimag_high_quality <lgl>,
# mimag_medium_quality <lgl>, common_name <chr>, sample_name <chr>, external_id <chr>, submitter_id <chr>,
# env_broad <chr>, env_medium <chr>, env_local <chr>, env_local_context <chr>, host_status <lgl>,
# disease <lgl>, diagnosis <lgl>, latitude <dbl>, longitude <dbl>, accession <chr>, host_disease <lgl>,
# host_age <lgl>, host_type <chr>, country_simple <chr>, country_normalized <chr>, continent <chr>
ggplot(genome_metadata, aes(x= ID, y = contamination, fill = source))+
scale_fill_manual(values = source_colors)+
geom_col()+
theme_classic()ggplot(genome_metadata, aes(x= source, y = contamination, fill = source))+
scale_fill_manual(values = source_colors)+
geom_boxplot()+
theme_classic()+
labs(y = "Contamination", x = "Source")ggplot(genome_metadata, aes(x= ID, y = completeness, fill = source))+
scale_fill_manual(values = source_colors)+
geom_col()+
theme_classic()ggplot(genome_metadata, aes(x= source, y = completeness, fill = source))+
scale_fill_manual(values = source_colors)+
geom_violin()+
geom_point()+
theme_classic()+
labs(y = "Completeness", x = "Source")ggplot(genome_metadata, aes(x= host_type, y = completeness, fill = host_type))+
scale_fill_manual(values = host_type_colors)+
geom_violin()+
geom_point()+
theme_classic()+
labs(y = "Completeness", x = "Host Type")Warning: Groups with fewer than two datapoints have been dropped.
ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
genome_metadata_long <- genome_metadata %>%
filter(!is.na(host_class))%>%
mutate(genome_size_mb = genome_size/10^6)%>%
pivot_longer(
cols = c(genome_size_mb, contamination, completeness),
names_to = "metric",
values_to = "value"
) %>%
mutate(
metric = factor(metric,
levels = c("genome_size_mb","contamination" , "completeness"),
labels = c("Genome Size (Mb)", "Contamination (%)" , "Completeness (%)"))
)
faceted_plot <- ggplot(genome_metadata_long,
aes(x = source, y = value, fill = source)) +
scale_fill_manual(values = source_colors) +
geom_violin() +
geom_point() +
facet_wrap(~ metric, scales = "free_y", ncol = 2) +
theme_classic() +
theme(
strip.background = element_rect(fill = "white", color = "black"),
strip.text = element_text(face = "bold", size = 11)
) +
labs(y = "Value", x = "Source", fill = "Source")
print(faceted_plot)faceted_plot <- ggplot(genome_metadata_long,
aes(x = host_type, y = value, fill = host_type)) +
scale_fill_manual(values = host_type_colors) +
geom_violin() +
geom_point() +
facet_wrap(~ metric, scales = "free_y", ncol = 2) +
theme_classic() +
theme(
strip.background = element_rect(fill = "white", color = "black"),
strip.text = element_text(face = "bold", size = 11)
) +
labs(y = "Value", x = "Source", fill = "Host Type")
print(faceted_plot)Warning: Groups with fewer than two datapoints have been dropped.
ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
Groups with fewer than two datapoints have been dropped.
ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
Groups with fewer than two datapoints have been dropped.
ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 1738 0.895 Wilcoxon rank sum test with continuity correction two.sided
# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 1207 0.00207 Wilcoxon rank sum test with continuity correction two.sided
# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 1042 0.000189 Wilcoxon rank sum test with continuity correction two.sided
# A tibble: 1 × 4
statistic p.value parameter method
<dbl> <dbl> <int> <chr>
1 3.92 0.141 2 Kruskal-Wallis rank sum test
# A tibble: 1 × 4
statistic p.value parameter method
<dbl> <dbl> <int> <chr>
1 16.1 0.000314 2 Kruskal-Wallis rank sum test
# A tibble: 1 × 4
statistic p.value parameter method
<dbl> <dbl> <int> <chr>
1 5.75 0.0563 2 Kruskal-Wallis rank sum test
# Combine the summaries
ehi_stats <- genome_metadata %>%
filter(host_type == "animal") %>%
mutate(genome_size_mb = genome_size/1000000) %>%
summarise(
m_gs = round(mean(genome_size_mb), 2),
sd_gs = round(sd(genome_size_mb), 2),
m_cont = round(mean(contamination), 2),
sd_cont = round(sd(contamination), 2),
m_comp = round(mean(completeness), 2),
sd_comp = round(sd(completeness), 2)
) %>%
unite("Mean genome size",m_gs, sd_gs, sep = " ± ", remove = TRUE) %>%
unite("Mean completeness",m_comp, sd_comp, sep = " ± ", remove = TRUE) %>%
unite("Mean contamination",m_cont, sd_cont, sep = " ± ", remove = TRUE) %>%
mutate(Host_type = "animal") %>%
dplyr::select(Host_type, everything())
gtdb_stats <- genome_metadata %>%
filter(host_type == "human") %>%
filter(!is.na(genome_size)) %>%
mutate(genome_size_mb = genome_size/1000000) %>%
summarise(
m_gs = round(mean(genome_size_mb), 2),
sd_gs = round(sd(genome_size_mb), 2),
m_cont = round(mean(contamination), 2),
sd_cont = round(sd(contamination), 2),
m_comp = round(mean(completeness), 2),
sd_comp = round(sd(completeness), 2)
) %>%
unite("Mean genome size",m_gs, sd_gs, sep = " ± ", remove = TRUE) %>%
unite("Mean completeness",m_comp, sd_comp, sep = " ± ", remove = TRUE) %>%
unite("Mean contamination",m_cont, sd_cont, sep = " ± ", remove = TRUE) %>%
mutate(Host_type = "human") %>%
dplyr::select(Host_type, everything())
# Combine into one table
summary_table <- bind_rows(ehi_stats, gtdb_stats)
summary_table# A tibble: 2 × 4
Host_type `Mean genome size` `Mean contamination` `Mean completeness`
<chr> <chr> <chr> <chr>
1 animal 2.37 ± 0.24 0.66 ± 0.65 98.84 ± 2.37
2 human 2.46 ± 0.11 0.57 ± 0.5 99.66 ± 1
5.0.3 Country
Warning: package 'rnaturalearth' was built under R version 4.4.3
Warning: package 'rnaturalearthdata' was built under R version 4.4.3
Adjuntando el paquete: 'rnaturalearthdata'
The following object is masked from 'package:rnaturalearth':
countries110
Warning: package 'sf' was built under R version 4.4.3
Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
world <- ne_countries(scale = "medium", returnclass = "sf")
mag_locations <- genome_metadata %>%
filter(!is.na(country)) %>%
left_join(
world %>%
dplyr::select(name, geometry) %>%
st_centroid() %>%
st_coordinates() %>%
as_tibble() %>%
bind_cols(world %>% dplyr::select(name)) ,
by = c("country" = "name")
)Warning: st_centroid assumes attributes are constant over geometries
# A tibble: 127 × 3
ID host_type country
<chr> <chr> <chr>
1 EHM013277 animal France
2 EHM015868 animal France
3 EHM016594 animal Equatorial Guinea
4 EHM017294 animal Spain
5 EHM019886 animal Norway
6 EHM020035 animal Germany
7 EHM025059 animal France
8 EHM025679 animal France
9 EHM027656 animal United States
10 EHM028200 animal United States
# ℹ 117 more rows
ggplot(world) +
geom_sf(fill = "gray95", color = "gray80") +
geom_point(
data = mag_locations,
aes(x = X, y = Y, color = host_type),
size = 2.5,
alpha = 0.5
) +
scale_color_manual(values = host_type_colors) +
theme_minimal() +
labs(color = "Host Type")Warning: Removed 95 rows containing missing values or values outside the scale range (`geom_point()`).
plot_df <- genome_metadata %>%
dplyr::filter(!is.na(host_species), !is.na(host_class))
species_plot <- ggplot(plot_df, aes(x = host_species, fill = source)) +
geom_bar(position = "stack") +
facet_wrap(~ host_class, scales = "free_x") +
scale_fill_manual(values = source_colors, name = "Source") +
theme_bw(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
strip.text = element_text(size = 14, face = "bold")
) +
labs(
x = "Host species",
y = "Number of MAGs"
)
species_plotplot_df <- genome_metadata %>%
dplyr::filter(!is.na(country_normalized))%>%
dplyr::filter(country_normalized != "none")
library(stringr)
dist_plot <- ggplot(plot_df, aes(x = country_normalized, fill = host_order)) +
geom_bar(position = "stack") +
facet_grid(
~ continent,
scales = "free_x",
space = "free_x",
labeller = labeller(
continent = function(x) str_wrap(x, width = 10)
)
) +
scale_fill_manual(values = host_order_colors, name = "Host order") +
theme_bw(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
strip.text = element_text(size = 12, face = "bold")
) +
labs(
x = "Country",
y = "Number of MAGs"
)
dist_plot
#ggsave("./plots/dist_plot.png", dist_plot, dpi = 300, units = "cm", width = 30, height = 12)