Chapter 5 Data Statistics

Total number of MAGs

genome_metadata %>%
  dplyr::count(source)
# A tibble: 3 × 2
  source     n
  <chr>  <int>
1 EHI       38
2 GTDB      78
3 NCBI      13
genome_metadata %>% dplyr::count(host_species)
# A tibble: 25 × 2
   host_species                 n
   <chr>                    <int>
 1 Actinopterygii (Fish)        3
 2 Barbastella Barbastellus     1
 3 Bombyx Mori                  1
 4 Cabbage Kimchi               1
 5 Canis Lupus                  4
 6 Cathartes Aura               4
 7 Drosophila Melanogaster      1
 8 Grasshopper                  1
 9 Hipposideros Ruber           2
10 Homo sapiens                42
# ℹ 15 more rows
genome_metadata %>% dplyr::count(host_order)
# A tibble: 13 × 2
   host_order          n
   <chr>           <int>
 1 Accipitriformes     4
 2 Blattodea           1
 3 Carnivora           4
 4 Chiroptera         26
 5 Cichliformes        1
 6 Diptera             1
 7 Lepidoptera         1
 8 Orthoptera          1
 9 Primates           43
10 Rodentia           36
11 Salmoniformes       1
12 Squamata            5
13 <NA>                5
genome_metadata %>% dplyr::count(host_class)
# A tibble: 6 × 2
  host_class         n
  <chr>          <int>
1 Actinopterygii     5
2 Aves               4
3 Insecta            4
4 Mammalia         109
5 Reptilia           5
6 <NA>               2

5.0.1 Remove the MAGs with no host metadata

genome_metadata <- genome_metadata %>%
  filter(!is.na(host_class))

5.0.2 Mean completeness and contamination

genome_metadata %>% 
  summarise(
    mean_c = mean(completeness, na.rm = TRUE) %>% round(2),
    sd_c = sd(completeness, na.rm = TRUE) %>% round(2),
    mean_con = mean(contamination, na.rm = TRUE) %>% round(2),
    sd_con = sd(contamination, na.rm = TRUE) %>% round(2)
  ) %>%
  unite("Completeness", mean_c, sd_c, sep = " ± ") %>%
  unite("Contamination", mean_con, sd_con, sep = " ± ") %>%
  tt()
Completeness Contamination
99.12 ± 2.05 0.64 ± 0.61
#Generate quality biplot
genome_biplot <- genome_metadata %>%
  dplyr::select(c(ID,completeness,contamination, host_order)) %>%
  ggplot(aes(x=completeness,y=contamination, color = host_order)) +
  scale_color_manual(values = host_order_colors, name = "Host Order")+
              geom_point(alpha=0.7, size = 4) +
                    xlim(c(90,100)) +
                    ylim(c(2.5,0)) +
                    labs(y= "Contamination", x = "Completeness") +
                    theme_classic() +
                    theme(legend.position = "left",
                          axis.text.y = element_text(size=12),
                          axis.text.x = element_text(size=12),
                          axis.title.x = element_text(size = 16, face = "bold"),
                          axis.title.y = element_text(size = 16, face = "bold"))

#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
            ggplot(aes(y=contamination)) +
                    ylim(c(2.5,0)) +
                    geom_boxplot(colour = "#999999", fill="#cccccc") +
                    theme_classic() +
                    theme(legend.position = "none",
                    axis.line = element_blank(),
                    axis.title = element_blank(),
                    axis.text=element_blank(),
                    axis.ticks=element_blank(),
                        plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)

#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
        ggplot(aes(x=completeness)) +
                xlim(c(90,100)) +
                geom_boxplot(colour = "#999999", fill="#cccccc") +
                theme_classic() +
                theme(legend.position = "none",
                    axis.line = element_blank(),
                    axis.title = element_blank(),
                    axis.text=element_blank(),
                    axis.ticks=element_blank(),
                    plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)

#Render composite figure
#pdf("figures/completeness_contamination.pdf",width=10, height=5)
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
        layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3)))

#dev.off()
ggplot(genome_metadata, aes(x= ID, y = genome_size, fill = host_order))+
  scale_fill_manual(values = host_order_colors)+
  geom_col()+ 
  theme_classic()

ggplot(genome_metadata, aes(x= source, y = genome_size, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_violin()+ 
  geom_point()+
  theme_classic()+
  labs(y = "Genome Size", x = "Source")

genome_metadata %>%
  filter(!is.na(host_class)) %>% 
ggplot(aes(x= host_type, y = genome_size, fill = host_type))+
  scale_fill_manual(values = host_type_colors)+
  geom_violin()+ 
  geom_point()+
  theme_classic()+
  labs(y = "Genome Size", x = "Host Type")
Warning: Groups with fewer than two datapoints have been dropped.
ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.

genome_metadata%>%
  filter(genome_size == min(genome_size))
# A tibble: 1 × 45
  ID        source species  gtdb_taxonomy host_species host_order host_class isolation_source host  host_summary
  <chr>     <chr>  <chr>    <chr>         <chr>        <chr>      <chr>      <chr>            <chr> <lgl>       
1 EHM066683 EHI    s__Lact… <NA>          Plecotus Au… Chiroptera Mammalia   host-associated  <NA>  NA          
# ℹ 35 more variables: country <chr>, locality <chr>, completeness <dbl>, contamination <dbl>,
#   genome_size <dbl>, GC <dbl>, N50 <dbl>, contigs <dbl>, collection_date <chr>, ncbi_biosample <chr>,
#   mag_name <chr>, eha_number <chr>, gtdb_representative <lgl>, mimag_high_quality <lgl>,
#   mimag_medium_quality <lgl>, common_name <chr>, sample_name <chr>, external_id <chr>, submitter_id <chr>,
#   env_broad <chr>, env_medium <chr>, env_local <chr>, env_local_context <chr>, host_status <lgl>,
#   disease <lgl>, diagnosis <lgl>, latitude <dbl>, longitude <dbl>, accession <chr>, host_disease <lgl>,
#   host_age <lgl>, host_type <chr>, country_simple <chr>, country_normalized <chr>, continent <chr>
ggplot(genome_metadata, aes(x= ID, y = contamination, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_col()+ 
  theme_classic()

ggplot(genome_metadata, aes(x= source, y = contamination, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_boxplot()+ 
  theme_classic()+
  labs(y = "Contamination", x = "Source")

ggplot(genome_metadata, aes(x= ID, y = completeness, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_col()+ 
  theme_classic()

ggplot(genome_metadata, aes(x= source, y = completeness, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_violin()+
 geom_point()+
  theme_classic()+
  labs(y = "Completeness", x = "Source")

ggplot(genome_metadata, aes(x= host_type, y = completeness, fill = host_type))+
  scale_fill_manual(values = host_type_colors)+
  geom_violin()+
 geom_point()+
  theme_classic()+
  labs(y = "Completeness", x = "Host Type")
Warning: Groups with fewer than two datapoints have been dropped.
ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.

genome_metadata_long <- genome_metadata %>%
  filter(!is.na(host_class))%>%
  mutate(genome_size_mb = genome_size/10^6)%>%
  pivot_longer(
    cols = c(genome_size_mb, contamination, completeness),
    names_to = "metric",
    values_to = "value"
  ) %>%
  mutate(
    metric = factor(metric, 
                   levels = c("genome_size_mb","contamination" , "completeness"),
                   labels = c("Genome Size (Mb)", "Contamination (%)" , "Completeness (%)"))
  )

faceted_plot <- ggplot(genome_metadata_long, 
                       aes(x = source, y = value, fill = source)) +
  scale_fill_manual(values = source_colors) +
  geom_violin() +
  geom_point() +
  facet_wrap(~ metric, scales = "free_y", ncol = 2) +
  theme_classic() +
  theme(
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 11)
  ) +
  labs(y = "Value", x = "Source", fill = "Source")

print(faceted_plot)

faceted_plot <- ggplot(genome_metadata_long, 
                       aes(x = host_type, y = value, fill = host_type)) +
  scale_fill_manual(values = host_type_colors) +
  geom_violin() +
  geom_point() +
  facet_wrap(~ metric, scales = "free_y", ncol = 2) +
  theme_classic() +
  theme(
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 11)
  ) +
  labs(y = "Value", x = "Source", fill = "Host Type")

print(faceted_plot)
Warning: Groups with fewer than two datapoints have been dropped.
ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
Groups with fewer than two datapoints have been dropped.
ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
Groups with fewer than two datapoints have been dropped.
ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.

#Comparing animal vs human
wilcox.test(contamination ~ host_type, data=genome_metadata)  %>%
  tidy()
# A tibble: 1 × 4
  statistic p.value method                                            alternative
      <dbl>   <dbl> <chr>                                             <chr>      
1      1738   0.895 Wilcoxon rank sum test with continuity correction two.sided  
wilcox.test(completeness ~ host_type, data=genome_metadata)  %>%
  tidy()
# A tibble: 1 × 4
  statistic p.value method                                            alternative
      <dbl>   <dbl> <chr>                                             <chr>      
1      1207 0.00207 Wilcoxon rank sum test with continuity correction two.sided  
wilcox.test(genome_size ~ host_type, data=genome_metadata)  %>%
  tidy()
# A tibble: 1 × 4
  statistic  p.value method                                            alternative
      <dbl>    <dbl> <chr>                                             <chr>      
1      1042 0.000189 Wilcoxon rank sum test with continuity correction two.sided  
#Comparing sources
kruskal.test(contamination ~ source, data=genome_metadata)  %>%
  tidy()
# A tibble: 1 × 4
  statistic p.value parameter method                      
      <dbl>   <dbl>     <int> <chr>                       
1      3.92   0.141         2 Kruskal-Wallis rank sum test
kruskal.test(completeness ~ source, data=genome_metadata)  %>%
  tidy()
# A tibble: 1 × 4
  statistic  p.value parameter method                      
      <dbl>    <dbl>     <int> <chr>                       
1      16.1 0.000314         2 Kruskal-Wallis rank sum test
kruskal.test(genome_size ~ source, data=genome_metadata)  %>%
  tidy()
# A tibble: 1 × 4
  statistic p.value parameter method                      
      <dbl>   <dbl>     <int> <chr>                       
1      5.75  0.0563         2 Kruskal-Wallis rank sum test
# Combine the summaries
ehi_stats <- genome_metadata %>%
  filter(host_type == "animal") %>%
  mutate(genome_size_mb = genome_size/1000000) %>%
  summarise(
    m_gs = round(mean(genome_size_mb), 2),
    sd_gs = round(sd(genome_size_mb), 2), 
    m_cont = round(mean(contamination), 2),
    sd_cont = round(sd(contamination), 2),
    m_comp = round(mean(completeness), 2),
    sd_comp = round(sd(completeness), 2)
  ) %>%
  unite("Mean genome size",m_gs, sd_gs, sep = " ± ", remove = TRUE) %>%
  unite("Mean completeness",m_comp, sd_comp, sep = " ± ", remove = TRUE) %>%
  unite("Mean contamination",m_cont, sd_cont, sep = " ± ", remove = TRUE) %>%
  mutate(Host_type = "animal") %>%
  dplyr::select(Host_type, everything())

gtdb_stats <- genome_metadata %>%
  filter(host_type == "human") %>%
  filter(!is.na(genome_size)) %>%
  mutate(genome_size_mb = genome_size/1000000) %>%
  summarise(
    m_gs = round(mean(genome_size_mb), 2),
    sd_gs = round(sd(genome_size_mb), 2),
    m_cont = round(mean(contamination), 2),
    sd_cont = round(sd(contamination), 2),
    m_comp = round(mean(completeness), 2),
    sd_comp = round(sd(completeness), 2)
  ) %>%
   unite("Mean genome size",m_gs, sd_gs, sep = " ± ", remove = TRUE) %>%
  unite("Mean completeness",m_comp, sd_comp, sep = " ± ", remove = TRUE) %>%
  unite("Mean contamination",m_cont, sd_cont, sep = " ± ", remove = TRUE) %>%
  mutate(Host_type = "human") %>%
  dplyr::select(Host_type, everything())

# Combine into one table
summary_table <- bind_rows(ehi_stats, gtdb_stats)

summary_table
# A tibble: 2 × 4
  Host_type `Mean genome size` `Mean contamination` `Mean completeness`
  <chr>     <chr>              <chr>                <chr>              
1 animal    2.37 ± 0.24        0.66 ± 0.65          98.84 ± 2.37       
2 human     2.46 ± 0.11        0.57 ± 0.5           99.66 ± 1          

5.0.3 Country

library(rnaturalearth)
Warning: package 'rnaturalearth' was built under R version 4.4.3
library(rnaturalearthdata)
Warning: package 'rnaturalearthdata' was built under R version 4.4.3

Adjuntando el paquete: 'rnaturalearthdata'
The following object is masked from 'package:rnaturalearth':

    countries110
library(sf) 
Warning: package 'sf' was built under R version 4.4.3
Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
world <- ne_countries(scale = "medium", returnclass = "sf")

mag_locations <- genome_metadata %>%
  filter(!is.na(country)) %>%
  left_join(
    world %>%
      dplyr::select(name, geometry) %>%
      st_centroid() %>%
      st_coordinates() %>%
      as_tibble() %>%
      bind_cols(world %>% dplyr::select(name)) ,
    by = c("country" = "name")
  )
Warning: st_centroid assumes attributes are constant over geometries
mag_locations%>% dplyr::select(ID, host_type, country)
# A tibble: 127 × 3
   ID        host_type country          
   <chr>     <chr>     <chr>            
 1 EHM013277 animal    France           
 2 EHM015868 animal    France           
 3 EHM016594 animal    Equatorial Guinea
 4 EHM017294 animal    Spain            
 5 EHM019886 animal    Norway           
 6 EHM020035 animal    Germany          
 7 EHM025059 animal    France           
 8 EHM025679 animal    France           
 9 EHM027656 animal    United States    
10 EHM028200 animal    United States    
# ℹ 117 more rows
ggplot(world) +
  geom_sf(fill = "gray95", color = "gray80") +
  geom_point(
    data = mag_locations,
    aes(x = X, y = Y, color = host_type),
    size = 2.5,
    alpha = 0.5
  ) +
  scale_color_manual(values = host_type_colors) +
  theme_minimal() +
  labs(color = "Host Type")
Warning: Removed 95 rows containing missing values or values outside the scale range (`geom_point()`).

plot_df <- genome_metadata %>%
  dplyr::filter(!is.na(host_species), !is.na(host_class))

species_plot <- ggplot(plot_df, aes(x = host_species, fill = source)) +
  geom_bar(position = "stack") +
  facet_wrap(~ host_class, scales = "free_x") +
  scale_fill_manual(values = source_colors, name = "Source") +
  theme_bw(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    strip.text = element_text(size = 14, face = "bold")
  ) +
  labs(
    x = "Host species",
    y = "Number of MAGs"
  )

species_plot

#ggsave("./plots/species_plot.png", species_plot, dpi = 300, units = "cm", width = 18, height = 10)
plot_df <- genome_metadata %>%
  dplyr::filter(!is.na(country_normalized))%>%
  dplyr::filter(country_normalized != "none")

library(stringr)

dist_plot <- ggplot(plot_df, aes(x = country_normalized, fill = host_order)) +
  geom_bar(position = "stack") +
  facet_grid(
    ~ continent,
    scales = "free_x",
    space  = "free_x",
    labeller = labeller(
      continent = function(x) str_wrap(x, width = 10)
    )
  ) +
  scale_fill_manual(values = host_order_colors, name = "Host order") +
  theme_bw(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    strip.text = element_text(size = 12, face = "bold")
  ) +
  labs(
    x = "Country",
    y = "Number of MAGs"
  )

dist_plot
#ggsave("./plots/dist_plot.png", dist_plot, dpi = 300, units = "cm", width = 30, height = 12)