Chapter 4 Data statistics

load("data/data.Rdata")

Total number of MAGs

genome_metadata %>%
  dplyr::count(source)
# A tibble: 2 × 2
  source     n
  <chr>  <int>
1 EHI       25
2 GTDB      68

4.0.1 Mean completeness and contamination

genome_metadata %>% 
  summarise(
    mean_c = mean(completeness, na.rm = TRUE) %>% round(2),
    sd_c = sd(completeness, na.rm = TRUE) %>% round(2),
    mean_con = mean(contamination, na.rm = TRUE) %>% round(2),
    sd_con = sd(contamination, na.rm = TRUE) %>% round(2)
  ) %>%
  unite("Completeness", mean_c, sd_c, sep = " ± ") %>%
  unite("Contamination", mean_con, sd_con, sep = " ± ") %>%
  tt()
Completeness Contamination
97.77 ± 2.6 1.17 ± 0.63
#Generate quality biplot
genome_biplot <- genome_metadata %>%
  dplyr::select(c(ID,completeness,contamination, source)) %>%
  ggplot(aes(x=completeness,y=contamination, color = source)) +
  scale_color_manual(values = source_colors, name = "Source")+
              geom_point(alpha=0.7, size = 4) +
                    xlim(c(90,100)) +
                    ylim(c(2.5,0)) +
                    labs(y= "Contamination", x = "Completeness") +
                    theme_classic() +
                    theme(legend.position = "left",
                          axis.text.y = element_text(size=12),
                          axis.text.x = element_text(size=12),
                          axis.title.x = element_text(size = 16, face = "bold"),
                          axis.title.y = element_text(size = 16, face = "bold"))

#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
            ggplot(aes(y=contamination)) +
                    ylim(c(2.5,0)) +
                    geom_boxplot(colour = "#999999", fill="#cccccc") +
                    theme_classic() +
                    theme(legend.position = "none",
                    axis.line = element_blank(),
                    axis.title = element_blank(),
                    axis.text=element_blank(),
                    axis.ticks=element_blank(),
                        plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)

#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
        ggplot(aes(x=completeness)) +
                xlim(c(90,100)) +
                geom_boxplot(colour = "#999999", fill="#cccccc") +
                theme_classic() +
                theme(legend.position = "none",
                    axis.line = element_blank(),
                    axis.title = element_blank(),
                    axis.text=element_blank(),
                    axis.ticks=element_blank(),
                    plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)

#Render composite figure
#pdf("figures/completeness_contamination.pdf",width=10, height=5)
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
        layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3)))

#dev.off()
ggplot(genome_metadata, aes(x= ID, y = genome_size, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_col()+ 
  theme_classic()

ggplot(genome_metadata, aes(x= source, y = genome_size, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_violin()+ 
  geom_point()+
  theme_classic()+
  labs(y = "Genome Size", x = "Source")

genome_metadata%>%
  filter(genome_size == min(genome_size))
# A tibble: 1 × 25
  ID        species        completeness contamination genome_size    GC   N50 contigs host_species host_order host_class assembly_type
  <chr>     <chr>                 <dbl>         <dbl>       <dbl> <dbl> <dbl>   <dbl> <chr>        <chr>      <chr>      <chr>        
1 EHM020917 s__Parabacter…         90.3          1.73     3472909  45.6 11869     414 Strix aluco  Strigifor… Aves       Coassembly   
# ℹ 13 more variables: isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>, country <chr>, extra_locality <chr>,
#   source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>, mimag_medium_quality <lgl>, gtdb_representative <lgl>,
#   ncbi_date <date>, continent <chr>
ggplot(genome_metadata, aes(x= ID, y = contamination, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_col()+ 
  theme_classic()

ggplot(genome_metadata, aes(x= source, y = contamination, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_boxplot()+ 
  theme_classic()+
  labs(y = "Contamination", x = "Source")

ggplot(genome_metadata, aes(x= ID, y = completeness, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_col()+ 
  theme_classic()

ggplot(genome_metadata, aes(x= source, y = completeness, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_violin()+
 geom_point()+
  theme_classic()+
  labs(y = "Completeness", x = "Source")

genome_metadata_long <- genome_metadata %>%
  mutate(genome_size_mb = genome_size/10^6)%>%
  pivot_longer(
    cols = c(genome_size_mb, contamination, completeness),
    names_to = "metric",
    values_to = "value"
  ) %>%
  mutate(
    metric = factor(metric, 
                   levels = c("genome_size_mb","contamination" , "completeness"),
                   labels = c("Genome Size (Mb)", "Contamination (%)" , "Completeness (%)"))
  )

faceted_plot <- ggplot(genome_metadata_long, 
                       aes(x = source, y = value, fill = source)) +
  scale_fill_manual(values = source_colors) +
  geom_violin() +
  geom_point() +
  facet_wrap(~ metric, scales = "free_y", ncol = 2) +
  theme_classic() +
  theme(
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 11)
  ) +
  labs(y = "Value", x = "Source", fill = "Source")

print(faceted_plot)

wilcox.test(contamination ~ source, data=genome_metadata)  %>%
  tidy()
# A tibble: 1 × 4
  statistic p.value method                                            alternative
      <dbl>   <dbl> <chr>                                             <chr>      
1     1094.  0.0352 Wilcoxon rank sum test with continuity correction two.sided  
wilcox.test(completeness ~ source, data=genome_metadata)  %>%
  tidy()
# A tibble: 1 × 4
  statistic p.value method                                            alternative
      <dbl>   <dbl> <chr>                                             <chr>      
1       640  0.0600 Wilcoxon rank sum test with continuity correction two.sided  
wilcox.test(genome_size ~ source, data=genome_metadata)  %>%
  tidy()
# A tibble: 1 × 4
  statistic   p.value method                                            alternative
      <dbl>     <dbl> <chr>                                             <chr>      
1       348 0.0000139 Wilcoxon rank sum test with continuity correction two.sided  
# Combine the summaries
ehi_stats <- genome_metadata %>%
  filter(source == "EHI") %>%
  mutate(genome_size_mb = genome_size/1000000) %>%
  summarise(
    m_gs = round(mean(genome_size_mb), 2),
    sd_gs = round(sd(genome_size_mb), 2), 
    m_cont = round(mean(contamination), 2),
    sd_cont = round(sd(contamination), 2),
    m_comp = round(mean(completeness), 2),
    sd_comp = round(sd(completeness), 2)
  ) %>%
  unite("Mean genome size",m_gs, sd_gs, sep = " ± ", remove = TRUE) %>%
  unite("Mean completeness",m_comp, sd_comp, sep = " ± ", remove = TRUE) %>%
  unite("Mean contamination",m_cont, sd_cont, sep = " ± ", remove = TRUE) %>%
  mutate(Source = "EHI") %>%
  dplyr::select(Source, everything())

gtdb_stats <- genome_metadata %>%
  filter(source == "GTDB") %>%
  filter(!is.na(genome_size)) %>%
  mutate(genome_size_mb = genome_size/1000000) %>%
  summarise(
    m_gs = round(mean(genome_size_mb), 2),
    sd_gs = round(sd(genome_size_mb), 2),
    m_cont = round(mean(contamination), 2),
    sd_cont = round(sd(contamination), 2),
    m_comp = round(mean(completeness), 2),
    sd_comp = round(sd(completeness), 2)
  ) %>%
   unite("Mean genome size",m_gs, sd_gs, sep = " ± ", remove = TRUE) %>%
  unite("Mean completeness",m_comp, sd_comp, sep = " ± ", remove = TRUE) %>%
  unite("Mean contamination",m_cont, sd_cont, sep = " ± ", remove = TRUE) %>%
  mutate(Source = "GTDB") %>%
  dplyr::select(Source, everything())

# Combine into one table
summary_table <- bind_rows(ehi_stats, gtdb_stats)

summary_table
# A tibble: 2 × 4
  Source `Mean genome size` `Mean contamination` `Mean completeness`
  <chr>  <chr>              <chr>                <chr>              
1 EHI    4.36 ± 0.45        1.41 ± 0.63          96 ± 3.4           
2 GTDB   4.84 ± 0.36        1.08 ± 0.61          98.42 ± 1.89       

4.0.2 Country

library(rnaturalearth)
library(rnaturalearthdata)
library(sf) 


world <- ne_countries(scale = "medium", returnclass = "sf")

mag_locations <- genome_metadata %>%
  filter(!is.na(country)) %>%
  left_join(
    world %>%
      dplyr::select(name, geometry) %>%
      st_centroid() %>%
      st_coordinates() %>%
      as_tibble() %>%
      bind_cols(world %>% dplyr::select(name)) ,
    by = c("country" = "name")
  )
Warning: st_centroid assumes attributes are constant over geometries
mag_locations
# A tibble: 91 × 28
   ID        species      completeness contamination genome_size    GC    N50 contigs host_species host_order host_class assembly_type
   <chr>     <chr>               <dbl>         <dbl>       <dbl> <dbl>  <dbl>   <dbl> <chr>        <chr>      <chr>      <chr>        
 1 EHM016228 s__Parabact…         97.3          0.92     4815605  45    32654     224 Strix aluco  Strigifor… Aves       Individual   
 2 EHM016318 s__Parabact…         93.6          1.81     4085923  45.3   7435     685 Strix aluco  Strigifor… Aves       Individual   
 3 EHM016582 s__Parabact…         92.2          1.17     3706610  45.7   7507     618 Strix aluco  Strigifor… Aves       Individual   
 4 EHM016991 s__Parabact…         99.0          0.38     4748219  44.9 117248      76 Podarcis pi… Squamata   Reptilia   Individual   
 5 EHM020917 s__Parabact…         90.3          1.73     3472909  45.6  11869     414 Strix aluco  Strigifor… Aves       Coassembly   
 6 EHM023585 s__Parabact…         92.8          2.18     3889519  45.6   8649     607 Sciurus vul… Rodentia   Mammalia   Coassembly   
 7 EHM023781 s__Parabact…         93.5          1.92     4159834  45.6  16836     369 Podarcis ga… Squamata   Reptilia   Coassembly   
 8 EHM027637 s__Parabact…         93.5          2.05     4160022  45.3  12792     526 Cathartes a… Accipitri… Aves       Coassembly   
 9 EHM028668 s__Parabact…        100.0          0.87     4684324  45    73796     107 Timon lepid… Squamata   Reptilia   Individual   
10 EHM029564 s__Parabact…         95.9          1.5      4476906  45    41913     152 Timon lepid… Squamata   Reptilia   Coassembly   
# ℹ 81 more rows
# ℹ 16 more variables: isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>, country <chr>, extra_locality <chr>,
#   source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>, mimag_medium_quality <lgl>, gtdb_representative <lgl>,
#   ncbi_date <date>, continent <chr>, X <dbl>, Y <dbl>, geometry <MULTIPOLYGON [°]>
ggplot(world) +
  geom_sf(fill = "gray95", color = "gray80") +
  geom_point(
    data = mag_locations,
    aes(x = X, y = Y, color = source),
    size = 2.5,
    alpha = 0.5
  ) +
  scale_color_manual(values = source_colors) +
  theme_minimal() +
  labs(color = "Source")
Warning: Removed 34 rows containing missing values or values outside the scale range (`geom_point()`).

plot_df <- genome_metadata %>%
  dplyr::filter(!is.na(host_species), !is.na(host_class))

species_plot <- ggplot(plot_df, aes(x = host_species, fill = source)) +
  geom_bar(position = "stack") +
  facet_wrap(~ host_class, scales = "free_x") +
  scale_fill_manual(values = source_colors, name = "Source") +
  theme_bw(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    strip.text = element_text(size = 14, face = "bold")
  ) +
  labs(
    x = "Host species",
    y = "Number of MAGs"
  )

ggsave("./plots/species_plot.png", species_plot, dpi = 300, units = "cm", width = 18, height = 10)
plot_df <- genome_metadata %>%
  dplyr::filter(!is.na(country))%>%
  dplyr::filter(country != "none")

library(stringr)

dist_plot <- ggplot(plot_df, aes(x = country, fill = source)) +
  geom_bar(position = "stack") +
  facet_grid(
    ~ continent,
    scales = "free_x",
    space  = "free_x",
    labeller = labeller(
      continent = function(x) str_wrap(x, width = 10)
    )
  ) +
  scale_fill_manual(values = source_colors, name = "Source") +
  theme_bw(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    strip.text = element_text(size = 12, face = "bold")
  ) +
  labs(
    x = "Country",
    y = "Number of MAGs"
  )

ggsave("./plots/dist_plot.png", dist_plot, dpi = 300, units = "cm", width = 30, height = 12)