Chapter 4 NCBI MAGs exploration

4.1 NCBI MAGs metadata

gut_keywords <- c("ceaca", "caeca", "excrement", "excrements", "rectal", "feces", 
                  "stool", "stools", "gut", "cloaca", "cloacal", "fecal", "intestines",
                  "intestine", "intestinal", "cecum", "ileum", "jejunum", "colon", "rectum", "rumen",
                  "anal", "anus", "manure", "faeces")

# Wrap each keyword in word boundaries: \bkeyword\b --otherwise artisanal passes the filter 
pattern <- paste0("\\b(", paste(gut_keywords, collapse = "|"), ")\\b")


# Filter the data
#gut_samples <- metadata_wide %>%
#  filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))

Download metadata for each taxa

# Trying to download ncbi metadata for mags

conda activate drakkar_env
conda install -c conda-forge ncbi-datasets-cli



## CITROBACTER BRAAKII
datasets summary genome taxon 57706 \
  --as-json-lines > genomes.jsonl


#filtering by contamination and completeness
jq -c '
select(
  .checkm_info.completeness >= 90 and
  .checkm_info.contamination <= 2.5
)
' genomes.jsonl > high_quality_genomes.jsonl



#put metadata in tsv
jq -r '
  .accession as $asm
  | .checkm_info.completeness as $comp
  | .checkm_info.contamination as $cont
  | .assembly_stats.total_sequence_length as $genome_size
  | .assembly_stats.gc_percent as $gc
  | .assembly_info.biosample.attributes[]?
  | [$asm, $comp, $cont, $genome_size, $gc, .name, .value]
  | @tsv
' high_quality_genomes.jsonl > citrobacter_biosample_attributes.tsv

4.1.1 Citrobacter braakii

c_braki_metadata <- read_tsv("data/mags_metadata/citrobacter_braakii_biosample_attributes.tsv",
  col_names = c(
    "accession",
    "completeness",
    "contamination",
    "genome_size",
    "gc_percent",
    "name",
    "value"
  )
)

wide <- c_braki_metadata %>%
  pivot_wider(
    id_cols = c(
      accession,
      completeness,
      contamination,
      genome_size,
      gc_percent
    ),
    names_from  = name,
    values_from = value
  )


source_counts <- wide %>%
  filter(!is.na(isolation_source)) %>%
  dplyr::count(isolation_source, sort = TRUE)

source_counts


host_counts <- wide %>%
  filter(!is.na(host)) %>%
  dplyr::count(host, sort = TRUE)

host_counts

wide %>%
  filter(host == "Homo sapiens")

citrobacter_gut <- wide %>%
  filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))

citrobacter_gut %>%tt()

## HUMAN
summary_counts <- citrobacter_gut %>%
  dplyr::summarize(
    total_rows = n(),
    human_samples = sum(
      host == "Homo sapiens" | 
      stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)), 
      na.rm = TRUE
    )
  )

print(summary_counts)

## OTHER HOSTS
other_hosts <- citrobacter_gut %>%
  dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
  dplyr::count(host, sort = TRUE)

print(other_hosts)

4.1.2 Lactococcus lactis

## LACTOCOCCUS LACTIS
datasets summary genome taxon 1358 \
  --as-json-lines > lactococcus_genomes.jsonl


#filtering by contamination and completeness
jq -c '
select(
  .checkm_info.completeness >= 90 and
  .checkm_info.contamination <= 2.5
)
' lactococcus_genomes.jsonl > lactococcus_high_quality_genomes.jsonl



#put metadata in tsv
jq -r '
  .accession as $asm
  | .checkm_info.completeness as $comp
  | .checkm_info.contamination as $cont
  | .assembly_stats.total_sequence_length as $genome_size
  | .assembly_stats.gc_percent as $gc
  | .assembly_info.biosample.attributes[]?
  | [$asm, $comp, $cont, $genome_size, $gc, .name, .value]
  | @tsv
' lactococcus_high_quality_genomes.jsonl > lactococcus_biosample_attributes.tsv
lactococcus_metadata <- read_tsv("data/mags_metadata/lactococcus_lactis_biosample_attributes.tsv",  col_names = c("assembly", "attribute", "value"))


metadata_wide <- lactococcus_metadata %>%
  filter(attribute != "assembly") %>%  # assembly is in some attribute column, causing issues
  pivot_wider(
    id_cols = assembly,
    names_from = attribute,
    values_from = value,
    values_fn = list(value = ~ paste(.x, collapse = "; "))
  )


lactococcus_gut <- metadata_wide %>%
  filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))

## HUMAN
summary_counts <- lactococcus_gut %>%
  dplyr::summarize(
    total_rows = n(),
    human_samples = sum(
      host == "Homo sapiens" | 
      stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)), 
      na.rm = TRUE
    )
  )

print(summary_counts)

## OTHER HOSTS
other_hosts <- lactococcus_gut %>%
  dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
  dplyr::count(host, sort = TRUE)

print(other_hosts)

4.1.3 Hafnia paralvei

## HAFNIA PARALVEI
datasets summary genome taxon 546367 \
  --as-json-lines > hafnia_genomes.jsonl


#filtering by contamination and completeness
jq -c '
select(
  .checkm_info.completeness >= 90 and
  .checkm_info.contamination <= 2.5
)
' hafnia_genomes.jsonl > hafnia_high_quality_genomes.jsonl



#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' hafnia_high_quality_genomes.jsonl > hafnia_biosample_attributes.tsv
hafnia_metadata <- read_tsv("data/ncbi/hafnia_paralvei_biosample_attributes.tsv",  col_names = c("assembly", "attribute", "value"))


metadata_wide <- hafnia_metadata %>%
  pivot_wider(
    id_cols = assembly,
    names_from = attribute,
    values_from = value,
    values_fn = function(x) x[1]
  )

hafnia_gut <- metadata_wide %>%
  filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))

## HUMAN
summary_counts <- hafnia_gut %>%
  dplyr::summarize(
    total_rows = n(),
    human_samples = sum(
      host == "Homo sapiens" | 
      stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)), 
      na.rm = TRUE
    )
  )

print(summary_counts)

## OTHER HOSTS
other_hosts <- hafnia_gut %>%
  dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
  dplyr::count(host, sort = TRUE)

print(other_hosts)

4.1.4 Enterococcus faecalis

## ENTEROCOCCUS FAECALIS
datasets summary genome taxon 1351 \
  --as-json-lines > enterococcus_faecalis_genomes.jsonl


#filtering by contamination and completeness
jq -c '
select(
  .checkm_info.completeness >= 90 and
  .checkm_info.contamination <= 2.5
)
' enterococcus_faecalis_genomes.jsonl > enterococcus_faecalis_high_quality_genomes.jsonl



#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' enterococcus_faecalis_high_quality_genomes.jsonl > enterococcus_faecalis_biosample_attributes.tsv
enterococcus_faecalis_metadata <- read_tsv("data/ncbi/enterococcus_faecalis_biosample_attributes.tsv",  col_names = c("assembly", "attribute", "value"))


metadata_wide <- enterococcus_faecalis_metadata %>%
  pivot_wider(
    id_cols = assembly,
    names_from = attribute,
    values_from = value,
    values_fn = function(x) x[1]
  )


enterococcus_faecalis_gut <- metadata_wide %>%
  filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))

## HUMAN
summary_counts <- enterococcus_faecalis_gut %>%
  dplyr::summarize(
    total_rows = n(),
    human_samples = sum(
      host == "Homo sapiens" | 
      stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)), 
      na.rm = TRUE
    )
  )

print(summary_counts)

## OTHER HOSTS
other_hosts <- enterococcus_faecalis_gut %>%
  dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
  dplyr::count(host, sort = TRUE)

print(other_hosts)

4.1.5 Enterococcus hirae

## ENTEROCOCCUS HIRAE
datasets summary genome taxon 1354 \
  --as-json-lines > enterococcus_genomes.jsonl


#filtering by contamination and completeness
jq -c '
select(
  .checkm_info.completeness >= 90 and
  .checkm_info.contamination <= 2.5
)
' enterococcus_genomes.jsonl > enterococcus_high_quality_genomes.jsonl



#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' enterococcus_high_quality_genomes.jsonl > enterococcus_biosample_attributes.tsv
enterococcus_metadata <- read_tsv("data/ncbi/enterococcus_hirae_biosample_attributes.tsv",  col_names = c("assembly", "attribute", "value"))


metadata_wide <- enterococcus_metadata %>%
  pivot_wider(
    id_cols = assembly,
    names_from = attribute,
    values_from = value,
    values_fn = function(x) x[1]
  )


enterococcus_gut <- metadata_wide %>%
  filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))

## HUMAN
summary_counts <- enterococcus_gut %>%
  dplyr::summarize(
    total_rows = n(),
    human_samples = sum(
      host == "Homo sapiens" | 
      stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)), 
      na.rm = TRUE
    )
  )

print(summary_counts)

## OTHER HOSTS
other_hosts <- enterococcus_gut %>%
  dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
  dplyr::count(host, sort = TRUE)

print(other_hosts)

4.1.6 Bacteroides uniformis

## BACTEROIDES UNIFORMIS
datasets summary genome taxon 820 \
  --as-json-lines > b_uniformis_genomes.jsonl


#filtering by contamination and completeness
jq -c '
select(
  .checkm_info.completeness >= 90 and
  .checkm_info.contamination <= 2.5
)
' b_uniformis_genomes.jsonl > b_uniformis_high_quality_genomes.jsonl



#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' b_uniformis_high_quality_genomes.jsonl > b_uniformis_biosample_attributes.tsv
b_uniformis_metadata <- read_tsv("data/ncbi/bacteroides_uniformis_biosample_attributes.tsv",  col_names = c("assembly", "attribute", "value"))


metadata_wide <- b_uniformis_metadata %>%
  filter(attribute != "assembly") %>%
  pivot_wider(
    id_cols = assembly,
    names_from = attribute,
    values_from = value,
    values_fn = function(x) x[1]
  )


b_uniformis_gut <- metadata_wide %>%
  filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))

## HUMAN
summary_counts <- b_uniformis_gut %>%
  dplyr::summarize(
    total_rows = n(),
    human_samples = sum(
      host == "Homo sapiens" | 
      stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)), 
      na.rm = TRUE
    )
  )

print(summary_counts)

## OTHER HOSTS
other_hosts <- b_uniformis_gut %>%
  dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
  dplyr::count(host, sort = TRUE)

print(other_hosts)

4.1.7 Phocaeicola vulgatus

## PHOCAEICOLA VULGATUS
datasets summary genome taxon 821 \
  --as-json-lines > p_vulgatus_genomes.jsonl


#filtering by contamination and completeness
jq -c '
select(
  .checkm_info.completeness >= 90 and
  .checkm_info.contamination <= 2.5
)
' p_vulgatus_genomes.jsonl > p_vulgatus_high_quality_genomes.jsonl



#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' p_vulgatus_high_quality_genomes.jsonl > p_vulgatus_biosample_attributes.tsv
p_vulgatus_metadata <- read_tsv("data/ncbi/p_vulgatus_biosample_attributes.tsv",  col_names = c("assembly", "attribute", "value"))


metadata_wide <- p_vulgatus_metadata %>%
  filter(attribute != "assembly") %>%
  pivot_wider(
    id_cols = assembly,
    names_from = attribute,
    values_from = value,
    values_fn = function(x) x[1]
  )


p_vulgatus_gut <- metadata_wide %>%
  filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))

## HUMAN
summary_counts <- p_vulgatus_gut %>%
  dplyr::summarize(
    total_rows = n(),
    human_samples = sum(
      host == "Homo sapiens" | 
      stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)), 
      na.rm = TRUE
    )
  )

print(summary_counts)

## OTHER HOSTS
other_hosts <- p_vulgatus_gut %>%
  dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
  dplyr::count(host, sort = TRUE)

print(other_hosts)

4.1.8 Parabacteroides goldsteinii

## PARABACTEROIDES GOLDSTEINII
datasets summary genome taxon 328812 \
  --as-json-lines > p_goldsteinii_genomes.jsonl


#filtering by contamination and completeness
jq -c '
select(
  .checkm_info.completeness >= 90 and
  .checkm_info.contamination <= 2.5
)
' p_goldsteinii_genomes.jsonl > p_goldsteinii_high_quality_genomes.jsonl



#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' p_goldsteinii_high_quality_genomes.jsonl > p_goldsteinii_biosample_attributes.tsv
p_goldsteinii_metadata <- read_tsv("data/ncbi/parabacteroides_goldsteinii_biosample_attributes.tsv",  col_names = c("assembly", "attribute", "value"))


metadata_wide <- p_goldsteinii_metadata %>%
  filter(attribute != "assembly") %>%
  pivot_wider(
    id_cols = assembly,
    names_from = attribute,
    values_from = value,
    values_fn = function(x) x[1]
  )


p_goldsteinii_gut <- metadata_wide %>%
  filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))

## HUMAN
summary_counts <- p_goldsteinii_gut %>%
  dplyr::summarize(
    total_rows = n(),
    human_samples = sum(
      host == "Homo sapiens" | 
      stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)), 
      na.rm = TRUE
    )
  )

print(summary_counts)

## OTHER HOSTS
other_hosts <- p_goldsteinii_gut %>%
  dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
  dplyr::count(host, sort = TRUE)

print(other_hosts)

4.1.9 Akkermansia municiphila

## AKKERMANSIA MUNICIPHILA
datasets summary genome taxon 239935 \
  --as-json-lines > akkermansia_genomes.jsonl


#filtering by contamination and completeness
jq -c '
select(
  .checkm_info.completeness >= 90 and
  .checkm_info.contamination <= 2.5
)
' akkermansia_genomes.jsonl > akkermansia_high_quality_genomes.jsonl



#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' akkermansia_high_quality_genomes.jsonl > akkermansia_biosample_attributes.tsv
akkermansia_metadata <- read_tsv("data/ncbi/akkermansia_muciniphila_biosample_attributes.tsv",  col_names = c("assembly", "attribute", "value"))


metadata_wide <- akkermansia_metadata %>%
  filter(attribute != "assembly") %>%
  pivot_wider(
    id_cols = assembly,
    names_from = attribute,
    values_from = value,
    values_fn = function(x) x[1]
  )


akkermansia_gut <- metadata_wide %>%
  filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))

## HUMAN
summary_counts <- akkermansia_gut %>%
  dplyr::summarize(
    total_rows = n(),
    human_samples = sum(
      host == "Homo sapiens" | 
      stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)), 
      na.rm = TRUE
    )
  )

print(summary_counts)

## OTHER HOSTS
other_hosts <- akkermansia_gut %>%
  dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
  dplyr::count(host, sort = TRUE)

print(other_hosts)

4.1.10 Bacteroides fragilis

## BACTEROIDES FRAGILIS
datasets summary genome taxon 817 \
  --as-json-lines > b_fragilis_genomes.jsonl


#filtering by contamination and completeness
jq -c '
select(
  .checkm_info.completeness >= 90 and
  .checkm_info.contamination <= 2.5
)
' b_fragilis_genomes.jsonl > b_fragilis_high_quality_genomes.jsonl



#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' b_fragilis_high_quality_genomes.jsonl > b_fragilis_biosample_attributes.tsv
b_fragilis_metadata <- read_tsv("data/ncbi/bacteroides_fragilis_biosample_attributes.tsv",  col_names = c("assembly", "attribute", "value"))


metadata_wide <- b_fragilis_metadata %>%
  filter(attribute != "assembly") %>%
  pivot_wider(
    id_cols = assembly,
    names_from = attribute,
    values_from = value,
    values_fn = function(x) x[1]
  )


b_fragilis_gut <- metadata_wide %>%
  filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))

## HUMAN
summary_counts <- b_fragilis_gut %>%
  dplyr::summarize(
    total_rows = n(),
    human_samples = sum(
      host == "Homo sapiens" | 
      stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)), 
      na.rm = TRUE
    )
  )

print(summary_counts)

## OTHER HOSTS
other_hosts <- b_fragilis_gut %>%
  dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
  dplyr::count(host, sort = TRUE)

print(other_hosts)

4.1.11 Providencia rettgeri

## PROVIDENCIA RETTGERI
datasets summary genome taxon 587 \
  --as-json-lines > providencia_genomes.jsonl


#filtering by contamination and completeness
jq -c '
select(
  .checkm_info.completeness >= 90 and
  .checkm_info.contamination <= 2.5
)
' providencia_genomes.jsonl > providencia_high_quality_genomes.jsonl



#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' providencia_high_quality_genomes.jsonl > providencia_biosample_attributes.tsv
providencia_metadata <- read_tsv("data/ncbi/providencia_rettgeri_biosample_attributes.tsv",  col_names = c("assembly", "attribute", "value"))


metadata_wide <- providencia_metadata %>%
  filter(attribute != "assembly") %>%
  pivot_wider(
    id_cols = assembly,
    names_from = attribute,
    values_from = value,
    values_fn = function(x) x[1]
  )


providencia_gut <- metadata_wide %>%
  filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))

## HUMAN
summary_counts <- providencia_gut %>%
  dplyr::summarize(
    total_rows = n(),
    human_samples = sum(
      host == "Homo sapiens" | 
      stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)), 
      na.rm = TRUE
    )
  )

print(summary_counts)

## OTHER HOSTS
other_hosts <- providencia_gut %>%
  dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
  dplyr::count(host, sort = TRUE)

print(other_hosts)
species <- c(
  "lactococcus_lactis",
  "hafnia_paralvei",
  "enterococcus_faecalis",
  "enterococcus_hirae",
  "bacteroides_uniformis",
  "phocaeicola_vulgatus",
  "parabacteroides_goldsteinii",
  "akkermansia_muciniphila",
  "bacteroides_fragilis",
  "providencia_rettgeri",
  "citrobacter_braakii"
)

for (sp in species) {

  message("Processing ", sp)

  infile <- file.path(
    "data/mags_metadata",
    paste0(sp, "_biosample_attributes.tsv")
  )

  metadata_long <- read_tsv(
    infile,
    col_names = c(
      "accession",
      "completeness",
      "contamination",
      "genome_size",
      "gc_percent",
      "name",
      "value"
    ),
    show_col_types = FALSE
  )

  metadata_wide <- metadata_long %>%
    pivot_wider(
      id_cols = c(
        accession,
        completeness,
        contamination,
        genome_size,
        gc_percent
      ),
      names_from  = name,
      values_from = value,
      values_fn   = ~ dplyr::first(na.omit(.x)),
      names_repair = "unique"
    )

  ## filter gut-associated genomes
  if ("isolation_source" %in% colnames(metadata_wide)) {

    gut_metadata <- metadata_wide %>%
      filter(
        str_detect(
          isolation_source,
          regex(pattern, ignore_case = TRUE)
        )
      )

  } else {
    warning(sp, " has no isolation_source column")
    gut_metadata <- metadata_wide[0, ]
  }

  ## save ONLY the gut-filtered metadata
  saveRDS(
    gut_metadata,
    file.path(
      "data/mags_metadata",
      paste0(sp, "_ncbi_metadata.rds")
    )
  )

  selected_accessions <- gut_metadata %>%
    pull(accession) %>%
    unique()

  selected_accessions <- selected_accessions[!is.na(selected_accessions)]

  writeLines(
    selected_accessions,
    file.path(
      "data/mags_metadata",
      paste0(sp, "_ncbi_selected_accessions.txt")
    )
  )
}

4.2 Downloading the MAGs after filtering

Download the MAGs when you finish selecting them

### DOWNLOAD THE MAGS

datasets download genome accession --inputfile ../bacteroides_unifmormis_ncbi_selected_accessions.txt --include genome --filename hafnia_paralvei_selected_genomes.zip
mags_gtdb <- read_tsv("data/mags_metadata/master_mag_index_lactococcus_lactis.tsv") %>% pull(ID)

intersect(selected, mags_gtdb)
# in selected but NOT in mags_gtdb
setdiff(selected, mags_gtdb)
# in mags_gtdb but NOT in selected
setdiff(mags_gtdb, selected)

4.3 Data preparation

4.3.1 EHI MAGs

ehi_mags <- read_csv("data/ehi_mags.csv")
Rows: 8846 Columns: 52
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (36): ID, mag_name, link_to_assembly, AB_batch, eha_number, GTDB_version, GTDB_release, domain, phylum, ...
dbl (15): fastani_ani, closest_placement_ani, closest_placement_af, completeness, contamination, size, N50, ...
lgl  (1): annotated

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

4.3.2 Prepare color scheme

AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.

ehi_mags_p <- ehi_mags %>%
  mutate(phylum=str_remove_all(phylum, "p__")) 

phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    mutate(phylum=str_remove_all(phylum, "p__")) %>%
    right_join(ehi_mags_p, by=join_by(phylum == phylum)) %>%
    dplyr::select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)
source_colors <- c(EHI = "#8BC63F", GTDB = "#2D522D", NCBI= "#20558A")

host_type_colors <- c(human = "#A90D00" , animal = "darkgreen") 

host_order_colors <- c(
  "Accipitriformes" = "#143AC2",  # Aves
  "Carnivora"       = "#633C2C",  # Mammalia
  "Chiroptera"      = "#1B2021",  # Mammalia
  "Primates"        = "#CC0000",  # Mammalia
  "Rodentia"        = "#FDA991",  # Mammalia
  "Diptera"         = "#FFAE2B",  # Insecta
  "Isoptera"        = "#E27500",  # Insecta
  "Lepidoptera"     = "#C8591D",  # Insecta
  "Squamata"        = "#1D921B"   # Reptilia
)

4.3.3 Sample metadata

genome_metadata <- read_tsv("data/mags_metadata/lactococcus_lactis_FINAL_metadata.tsv")
Rows: 129 Columns: 45
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (28): ID, source, species, gtdb_taxonomy, host_species, host_order, host_class, isolation_source, host, ...
dbl  (8): completeness, contamination, genome_size, GC, N50, contigs, latitude, longitude
lgl  (9): host_summary, gtdb_representative, mimag_high_quality, mimag_medium_quality, host_status, disease,...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Helper across the pipeline
is_blank <- function(x) is.na(x) | x == ""

# 1) Recreate host_text (as you had)
genome_metadata <- genome_metadata %>%
  mutate(
    host_text = str_to_lower(
      str_squish(
        paste(
          coalesce(host_species, ""),
          coalesce(host, ""),
          coalesce(isolation_source, ""),
          coalesce(env_broad, ""),
          coalesce(env_local_context, ""),
          sep = " ; "
        )
      )
    )
  )

# 2) Infer host_species ONLY when missing/empty — now with your new species
genome_metadata <- genome_metadata %>%
  mutate(
    host_species = dplyr::case_when(
      # keep explicit host_species if present
      !is_blank(host_species) ~ host_species,

      # Human
      str_detect(host_text, "homo sapiens|\\bhuman\\b|patient|adult|child|infant") ~ "Homo sapiens",

      # Mouse
      str_detect(host_text, "mus musculus|\\bmouse\\b|\\bmice\\b|murine") ~ "Mus musculus",

      # Rat
      str_detect(host_text, "rattus|\\brat\\b") ~ "Rattus norvegicus",

      # Pig / swine
      str_detect(host_text, "sus scrofa|porcine|\\bpig\\b|swine") ~ "Sus scrofa",

      # Cow / bovine
      str_detect(host_text, "bos taurus|bovine|\\bcow\\b|cattle") ~ "Bos taurus",

      # Dog / canine (normalize variants)
      str_detect(host_text, "canis lupus familiaris|canis familiaris|\\bdog\\b|canine") ~ "Canis lupus familiaris",

      # Chicken / poultry
      str_detect(host_text, "gallus gallus|\\bchicken\\b|poultry") ~ "Gallus gallus",

      # Silkworm
      str_detect(host_text, "bombyx mori|silkworm") ~ "Bombyx mori",

      # Drosophila / fruit fly
      str_detect(host_text, "drosophila|fruit fly") ~ "Drosophila melanogaster",

      # Termite
      str_detect(host_text, "\\btermite\\b") ~ "Termite",

      # Zebrafish
      str_detect(host_text, "danio rerio|zebrafish") ~ "Danio rerio",

      # Generic fish (also catch “fishbut”-style glued tokens)
      str_detect(host_text, "\\bfish\\w*\\b") ~ "Actinopterygii (fish)",

      # Bats and other taxa (from your counts)
      str_detect(host_text, "barbastella barbastellus") ~ "Barbastella barbastellus",
      str_detect(host_text, "glauconycteris\\s*sp\\.?") ~ "Glauconycteris sp",
      str_detect(host_text, "hipposideros ruber") ~ "Hipposideros ruber",
      str_detect(host_text, "rhinolophus capensis") ~ "Rhinolophus capensis",

      # Vulture
      str_detect(host_text, "cathartes aura") ~ "Cathartes aura",

      # Grasshopper
      str_detect(host_text, "\\bgrasshopper\\b") ~ "Grasshopper",

      # New primate + fish species from your list
      str_detect(host_text, "macaca fascicularis") ~ "Macaca fascicularis",
      str_detect(host_text, "oncorhynchus mykiss") ~ "Oncorhynchus mykiss",
      str_detect(host_text, "oreochromis niloticus") ~ "Oreochromis niloticus",

      TRUE ~ host_species
    )
  )

# 3) Normalize existing host_species text so class/order rules match reliably
genome_metadata <- genome_metadata %>%
  mutate(
    hs_lower = str_to_lower(host_species),
    host_species = dplyr::case_when(
      # canonicalize common variants
      str_detect(hs_lower, "^canis(\\s+lupus)?\\s+familiaris$") ~ "Canis lupus familiaris",
      str_detect(hs_lower, "^drosophila melanogaster$") ~ "Drosophila melanogaster",
      str_detect(hs_lower, "^bombyx mori$") ~ "Bombyx mori",
      str_detect(hs_lower, "^danio rerio$") ~ "Danio rerio",
      str_detect(hs_lower, "^homo sapiens$") ~ "Homo sapiens",

      # catch glued fish tokens (e.g., "fishbut", "fishsomething")
      str_detect(hs_lower, "^fish\\w*$") ~ "Actinopterygii (fish)",

      TRUE ~ host_species
    )
  ) %>%
  dplyr::select(-hs_lower)

# 4) Fill host_class / host_order ONLY when missing — extended with your species
genome_metadata <- genome_metadata %>%
  mutate(
    # CLASS
    host_class = dplyr::case_when(
      !is_blank(host_class) ~ host_class,

      str_detect(coalesce(host_species, ""), "Homo sapiens|Mus musculus|Rattus norvegicus|Sus scrofa|Bos taurus|Canis lupus familiaris|Barbastella barbastellus|Glauconycteris sp|Hipposideros ruber|Rhinolophus capensis|Macaca fascicularis") ~ "Mammalia",
      str_detect(coalesce(host_species, ""), "Gallus gallus|Cathartes aura") ~ "Aves",
      str_detect(coalesce(host_species, ""), "Drosophila melanogaster|Bombyx mori|Termite|Grasshopper") ~ "Insecta",
      str_detect(coalesce(host_species, ""), "Danio rerio|Oncorhynchus mykiss|Oreochromis niloticus|Actinopterygii") ~ "Actinopterygii",

      TRUE ~ host_class
    ),

    # ORDER
    host_order = dplyr::case_when(
      !is_blank(host_order) ~ host_order,

      # Mammals
      str_detect(coalesce(host_species, ""), "Homo sapiens|Macaca fascicularis") ~ "Primates",
      str_detect(coalesce(host_species, ""), "Mus musculus|Rattus norvegicus") ~ "Rodentia",
      str_detect(coalesce(host_species, ""), "Sus scrofa|Bos taurus") ~ "Artiodactyla",
      str_detect(coalesce(host_species, ""), "Canis lupus familiaris") ~ "Carnivora",
      str_detect(coalesce(host_species, ""), "Barbastella barbastellus|Glauconycteris sp|Hipposideros ruber|Rhinolophus capensis") ~ "Chiroptera",

      # Birds
      str_detect(coalesce(host_species, ""), "Gallus gallus") ~ "Galliformes",
      # Choose one scheme and be consistent; using Accipitriformes here
      str_detect(coalesce(host_species, ""), "Cathartes aura") ~ "Accipitriformes",

      # Insects
      str_detect(coalesce(host_species, ""), "Drosophila melanogaster") ~ "Diptera",
      str_detect(coalesce(host_species, ""), "Bombyx mori") ~ "Lepidoptera",
      str_detect(coalesce(host_species, ""), "Termite") ~ "Isoptera",
      str_detect(coalesce(host_species, ""), "Grasshopper") ~ "Orthoptera",

      # Fish (species-level)
      str_detect(coalesce(host_species, ""), "Danio rerio") ~ "Cypriniformes",
      str_detect(coalesce(host_species, ""), "Oncorhynchus mykiss") ~ "Salmoniformes",
      str_detect(coalesce(host_species, ""), "Oreochromis niloticus") ~ "Cichliformes",

      # Generic fish remains NA (unknown order)
      str_detect(coalesce(host_species, ""), "Actinopterygii \\(fish\\)") ~ NA_character_,

      TRUE ~ host_order
    )
  ) %>%
  # Drop helper column
  dplyr::select(-host_text)
genome_metadata %>%
  filter(is.na(host_class) | is.na(host_order) | host_class == "" | host_order == "") %>%
  dplyr::count(host_species, sort = TRUE)
# A tibble: 3 × 2
  host_species              n
  <chr>                 <int>
1 Actinopterygii (fish)     3
2 Cabbage kimchi            1
3 <NA>                      1
genome_metadata %>%
  filter(host_species %in% c("Barbastella barbastellus", "Hipposideros ruber", "Glauconycteris sp",
                             "Cathartes aura", "Actinopterygii (fish)", "Grasshopper")) %>%
  dplyr::count(host_species, host_class, host_order, sort = TRUE)
# A tibble: 5 × 4
  host_species             host_class     host_order          n
  <chr>                    <chr>          <chr>           <int>
1 Cathartes aura           Aves           Accipitriformes     4
2 Actinopterygii (fish)    Actinopterygii <NA>                3
3 Hipposideros ruber       Mammalia       Chiroptera          2
4 Barbastella barbastellus Mammalia       Chiroptera          1
5 Grasshopper              Insecta        Orthoptera          1
library(countrycode)

# 1) Simplify country strings like "Country: Region"
genome_metadata <- genome_metadata %>%
  mutate(
    country_simple = str_trim(str_split_fixed(coalesce(country, ""), ":", 2)[, 1]),
    country_simple = na_if(country_simple, "")
  )

# 2) Normalization map for common ENA/NCBI variants
.country_map <- c(
  # US variants
  "usa" = "United States",
  "u\\.s\\." = "United States",
  "u\\.s\\.a\\." = "United States",
  "united states of america" = "United States",

  # UK variants
  "uk" = "United Kingdom",
  "u\\.k\\." = "United Kingdom",
  "great britain" = "United Kingdom",
  "england" = "United Kingdom",
  "scotland" = "United Kingdom",
  "wales" = "United Kingdom",
  "northern ireland" = "United Kingdom",

  # Korea variants
  "korea,? republic of" = "South Korea",
  "republic of korea"   = "South Korea",
  "south korea"         = "South Korea",
  "korea,? \\(south\\)" = "South Korea",

  # China variants
  "pr china" = "China",
  "p\\.r\\. china" = "China",

  # Russia variants
  "russian federation" = "Russia",

  # Czech variants
  "czech republic" = "Czechia",

  # Eswatini / Swaziland; Côte d’Ivoire; DRC
  "swaziland" = "Eswatini",
  "cote d['’]ivoire" = "Côte d’Ivoire",
  "democratic republic of the congo" = "DR Congo",

  # Others commonly seen in ENA
  "viet nam" = "Vietnam",
  "myanmar \\(burma\\)" = "Myanmar",
  "bolivia \\(plurinational state of\\)" = "Bolivia",
  "iran,? islamic republic of" = "Iran",
  "syrian arab republic" = "Syria",
  "moldova,? republic of" = "Moldova",
  "lao people'?s democratic republic" = "Laos",
  "macedonia,? the former yugoslav republic of" = "North Macedonia",
  "palestine,? state of" = "Palestine",
  "hong kong" = "Hong Kong",
  "macau|macao" = "Macau"
)

# 3) Apply normalization: lowercase → map → title‑case fallback
normalize_country <- function(x) {
  y <- str_trim(tolower(coalesce(x, "")))
  y[y == ""] <- NA_character_

  # Apply regex replacements from the map (left to right)
  for (pat in names(.country_map)) {
    repl <- .country_map[[pat]]
    y <- ifelse(!is.na(y), str_replace_all(y, paste0("^", pat, "$"), repl), y)
  }

  # General cleanups for accents/spacing variants
  y <- ifelse(!is.na(y), str_replace_all(y, "\\s+", " "), y)
  y <- ifelse(!is.na(y), str_replace_all(y, "^people's republic of china$", "China"), y)

  # If still lowercased plain words (no mapping hit), title‑case them
  # (won’t fix every case, but keeps things readable)
  y <- ifelse(!is.na(y), str_to_title(y), y)

  # Final tidy: replace common leftover patterns
  y <- ifelse(!is.na(y), str_replace_all(y, " And ", " and "), y)  # cosmetic
  y
}


genome_metadata <- genome_metadata %>%
  dplyr::mutate(
    country_normalized = normalize_country(country_simple),
    continent = countrycode::countrycode(
      sourcevar   = country_normalized,
      origin      = "country.name",
      destination = "continent",
      warn        = TRUE
    )
  )
Warning: There was 1 warning in `dplyr::mutate()`.
ℹ In argument: `continent = countrycode::countrycode(...)`.
Caused by warning:
! Some values were not matched unambiguously: None
genome_metadata <- genome_metadata %>%
  # Normalize casing/whitespace for host_species
  mutate(
    host_species_norm = str_squish(host_species),
    host_species_norm = if_else(
      str_detect(str_to_lower(coalesce(host_species_norm, "")), "^homo\\s+sapiens$"),
      "Homo sapiens",
      # Title-case other species names to keep them tidy (optional)
      str_to_title(host_species_norm)
    )
  ) %>%
  # Human vs animal classification
  mutate(
    host_type = case_when(
      host_species_norm == "Homo sapiens" ~ "human",
      !is.na(host_species_norm) & host_species_norm != "" ~ "animal",
      TRUE ~ NA_character_
    )
  ) %>%
  # Keep your normalized species as the main column (optional)
  mutate(host_species = host_species_norm) %>%
  dplyr::select(-host_species_norm)
# Read master index
master_index <- read_tsv("data/mags_metadata/master_mag_index_lactococcus_lactis.tsv") %>%
  mutate(
    # Extract the actual genome identifier from out_path
    genome_identifier = case_when(
      # For EHI: extract EHA00531_bin.1 from the filename
      str_detect(out_path, "EHA") ~ str_extract(out_path, "EHA[0-9]+_bin\\.[0-9]+"),
      # For NCBI: use the ID as is (GCA/GCF number)
      TRUE ~ ID
    )
  )
Rows: 116 Columns: 4
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (4): species, ID, MAG_url, out_path

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Check the mapping
print("Master index mapping:")
[1] "Master index mapping:"
master_index %>% 
  dplyr::select(ID, genome_identifier, out_path) %>%
  head(10) %>%
  print()
# A tibble: 10 × 3
   ID        genome_identifier out_path                                          
   <chr>     <chr>             <chr>                                             
 1 EHM013277 EHA00364_bin.25   data/lactococcus_lactis/mags/EHA00364_bin.25.fa.gz
 2 EHM015868 EHA00350_bin.27   data/lactococcus_lactis/mags/EHA00350_bin.27.fa.gz
 3 EHM016594 EHA00567_bin.9    data/lactococcus_lactis/mags/EHA00567_bin.9.fa.gz 
 4 EHM017294 EHA00776_bin.9    data/lactococcus_lactis/mags/EHA00776_bin.9.fa.gz 
 5 EHM019886 EHA00211_bin.10   data/lactococcus_lactis/mags/EHA00211_bin.10.fa.gz
 6 EHM020035 EHA00169_bin.40   data/lactococcus_lactis/mags/EHA00169_bin.40.fa.gz
 7 EHM025059 EHA01453_bin.48   data/lactococcus_lactis/mags/EHA01453_bin.48.fa.gz
 8 EHM025679 EHA01477_bin.63   data/lactococcus_lactis/mags/EHA01477_bin.63.fa.gz
 9 EHM027656 EHA01634_bin.31   data/lactococcus_lactis/mags/EHA01634_bin.31.fa.gz
10 EHM028200 EHA01709_bin.8    data/lactococcus_lactis/mags/EHA01709_bin.8.fa.gz 
# Read contig-to-genome mapping
contig_to_genome <- read_tsv("data/mags_metadata/lactococcus_lactis_contig_to_mag.tsv",
                              col_names = c("contig", "genome_filename"))
Rows: 10206 Columns: 2
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): contig, genome_filename

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

4.3.4 Genome annotations

# Read annotations and add IDs
genome_annotations <- read_tsv("data/mags_metadata/lactococcus_lactis_gene_annotations.tsv.xz") %>%
  mutate(contig = sub("_[^_]*$", "", gene)) %>%
  left_join(contig_to_genome, by = "contig") %>%
  mutate(genome= genome_filename)%>%
   filter(!is.na(genome))
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 365396 Columns: 17
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (13): gene, strand, kegg, ec, pfam, cazy, resistance_type, resistance_target, vf, vf_type, signalp, defe...
dbl  (2): start, end
lgl  (2): antidefense, antidefense_type

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

4.3.4.1 REMOVE MAGs that do NOT belong to the species!! (specific lactococcus, rest should be fine)

# Cleaner to normalize identifiers
clean_label <- function(x) {
  x <- basename(x)
  sub("\\.(fna|fa|fasta)(\\.gz)?$", "", x, ignore.case = TRUE)
}

# List to remove (MAGs that belong to other species <95% ANI)
mags_to_remove <- c(
  "GCA_018369575.1.fna","GCA_947063445.1.fna","GCA_947101685.1.fna","GCA_948698275.1.fna",
  "GCA_937910935.1.fna","GCA_947072755.1.fna","GCA_948655095.1.fna","GCA_948703095.1.fna",
  "GCA_947041925.1.fna","GCA_947073355.1.fna","GCA_948675165.1.fna","GCA_948718815.1.fna"
)
mags_to_remove_clean <- clean_label(mags_to_remove)

# ---------- genome_metadata: remove + dedup (keep best per ID) ----------
stopifnot("ID" %in% names(genome_metadata))

gm0 <- genome_metadata %>%
  mutate(ID_clean = clean_label(ID))

n_before_meta <- nrow(gm0)
n_to_remove_meta <- sum(gm0$ID_clean %in% mags_to_remove_clean)

genome_metadata_filtered <- gm0 %>%
  filter(!ID_clean %in% mags_to_remove_clean)

has_comp    <- "completeness"   %in% names(genome_metadata_filtered)
has_cont    <- "contamination"  %in% names(genome_metadata_filtered)
has_contigs <- "contigs"        %in% names(genome_metadata_filtered)

if (has_comp || has_cont || has_contigs) {
  genome_metadata_dedup <- genome_metadata_filtered %>%
    mutate(
      completeness  = if (has_comp)    as.numeric(completeness)  else NA_real_,
      contamination = if (has_cont)    as.numeric(contamination) else NA_real_,
      contigs       = if (has_contigs) as.numeric(contigs)       else NA_real_
    ) %>%
    arrange(
      ID_clean,
      desc(if (has_comp) completeness else 0),
      (if (has_cont) contamination else 0),
      (if (has_contigs) contigs else Inf)
    ) %>%
    distinct(ID_clean, .keep_all = TRUE)
} else {
  genome_metadata_dedup <- genome_metadata_filtered %>%
    distinct(ID_clean, .keep_all = TRUE)
}

genome_metadata_dedup <- genome_metadata_dedup %>%
  select(-ID_clean)

# ---------- genome_annotations: ONLY filter (NO dedup) ----------
stopifnot("genome" %in% names(genome_annotations))

ga0 <- genome_annotations %>%
  mutate(genome_clean = clean_label(genome))

n_before_ann <- nrow(ga0)
n_to_remove_ann <- sum(ga0$genome_clean %in% mags_to_remove_clean)

# Remove problematic genomes; keep all remaining genes
genome_annotations_filtered <- ga0 %>%
  filter(!genome_clean %in% mags_to_remove_clean) %>%
  select(-genome_clean)

# ---------- report ----------
cat("genome_metadata:\n")
genome_metadata:
cat("  - rows before:", n_before_meta, "\n")
  - rows before: 129 
cat("  - removed by species list:", n_to_remove_meta, "\n")
  - removed by species list: 0 
cat("  - rows after remove:", nrow(genome_metadata_filtered), "\n")
  - rows after remove: 129 
cat("  - rows after dedup:", nrow(genome_metadata_dedup), "\n\n")
  - rows after dedup: 129 
cat("genome_annotations:\n")
genome_annotations:
cat("  - rows before:", n_before_ann, "\n")
  - rows before: 335828 
cat("  - removed by species list:", n_to_remove_ann, "\n")
  - removed by species list: 0 
cat("  - rows after remove:", nrow(genome_annotations_filtered), "\n")
  - rows after remove: 335828 
# (optional) assign back
genome_metadata   <- genome_metadata_dedup
genome_annotations <- genome_annotations_filtered

4.3.5 Distill annotations into GIFTs

genome_gifts <- distill(genome_annotations,GIFT_db,genomecol= 19, annotcol=c(5,6,7,8), verbosity = F)

Identifiers in the annotation table: 1293 
Identifiers in the database: 1547 
Identifiers in both: 148 
Percentage of annotation table identifiers used for distillation: 11.45%
Percentage of database identifiers used for distillation: 9.57%

4.3.6 Load trees

genome_metadata$mag_name <- sub("\\.fa$", "", genome_metadata$mag_name)#remove .fa from the mag names so that they match the tree ids