Chapter 11 Functions and helpers

source_colors <- c(EHI = "#8BC63F", GTDB = "#2D522D", NCBI= "#20558A")

host_type_colors <- c(human = "#A90D00" , animal = "darkgreen") 

host_order_colors <- c(
  "Accipitriformes" = "#143AC2",  # Aves
  "Carnivora"       = "#633C2C",  # Mammalia
  "Chiroptera"      = "#1B2021",  # Mammalia
  "Primates"        = "#CC0000",  # Mammalia
  "Rodentia"        = "#FDA991",  # Mammalia
  "Diptera"         = "#FFAE2B",  # Insecta
  "Isoptera"        = "#E27500",  # Insecta
  "Lepidoptera"     = "#C8591D",  # Insecta
  "Squamata"        = "#1D921B"   # Reptilia
)

country_palette <- c(
  # Southern Europe
  "Spain" = "#1B9E77",
  "Italy" = "#33A02C",
  "Greece" = "#66C2A5",
  "Portugal" = "#2CA25F",
  "Malta" = "#99D8C9",

  # Northern/Central Europe
  "Germany" = "#1F78B4",
  "United Kingdom" = "#4A90E2",
  "Ireland" = "#6BAED6",

  # East Asia 
  "Japan" = "#E31A1C",
  "South Korea" = "#FB6A4A",
  "China" = "#CB181D",

  # North America 
  "USA" = "#756BB1",
  "Canada" = "#9E9AC8",

  # Distinct
  "Australia" = "#FFD92F",   
  "Greenland" = "#A6CEE3",

  "none" = "grey70"
)

11.1 DATA PREPARATION

11.1.1 Load all the data

## Load all the data 
load_species_files <- function(species, base_dir = "data/mags_metadata") {

  paths <- list(
    ehi_metadata       = file.path(base_dir, paste0(species, "_metadata.tsv")), #EH metadata
    gtdb_final         = file.path(base_dir, paste0(species, "_gtdb_final_metadata.tsv")), #GTDB metadata
    contig_map         = file.path(base_dir, paste0(species, "_contig_to_mag.tsv")), #contig map to genome
    gene_annotations   = file.path(base_dir, paste0(species, "_gene_annotations.tsv.xz")), #functional annotations
    ncbi_metadata_rds  = file.path(base_dir, paste0(species, "_ncbi_metadata.rds")),  #NCBI metadata
    ndb                = file.path(base_dir, paste0(species, "_Ndb.csv")) # FASTANI
  )

  load_if_exists <- function(path, loader) {
    if (file.exists(path)) loader(path) else NULL
  }

  result <- list(
    ehi_metadata     = load_if_exists(paths$ehi_metadata,       function(p) readr::read_tsv(p, show_col_types = FALSE)),
    gtdb_final       = load_if_exists(paths$gtdb_final,         function(p) readr::read_tsv(p, show_col_types = FALSE)),
    contig_map       = load_if_exists(paths$contig_map,         function(p) readr::read_tsv(p, show_col_types = FALSE)),
    gene_annotations = load_if_exists(paths$gene_annotations,   function(p) readr::read_tsv(p, show_col_types = FALSE)),
    ndb              = load_if_exists(paths$ndb,   function(p) readr::read_csv(p, show_col_types = FALSE)),
    ncbi_metadata    = load_if_exists(paths$ncbi_metadata_rds,  readRDS))

  return(result)
}
# Find the most plausible genome size column and rename to genome_size
detect_and_rename_genome_size <- function(df) {
  if (is.null(df)) return(df)

  # If exact already present, nothing to do
  if ("genome_size" %in% names(df)) return(df)

  # Common explicit variants (ordered by preference)
  candidates <- c(
    # EHI 
    "size", "Size", "SIZE",
    # NCBI 
    "genome_size...4", "genome_size...43",
    # other common labels
    "size_bp", "Size_bp", "genome_size_bp",
    "total_length", "assembly_length", "estimated_size"
  )

  hit <- intersect(candidates, names(df))
  if (length(hit) >= 1) {
    # If multiple hits appear, prefer the first listed above
    df <- dplyr::rename(df, genome_size = !!hit[1])
    return(df)
  }

  # Fallback: first column containing "genome_size" or "size" or "length"
  fuzzy <- grep("(genome_)?size|length", names(df), ignore.case = TRUE, value = TRUE)
  if (length(fuzzy) >= 1) {
    df <- dplyr::rename(df, genome_size = !!fuzzy[1])
    return(df)
  }

  df
}

# Normalize 'n50' (lowercase in some NCBI exports) to 'N50'
detect_and_rename_n50 <- function(df) {
  if (is.null(df)) return(df)
  if (!("N50" %in% names(df)) && ("n50" %in% names(df))) {
    df <- dplyr::rename(df, N50 = n50)
  }
  df
}

# Enforce consistent types across sources before bind_rows()
harmonize_schema <- function(df) {
  if (is.null(df)) return(NULL)


  # Column name standardization

  rename_map <- c(
    "isolation_source.x" = "isolation_source",
    "isolation_source.y" = "isolation_source",
    "isolation_source_x" = "isolation_source",
    "isolation_source_y" = "isolation_source",

    "geo_loc_name"       = "country",
    "geographic location (latitude)"  = "latitude",
    "geographic location (longitude)" = "longitude"
  )

  # Rename if present
  for (old in names(rename_map)) {
    if (old %in% names(df)) {
      names(df)[names(df) == old] <- rename_map[[old]]
    }
  }


  #  Define expected schema

  char_cols <- c(
    "ID","source","species","gtdb_taxonomy","gtdb_representative",
    "isolation_source","host","host_status","disease","diagnosis",
    "host_species","host_order","host_class","host_summary",
    "country","locality","collection_date","ncbi_biosample",
    "mag_name","eha_number","common_name","sample_name","external_id","submitter_id",
    "env_broad","env_medium","env_local","env_local_context"
  )

  num_cols <- c(
    "completeness","contamination","genome_size","GC","N50","contigs",
    "latitude","longitude"
  )

  logi_cols <- c("mimag_high_quality","mimag_medium_quality")

  #  Add missing columns

  for (cc in char_cols) if (!cc %in% names(df)) df[[cc]] <- NA_character_
  for (nc in num_cols)  if (!nc %in% names(df)) df[[nc]] <- NA_real_
  for (lc in logi_cols) if (!lc %in% names(df)) df[[lc]] <- NA


  #  Type-coerce consistently

  df <- df %>%
    dplyr::mutate(
      dplyr::across(
        dplyr::all_of(char_cols),
        ~ as.character(.x)
      ),
      dplyr::across(
        dplyr::all_of(num_cols),
        ~ suppressWarnings(as.numeric(.x))
      ),
      dplyr::across(
        dplyr::all_of(logi_cols),
        ~ as.logical(.x)
      )
    )


  #  Return cleaned df

  df
}


# ---- Prepare gene annotations: add the genome with contig_map ----
prep_gene_annotations_with_genome <- function(ann_raw, contig_map_raw) {
  if (is.null(ann_raw)) return(NULL)
  if (is.null(contig_map_raw)) {
    stop("contig_map is required to attach 'genome' to gene_annotations.")
  }

  # --- Standardize contig_map to columns: contig, genome ---
  cm <- contig_map_raw

  contig_col_candidates <- intersect(names(cm), c("contig", "contig_id", "contig_name", "scaffold"))
  if (length(contig_col_candidates) == 0) {
    stop("contig_map must have a contig column (one of: contig, contig_id, contig_name, scaffold).")
  }

  genome_col_candidates <- intersect(
    names(cm),
    c("genome", "genome_filename", "mag", "bin", "genome_id", "assembly")
  )
  if (length(genome_col_candidates) == 0) {
    stop("contig_map must have a genome column (e.g., genome, genome_filename, mag, bin, genome_id, assembly).")
  }

  cm <- cm %>%
    dplyr::rename(
      contig = !!rlang::sym(contig_col_candidates[1]),
      genome = !!rlang::sym(genome_col_candidates[1])
    ) %>%
    dplyr::mutate(
      contig = as.character(contig),
      genome = as.character(genome)
    )

  # --- Ensure annotations have a contig column ---
  an <- ann_raw

  if (!("contig" %in% names(an))) {
    # Try to derive from a gene-like column by stripping the last _segment
    gene_col_candidates <- intersect(names(an), c("gene", "gene_id", "locus_tag", "feature_id"))
    if (length(gene_col_candidates) == 0) {
      stop("gene_annotations must have `contig` or a gene-like column (gene/gene_id/locus_tag/feature_id).")
    }

    an <- an %>%
      dplyr::rename(.gene_tmp = !!rlang::sym(gene_col_candidates[1])) %>%
      dplyr::mutate(contig = sub("_[^_]*$", "", .data$.gene_tmp)) %>%
      dplyr::select(-dplyr::any_of(".gene_tmp"))
  }

  # --- Join to add genome and filter out missing ---
  an <- an %>%
    dplyr::mutate(contig = as.character(contig)) %>%
    dplyr::left_join(cm, by = "contig") %>%
    dplyr::filter(!is.na(.data$genome))

  return(an)
}


clean_biosample_value <- function(v) {
  if (is.na(v)) return(NA_character_)
  v <- str_trim(v)
  m <- str_match(v, '^Attribute\\s+"(.*)"$')
  if (!is.na(m[1,2])) v <- str_trim(m[1,2])
  v_low <- tolower(v)
  if (v_low %in% c("missing","not provided","not applicable","na","n/a","none","unknown",
                   "unspecified","not collected","not determined","not available")) return(NA_character_)
  if (v == "") return(NA_character_)
  v
}

11.1.2 Prepare EHI metadata

# Prepare EHI metadata into a standardized schema
prep_ehi_metadata <- function(ehi_raw) {
  if (is.null(ehi_raw)) return(NULL)

  ehi_raw <- detect_and_rename_genome_size(ehi_raw)
  ehi_raw <- detect_and_rename_n50(ehi_raw)

  ehi_clean <- ehi_raw %>%
    mutate(
      GC = suppressWarnings(as.numeric(str_remove(GC, "%")))
    ) %>%
    dplyr::select(
      any_of(c(
        "ID","species","completeness","contamination",
        "genome_size","GC","N50","contigs",
        "host_species","host_order","host_class",
        "sample_type","isolation_source","country","locality",
        "mag_name","eha_number"
      ))
    ) %>%
    mutate(
      isolation_source = coalesce(isolation_source, sample_type),
      source = "EHI"
    ) %>%
    dplyr::select(-sample_type)

  harmonize_schema(ehi_clean)
}

11.1.3 Prepare GTDB metadata

prep_gtdb_final <- function(gtdb_raw) {
  if (is.null(gtdb_raw)) return(NULL)

  # Standardize size & N50 labels
  gtdb_raw <- detect_and_rename_genome_size(gtdb_raw)
  gtdb_raw <- detect_and_rename_n50(gtdb_raw)

  gtdb_clean <- gtdb_raw %>%
    # Pick only columns we may need (robust to presence/absence)
    dplyr::select(dplyr::any_of(c(
      "ID","accession","gtdb_taxonomy","gtdb_representative",
      "checkm2_completeness","checkm2_contamination",
      "genome_size","gc_percentage","N50","n50_contigs",
      "contigs","contig_count",
      "ncbi_biosample","country","geo_loc_name",
      "isolation_source","isolation_source.x","isolation_source_x",
      "isolation_source.y","isolation_source_y",
      "host","host_status","disease","diagnosis","host_summary",
      # latitude/longitude may appear with ENA-style names; keep both
      "latitude","longitude","geographic location (latitude)","geographic location (longitude)",
      # environment / context variants
      "env_broad","env_broad_scale","env_medium","env_local","env_local_scale",
      "env_local_context","local environmental context",
      # names with spaces/case variants from ENA
      "common_name","common name","sample_name","external_id","External Id","submitter_id","Submitter Id",
      "collection_date"
    ))) %>%
    # Normalize N50/contigs if only alt names exist
    { if (!"N50" %in% names(.) && "n50_contigs" %in% names(.)) dplyr::rename(., N50 = n50_contigs) else . } %>%
    { if (!"contigs" %in% names(.) && "contig_count" %in% names(.)) dplyr::rename(., contigs = contig_count) else . } %>%
    # Conditional renames for ENA/GTDB variants (no-op if missing)
    { if ("env_broad_scale" %in% names(.)) dplyr::rename(., env_broad = `env_broad_scale`) else . } %>%
    { if ("env_local_scale" %in% names(.)) dplyr::rename(., env_local = `env_local_scale`) else . } %>%
    { if ("local environmental context" %in% names(.)) dplyr::rename(., env_local_context = `local environmental context`) else . } %>%
    { if ("common name" %in% names(.)) dplyr::rename(., common_name = `common name`) else . } %>%
    { if ("External Id" %in% names(.)) dplyr::rename(., external_id = `External Id`) else . } %>%
    { if ("Submitter Id" %in% names(.)) dplyr::rename(., submitter_id = `Submitter Id`) else . } %>%
    # Add core source + numeric fields
    dplyr::mutate(
      source        = "GTDB",
      completeness  = .data$checkm2_completeness,
      contamination = .data$checkm2_contamination,
      GC            = .data$gc_percentage
    )

  #  Coalesce isolation_source variants safely (handle dot/underscore suffixes)
  iso_candidates <- c("isolation_source","isolation_source.x","isolation_source_x",
                      "isolation_source.y","isolation_source_y")
  present_iso <- intersect(iso_candidates, names(gtdb_clean))
  if (length(present_iso) > 0) {
    # progressive coalesce across present variants
    iso <- gtdb_clean[[present_iso[1]]]
    if (length(present_iso) > 1) {
      for (nm in present_iso[-1]) iso <- dplyr::coalesce(iso, gtdb_clean[[nm]])
    }
    gtdb_clean[["isolation_source"]] <- iso
  }

  # Prefer GTDB 'country', else ENA 'geo_loc_name'
  if ("geo_loc_name" %in% names(gtdb_clean)) {
    gtdb_clean[["country"]] <- dplyr::coalesce(gtdb_clean[["country"]], gtdb_clean[["geo_loc_name"]])
  }

  #  Drop helper/duplicate columns we normalized away
  gtdb_clean <- gtdb_clean %>%
    dplyr::select(-dplyr::any_of(c(
      "checkm2_completeness","checkm2_contamination","gc_percentage",
      "n50_contigs","contig_count","geo_loc_name",
      "isolation_source.x","isolation_source_x","isolation_source.y","isolation_source_y"
    )))

  # Final schema/type harmonization (handles lat/long ENA names, adds missing cols)
  gtdb_clean <- harmonize_schema(gtdb_clean)

  return(gtdb_clean)
}

11.1.4 Prepare NCBI metadata

# Prepare NCBI metadata into standardized schema
prep_ncbi_metadata <- function(ncbi_raw) {
  if (is.null(ncbi_raw)) return(NULL)

  ncbi_raw <- detect_and_rename_genome_size(ncbi_raw)
  ncbi_raw <- detect_and_rename_n50(ncbi_raw)

  ncbi_clean <- ncbi_raw %>%
    dplyr::select(
      dplyr::any_of(c(
        "accession","completeness","contamination",
        "genome_size","gc_percent","isolation_source","host",
        "collection_date","geo_loc_name","N50","contigs","host_disease","host_age"
      ))
    ) %>%
    dplyr::rename(
      ID = accession,
      country = geo_loc_name,
      GC = gc_percent
    ) %>%
    dplyr::mutate(
      source = "NCBI"
    )

  harmonize_schema(ncbi_clean)
}

11.1.5 Combine metadata sources into genome_metadata

# Combine EHI + GTDB(final) + NCBI into one unified table
combine_metadata_sources <- function(ehi_clean,
                                     gtdb_final_clean,
                                     ncbi_clean) {
  tables <- list(ehi_clean, gtdb_final_clean, ncbi_clean)
  tables <- tables[!vapply(tables, is.null, logical(1))]

  if (length(tables) == 0) {
    # No sources available: return an empty tibble 
    genome_metadata <- harmonize_schema(tibble::tibble())
  } else {
    # Ensure schema per table, then bind
    tables <- lapply(tables, harmonize_schema)
    genome_metadata <- dplyr::bind_rows(tables)
    # One more pass to guarantee presence of all expected columns
    genome_metadata <- harmonize_schema(genome_metadata)
  }

  # Safe numeric coercions in columns that exist
  genome_metadata <- genome_metadata %>%
  dplyr::mutate(
    completeness  = if ("completeness"   %in% names(.)) suppressWarnings(as.numeric(completeness))  else NA_real_,
    contamination = if ("contamination"  %in% names(.)) suppressWarnings(as.numeric(contamination)) else NA_real_,
    GC            = if ("GC"             %in% names(.)) suppressWarnings(as.numeric(GC))            else NA_real_,
    genome_size   = if ("genome_size"    %in% names(.)) suppressWarnings(as.numeric(genome_size))   else NA_real_,
    N50           = if ("N50"            %in% names(.)) suppressWarnings(as.numeric(N50))           else NA_real_,
    contigs       = if ("contigs"        %in% names(.)) suppressWarnings(as.numeric(contigs))       else NA_real_
  )

  # Reorder columns if present
  front_cols <- c(
    "ID","source","species","gtdb_taxonomy",
    "host_species","host_order","host_class",
    "isolation_source","host","host_summary",
    "country","locality",
    "completeness","contamination","genome_size","GC","N50","contigs",
    "collection_date","ncbi_biosample","mag_name","eha_number",
    "gtdb_representative","mimag_high_quality","mimag_medium_quality",
    "common_name","sample_name","external_id","submitter_id",
    "env_broad","env_medium","env_local","env_local_context"
  )
  front_cols <- intersect(front_cols, colnames(genome_metadata))
  genome_metadata <- genome_metadata %>%
    dplyr::relocate(dplyr::all_of(front_cols), .before = 1)

  if (nrow(genome_metadata) > 0 && all(is.na(genome_metadata$genome_size))) {
    warning("All genome_size values are NA after merging. Check size mapping.")
  }

  genome_metadata
}

11.1.6 Infer host metadata

normalize_species <- function(s) {
  # Vectorized normalization of binomials
  s <- as.character(s)
  s[is.na(s) | s == ""] <- NA_character_

  s_low <- stringr::str_to_lower(s)

  # Canonicalize known synonyms/typos
  out <- dplyr::case_when(
    # Dog
    stringr::str_detect(s_low, "^canis\\s+familiaris$") ~ "Canis lupus familiaris",
    stringr::str_detect(s_low, "^canis\\s+lupus\\s+familiaris$") ~ "Canis lupus familiaris",
    stringr::str_detect(s_low, "^mammuthus\\s+primi") ~ "Mammuthus primigenius",
    TRUE ~ s
  )

  # Title-case genus, lower species where we have at least two tokens
  reformat <- function(x) {
    if (is.na(x) || x == "") return(NA_character_)
    parts <- strsplit(x, "\\s+")[[1]]
    if (length(parts) < 2) return(x)
    paste0(stringr::str_to_title(parts[1]), " ", stringr::str_to_lower(parts[2]))
  }
  # Vectorize reformat over 'out'
  out[] <- vapply(out, reformat, character(1))
  out
}
#----------------------------------------------------------------------------------

# Accept only true-looking Latin binomials and reject common non-species nouns
is_latin_binomial_strict <- function(x) {
  x <- as.character(x)
  ok_shape <- grepl("^[A-Z][a-z]+\\s+[a-z]{3,}$", x)  # Genus species
  if (!any(ok_shape, na.rm = TRUE)) return(rep(FALSE, length(x)))

  bad_terms <- c(
    "human","patient","infant","child","adult",
    "feces","faeces","stool","urine","blood","saliva","swab","swabs",
    "skin","gut","intestinal","rumen","content","contents",
    "sample","metagenome","environment","environmental","tissue",
    "rectal","oral","nasal","throat","vaginal","anal"
  )

  reject <- rep(FALSE, length(x))
  ix <- which(ok_shape & !is.na(x))
  if (length(ix) > 0) {
    parts <- strsplit(x[ix], "\\s+")
    for (i in seq_along(ix)) {
      w <- tolower(parts[[i]])
      reject[ix[i]] <- any(w %in% bad_terms)
    }
  }
  ok_shape & !reject
}


#----------------------------------------------------------------
infer_host_metadata <- function(genome_metadata) {

  if (is.null(genome_metadata)) return(NULL)

  # Utility
  is_blank <- function(x) is.na(x) | x == ""
  
  tax_map <- tibble::tibble(
  host_species = c(
    # Humans / lab / livestock / pets
    "Homo sapiens",
    "Mus musculus",
    "Rattus norvegicus",
    "Sus scrofa",
    "Bos taurus",
    "Canis lupus familiaris",
    "Equus caballus",
    "Oryctolagus cuniculus",
    "Capra hircus",

    # Birds
    "Gallus gallus",
    "Cathartes aura",
    "Columba livia",
    "Spheniscus magellanicus",

    # Insects
    "Drosophila melanogaster",
    "Bombyx mori",

    # Fish
    "Danio rerio",
    "Oncorhynchus mykiss",
    "Oreochromis niloticus",

    # Marine mammals / pinnipeds
    "Balaenoptera acutorostrata",
    "Arctocephalus australis",

    # Extinct
    "Mammuthus primigenius",
    
    #More
    "Loxodonta africana",
    "Elephas maximus",
    "Felis catus",
    "Canis lupus",
    "Phascolarctos cinereus",
    
    "Grasshopper",
    "Termite",
    "Macaca fascicularis",
    "Rhinolophus capensis",
    "Fish"

  


  ),
  tax_host_order = c(
    # Humans / lab / livestock / pets
    "Primates",
    "Rodentia",
    "Rodentia",
    "Artiodactyla",
    "Artiodactyla",
    "Carnivora",
    "Perissodactyla",
    "Lagomorpha",
    "Artiodactyla",

    # Birds
    "Galliformes",
    "Accipitriformes",
    "Columbiformes",
    "Sphenisciformes",

    # Insects
    "Diptera",
    "Lepidoptera",

    # Fish
    "Cypriniformes",
    "Salmoniformes",
    "Cichliformes",

    # Marine mammals / pinnipeds
    "Artiodactyla",      # (Cetartiodactyla)
    "Carnivora",

    # Extinct
    "Proboscidea",
    
    #More
    "Proboscidea",
    "Proboscidea",
    "Carnivora",
    "Carnivora",
    "Diprotodontia",
    
    
    "Orthoptera",     # Grasshopper
    "Blattodea",      # Termite (termites now placed within Blattodea; Isoptera as infraorder)
    "Primates",       # Macaca fascicularis
    "Chiroptera",     # Rhinolophus capensis (horseshoe bat)
    NA_character_     # Fish (no single order fits all fish)



  ),
  tax_host_class = c(
    # Humans / lab / livestock / pets
    "Mammalia",
    "Mammalia",
    "Mammalia",
    "Mammalia",
    "Mammalia",
    "Mammalia",
    "Mammalia",
    "Mammalia",
    "Mammalia",

    # Birds
    "Aves",
    "Aves",
    "Aves",
    "Aves",

    # Insects
    "Insecta",
    "Insecta",

    # Fish
    "Actinopterygii",
    "Actinopterygii",
    "Actinopterygii",

    # Marine mammals / pinnipeds
    "Mammalia",
    "Mammalia",

    # Extinct and more
    "Mammalia",
    "Mammalia",
    "Mammalia",
    "Mammalia",
    "Mammalia",
    "Mammalia",
    
        
    "Insecta",        # Grasshopper
    "Insecta",        # Termite
    "Mammalia",       # Macaca fascicularis
    "Mammalia",       # Rhinolophus capensis
    "Actinopterygii"  # Fish (bony fishes; safe default for generic "fish")



  )
)
  

  # --- 1) Unified host text for pattern matching ---
  genome_metadata <- genome_metadata %>%
    dplyr::mutate(
      host_text = stringr::str_to_lower(
        stringr::str_squish(
          paste(
            dplyr::coalesce(.data$host_species, ""),
            dplyr::coalesce(.data$host, ""),
            dplyr::coalesce(.data$isolation_source, ""),
            dplyr::coalesce(.data$env_broad, ""),
            dplyr::coalesce(.data$env_local_context, ""),
            sep = " ; "
          )
        )
      )
    )

  # --- 2) Common-name / phrase heuristics (vectorized) ---
  guess_from_common <- function(txt) {
    dplyr::case_when(
      # Exact species mentions / synonyms
      stringr::str_detect(txt, "\\barctocephalus australis\\b|\\bfur seal\\b|south american fur seal") ~ "Arctocephalus australis",
      stringr::str_detect(txt, "\\bbalaenoptera acutorostrata\\b|\\bminke whale\\b") ~ "Balaenoptera acutorostrata",
      stringr::str_detect(txt, "\\bspheniscus magellanicus\\b|\\bmagellanic penguin\\b|\\bpenguin\\b") ~ "Spheniscus magellanicus",

      
 #Elephants
    stringr::str_detect(txt, "\\basian elephant\\b") ~ "Elephas maximus",
    stringr::str_detect(txt, "\\bafrican elephant\\b") ~ "Loxodonta africana",
    stringr::str_detect(txt, "\\belephants?\\b") ~ "Loxodonta africana",  # default choice
    #cats
    stringr::str_detect(txt, "\\b(cat|cats|kitten|kittens|feline|house\\s*cat)\\b") ~ "Felis catus",
    stringr::str_detect(txt, "\\bfelis\\s+catus\\b") ~ "Felis catus",

    # dogs 
    stringr::str_detect(txt, "\\b(dog|dogs|puppy|puppies|canine|pet\\s*dog)\\b") ~ "Canis lupus familiaris",
    stringr::str_detect(txt, "\\bcanis\\s+lupus\\s+familiaris\\b|\\bcanis\\s+familiaris\\b") ~ "Canis lupus familiaris",

    # gray wolf (wild Canis lupus) ---
    stringr::str_detect(txt, "\\bcanis\\s+lupus\\b|\\bgray\\s*wolf\\b|\\bgrey\\s*wolf\\b|\\bwolf\\b") ~ "Canis lupus",
 
 #grasshopper
   stringr::str_detect(txt, "\\bgrasshopper\\b|\\blocust\\b|\\blocusts\\b") ~ "Grasshopper",

 
      stringr::str_detect(txt, "\\brabbit\\b|\\bbunny\\b|\\bbunnies\\b|\\blagomorph\\b") ~ "Oryctolagus cuniculus",
      stringr::str_detect(txt, "\\bpigeon\\b|\\bdove\\b") ~ "Columba livia",
      stringr::str_detect(txt, "\\bequine\\b|\\bhorse\\b|\\bfoal\\b") ~ "Equus caballus",
      stringr::str_detect(txt, "\\bcattle\\b|\\bbovine\\b|\\bcow\\b|\\bcows\\b") ~ "Bos taurus",
      stringr::str_detect(txt, "\\bgoat\\b|\\bcaprine\\b|\\bkid\\b") ~ "Capra hircus",
      stringr::str_detect(txt, "\\bpiglet\\b|\\bpiglets\\b|\\bpig\\b|\\bswine\\b|\\bporcine\\b") ~ "Sus scrofa",
      stringr::str_detect(txt, "\\bmammoth\\b|malolyakovskiy\\s+mammoth") ~ "Mammuthus primigenius",

      # Existing helpful ones
      stringr::str_detect(txt, "homo sapiens|\\bhuman\\b|patient|adult|child|infant") ~ "Homo sapiens",
      stringr::str_detect(txt, "mus musculus|\\bmouse\\b|\\bmice\\b|murine") ~ "Mus musculus",
      stringr::str_detect(txt, "\\brattus|\\brat\\b") ~ "Rattus norvegicus",
      stringr::str_detect(txt, "sus scrofa|porcine|\\bpig\\b|swine") ~ "Sus scrofa",
      stringr::str_detect(txt, "bos taurus|bovine|\\bcow\\b|cattle") ~ "Bos taurus",
      stringr::str_detect(txt, "canis lupus familiaris|canis familiaris|\\bdog\\b|canine") ~ "Canis lupus familiaris",
      stringr::str_detect(txt, "gallus gallus|\\bchicken\\b|poultry") ~ "Gallus gallus",
      stringr::str_detect(txt, "bombyx mori|silkworm") ~ "Bombyx mori",
      stringr::str_detect(txt, "drosophila|fruit fly") ~ "Drosophila melanogaster",
      stringr::str_detect(txt, "danio rerio|zebrafish") ~ "Danio rerio",
      stringr::str_detect(txt, "oncorhynchus mykiss") ~ "Oncorhynchus mykiss",
      stringr::str_detect(txt, "oreochromis niloticus") ~ "Oreochromis niloticus",
 
     stringr::str_detect(txt, "\\btermite(s)?\\b") ~ "Termite",
    
        # Macaca fascicularis (Latin or common names) ---
        stringr::str_detect(txt, "\\bmacaca\\s+fascicularis\\b|\\bcynomolgus\\b|\\bcrab[-]eating\\s+macaque\\b|\\bmacaque\\b") ~ "Macaca fascicularis",
    
        # Rhinolophus capensis (Latin or common name) ---
        stringr::str_detect(txt, "\\brhinolophus\\s+capensis\\b|\\brhinolopus\\s+capensis\\b|\\bcape\\s+horseshoe\\s+bat\\b|\\bhorseshoe\\s+bat\\b") ~ "Rhinolophus capensis",
    
        # Generic fish catch-all  ---
        stringr::str_detect(txt, "\\bfish\\b|\\bfishes\\b") ~ "Fish",
      TRUE ~ NA_character_
    )
  }

  genome_metadata <- genome_metadata %>%
    dplyr::mutate(.common_guess = guess_from_common(.data$host_text))

  # --- 3) Extract explicit Latin binomial (strict) as a later fallback ---
  extract_latin_binomial_strict <- function(x) {
    x <- x[!is.na(x)]
    if (length(x) == 0) return(NA_character_)
    txt <- paste(x, collapse = " ; ")
    txt <- stringr::str_replace_all(txt, "[^A-Za-z ]+", " ")
    txt <- stringr::str_squish(txt)
    matches <- unlist(stringr::str_extract_all(
      txt, "\\b[A-Z][a-z]+\\s+[a-z]{3,}\\b"
    ))
    if (length(matches) == 0) return(NA_character_)
    valid <- matches[is_latin_binomial_strict(matches)]
    if (length(valid) == 0) return(NA_character_)
    valid[1]
  }

  genome_metadata <- genome_metadata %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      .binom_guess = extract_latin_binomial_strict(c(
        as.character(.data$host_species),
        as.character(.data$host),
        as.character(.data$isolation_source),
        as.character(.data$env_broad),
        as.character(.data$env_local_context)
      ))
    ) %>%
    dplyr::ungroup()

  # --- 4) Decide host_species (trust valid binomial, then common-name, then binomial guess) ---
  bad_terms <- "(feces|faeces|stool|urine|blood|saliva|swab|swabs|skin|gut|intestinal|rumen|content|contents|sample|metagenome|environment|environmental|patient|infant|adult|child|rectal|oral|nasal|throat|vaginal|anal)"

  genome_metadata <- genome_metadata %>%
    dplyr::mutate(
      host_species = dplyr::if_else(
        is_latin_binomial_strict(.data$host_species) &
          !stringr::str_detect(stringr::str_to_lower(.data$host_species), bad_terms),
        normalize_species(.data$host_species),
        NA_character_
      ),
      host_species = dplyr::case_when(
        !is_blank(.data$host_species) ~ .data$host_species,
        !is_blank(.data$.common_guess) ~ normalize_species(.data$.common_guess),  # PRIORITIZE common-name mapping
        !is_blank(.data$.binom_guess)  ~ normalize_species(.data$.binom_guess),  # then extracted binomial
        TRUE ~ .data$host_species
      )
    )

  # --- 5) Join to fill order/class from dictionary ---
  genome_metadata <- genome_metadata %>%
    dplyr::left_join(tax_map, by = "host_species") %>%
    dplyr::mutate(
      host_order = dplyr::coalesce(.data$host_order, .data$tax_host_order),
      host_class = dplyr::coalesce(.data$host_class, .data$tax_host_class)
    ) %>%
    dplyr::select(-dplyr::any_of(c("tax_host_order","tax_host_class")))

  # --- 6) host_type ---
  genome_metadata <- genome_metadata %>%
    dplyr::mutate(
      host_type = dplyr::case_when(
        .data$host_species == "Homo sapiens" ~ "human",
        !is_blank(.data$host_species) ~ "animal",
        TRUE ~ NA_character_
      )
    )

  # Cleanup helpers
  genome_metadata <- genome_metadata %>%
    dplyr::select(-dplyr::any_of(c(".common_guess",".binom_guess","host_text")))

  return(genome_metadata)
}

11.1.7 Normalize country and infer continent

# Normalize country names and assign continents
normalize_country_fields <- function(genome_metadata) {
  if (is.null(genome_metadata)) return(NULL)

  # Be explicit about packages used
  # library(dplyr); library(stringr); library(countrycode)

  # 1) Simplify country strings like "Country: region"
  genome_metadata <- genome_metadata %>%
    dplyr::mutate(
      country_simple = stringr::str_trim(
        stringr::str_split_fixed(dplyr::coalesce(.data$country, ""), ":", 2)[, 1]
      ),
      country_simple = dplyr::na_if(.data$country_simple, "")
    )

  # 2) Normalization map for common ENA/NCBI variants (patterns in lower-case)
  country_map <- c(
    "usa" = "United States", "u\\.s\\." = "United States",
    "u\\.s\\.a\\." = "United States", "united states of america" = "United States",
    "uk" = "United Kingdom", "u\\.k\\." = "United Kingdom",
    "great britain" = "United Kingdom", "england" = "United Kingdom",
    "scotland" = "United Kingdom", "wales" = "United Kingdom",
    "northern ireland" = "United Kingdom",
    "korea,? republic of" = "South Korea", "republic of korea" = "South Korea",
    "south korea" = "South Korea", "korea,? \\(south\\)" = "South Korea",
    "pr china" = "China", "p\\.r\\. china" = "China",
    "russian federation" = "Russia", "czech republic" = "Czechia",
    "swaziland" = "Eswatini", "cote d['’]ivoire" = "Côte d’Ivoire",
    "democratic republic of the congo" = "DR Congo",
    "viet nam" = "Vietnam", "myanmar \\(burma\\)" = "Myanmar",
    "bolivia \\(plurinational state of\\)" = "Bolivia",
    "iran,? islamic republic of" = "Iran",
    "syrian arab republic" = "Syria",
    "moldova,? republic of" = "Moldova",
    "lao people'?s democratic republic" = "Laos",
    "macedonia,? the former yugoslav republic of" = "North Macedonia",
    "palestine,? state of" = "Palestine",
    "hong kong" = "Hong Kong",
    "macau|macao" = "Macau"
  )

  normalize_country <- function(x) {
    # Always return character
    y <- stringr::str_trim(tolower(dplyr::coalesce(as.character(x), "")))
    y[y == ""] <- NA_character_

    if (length(y)) {
      for (pat in names(country_map)) {
        repl <- country_map[[pat]]
        # Use ^...$ to only replace whole-string matches after lowercasing
        y <- ifelse(!is.na(y),
                    stringr::str_replace_all(y, paste0("^", pat, "$"), repl),
                    y)
      }
      # Minor tidy-ups
      y <- ifelse(!is.na(y), stringr::str_replace_all(y, "\\s+", " "), y)
      y <- ifelse(y == "people's republic of china", "China", y)

      # Title-case anything still not mapped
      y <- ifelse(!is.na(y), stringr::str_to_title(y), y)
    }
    return(as.character(y))
  }

  # 3) Produce normalized country + continent (force character vector into countrycode)
  genome_metadata <- genome_metadata %>%
    dplyr::mutate(
      country_normalized = normalize_country(.data$country_simple),
      continent = countrycode::countrycode(
        sourcevar    = as.character(.data$country_normalized),
        origin       = "country.name",
        destination  = "continent",
        warn         = FALSE
      )
    )

  return(genome_metadata)
}

11.1.8 Filter species that don’t belong and remove duplicate MAGs

# Remove non-species MAGs and deduplicate genomes by quality,
#  merge GCA/GCF duplicates (keep GCF; merge metadata).

filter_species_MAGs <- function(genome_metadata,
                                genome_annotations,
                                mags_to_remove,
                                match_on_version = TRUE) {
  if (is.null(genome_metadata) || is.null(genome_annotations)) {
    stop("Both genome_metadata and genome_annotations must be provided.")
  }

  # ---------------------------
  # Helpers
  # ---------------------------

  # Remove extensions and directories
  clean_label <- function(x) {
    x <- basename(x)
    sub("\\.(fna|fa|fasta)(\\.gz)?$", "", x, ignore.case = TRUE)
  }

  # Extract (GCA|GCF) and numeric core (e.g., 015667075.1) from an ID
  # Handles optional GB_/RS_ prefixes; returns NA when not matching.
  extract_accession_fields <- function(id_vec, match_on_version = TRUE) {
    id_vec <- clean_label(id_vec)
    core_re <- if (match_on_version) "(\\d+(?:\\.\\d+)?)" else "(\\d+)"
    pat <- paste0("^(?:GB_|RS_)?(GC[AF])_", core_re, "$")

    matched <- grepl(pat, id_vec, perl = TRUE)
    acc_type <- ifelse(matched, sub(pat, "\\1", id_vec, perl = TRUE), NA_character_)
    acc_core <- ifelse(matched, sub(pat, "\\2", id_vec, perl = TRUE), NA_character_)

    data.frame(acc_type = acc_type, acc_core = acc_core, stringsAsFactors = FALSE)
  }

  # First non-NA helper (after we normalize empty "" to NA for characters)
  first_non_na <- function(x) {
    idx <- which(!is.na(x))
    if (length(idx)) x[idx[1]] else NA
  }

  # Normalize empty strings to NA for character columns
  na_if_empty_chars <- function(df) {
    df %>%
      mutate(across(where(is.character), ~ na_if(., "")))
  }

  # ---------------------------
  # Standardize removal list
  # ---------------------------
  mags_to_remove_clean <- clean_label(mags_to_remove)

  # ---------------------------
  # 1) Preprocess genome_metadata
  # ---------------------------
  has_comp    <- "completeness"  %in% names(genome_metadata)
  has_cont    <- "contamination" %in% names(genome_metadata)
  has_contigs <- "contigs"       %in% names(genome_metadata)

  gm0 <- genome_metadata %>%
    mutate(ID_clean = clean_label(ID)) %>%
    # Make factors safe to merge
    mutate(across(where(is.factor), as.character)) %>%
    na_if_empty_chars()

  # Remove wrong-species MAGs
  gm0 <- gm0 %>%
    filter(!ID_clean %in% mags_to_remove_clean)

  # Add accession parse fields
  acc <- extract_accession_fields(gm0$ID_clean, match_on_version = match_on_version)
  gm1 <- gm0 %>%
    mutate(acc_type = acc$acc_type,
           acc_core = acc$acc_core)

  # Split into accessioned (GCA/GCF) and other IDs
  gm_acc  <- gm1 %>% filter(!is.na(acc_core))
  gm_other <- gm1 %>% filter(is.na(acc_core))

  # For ordering by quality
  gm_acc <- gm_acc %>%
    mutate(
      completeness  = if (has_comp)    suppressWarnings(as.numeric(completeness))  else NA_real_,
      contamination = if (has_cont)    suppressWarnings(as.numeric(contamination)) else NA_real_,
      contigs       = if (has_contigs) suppressWarnings(as.numeric(contigs))       else NA_real_
    )

  # Collapse GCA/GCF duplicates by acc_core:
  # - Prefer GCF over GCA if present
  # - Within type, prefer higher completeness, lower contamination, fewer contigs
  # - Per column, take first non-NA after ordering (merges metadata)
  gm_acc_collapsed <- gm_acc %>%
    group_by(acc_core) %>%
    group_modify(function(df, key) {
      # Prefer GCF if present
      prefer_gcf <- any(df$acc_type == "GCF")
      pref_rank <- if (prefer_gcf) ifelse(df$acc_type == "GCF", 0L, 1L) else 0L

      # Order rows by (preference, quality)
      ord <- order(
        pref_rank,
        -ifelse(is.na(df$completeness), -Inf, df$completeness),
        ifelse(is.na(df$contamination), Inf, df$contamination),
        ifelse(is.na(df$contigs), Inf, df$contigs),
        na.last = TRUE
      )
      dfo <- df[ord, , drop = FALSE]

      # Summarize to one row by taking first non-NA per column
      # (ID will become the preferred row's ID; metadata merged)
      out <- dfo %>%
        summarise(across(everything(), first_non_na), .groups = "drop")

      out
    }) %>%
    ungroup()

  # For non-accessioned rows: fall back to your quality-based dedup by ID_clean
  if (nrow(gm_other) > 0) {
    gm_other <- gm_other %>%
      mutate(
        completeness  = if (has_comp)    suppressWarnings(as.numeric(completeness))  else NA_real_,
        contamination = if (has_cont)    suppressWarnings(as.numeric(contamination)) else NA_real_,
        contigs       = if (has_contigs) suppressWarnings(as.numeric(contigs))       else NA_real_
      ) %>%
      arrange(
        ID_clean,
        desc(if (has_comp) completeness else 0),
        (if (has_cont) contamination else 0),
        (if (has_contigs) contigs else Inf)
      ) %>%
      distinct(ID_clean, .keep_all = TRUE)
  }

  # Combine collapsed accessioned rows and deduped others
  genome_metadata_dedup <- bind_rows(gm_acc_collapsed, gm_other) %>%
    # Drop temp fields
    dplyr::select(-ID_clean, -acc_type, -acc_core)

  # ---------------------------
  # 2) Filter genome_annotations to match remaining genomes
  # ---------------------------

  ga0 <- genome_annotations %>%
    mutate(genome_clean = clean_label(genome))

  # Remove wrong-species MAGs
  genome_annotations_filtered <- ga0 %>%
    filter(!genome_clean %in% mags_to_remove_clean)

  # Keep only genomes still present in metadata (after GCF/GCA merge)
  valid_genomes <- clean_label(genome_metadata_dedup$ID)

  genome_annotations_filtered <- genome_annotations_filtered %>%
    filter(genome_clean %in% valid_genomes) %>%
    dplyr::select(-genome_clean)

  # ---------------------------
  # 3) Return cleaned objects
  # ---------------------------
  list(
    metadata    = genome_metadata_dedup,
    annotations = genome_annotations_filtered
  )
}

11.1.9 Process one species

process_one_species <- function(species, mags_to_remove = character(0)) {
  files <- load_species_files(species)

  ehi  <- prep_ehi_metadata(files$ehi_metadata)
  gtdb <- prep_gtdb_final(files$gtdb_final)
  ncbi <- prep_ncbi_metadata(files$ncbi_metadata)

  meta <- combine_metadata_sources(ehi, gtdb, ncbi) %>%
    infer_host_metadata() %>%
    normalize_country_fields()   

  if (is.null(files$contig_map) || is.null(files$gene_annotations)) {
    warning(sprintf(
      "[%s] Missing %s; returning metadata only.",
      species,
      if (is.null(files$contig_map) && is.null(files$gene_annotations)) "contig_map and gene_annotations"
      else if (is.null(files$contig_map)) "contig_map"
      else "gene_annotations"
    ))
    return(list(metadata = meta, annotations = NULL))
  }

  ann <- prep_gene_annotations_with_genome(
    ann_raw        = files$gene_annotations,
    contig_map_raw = files$contig_map
  )

  cleaned <- filter_species_MAGs(
    genome_metadata    = meta,
    genome_annotations = ann,
    mags_to_remove     = mags_to_remove
  )

  cleaned
}
# sp <- "lactococcus_lactis"
# res <- process_one_species(sp)
# 
# genome_metadata <- res$metadata
# 
# write_tsv(genome_metadata, "data/mags_metadata/lactococcus_lactis_FINAL_metadata.tsv")

11.2 DREP ANALYSIS

11.2.1 Recreating the drep tree

Checking for outliers

## Function to recreate the drep tree from the secondary alignment (FASTANI)

recreate_drep_tree <- function(ndb){
  genomes <- unique(c(ndb$reference, ndb$querry))

# Create empty matrix
ani_matrix <- matrix(0, nrow = length(genomes), ncol = length(genomes))
rownames(ani_matrix) <- genomes
colnames(ani_matrix) <- genomes

# Fill the matrix
for(i in 1:nrow(ndb)) {
  ref <- ndb$reference[i]
  qry <- ndb$querry[i]
  ani_val <- ndb$ani[i]
  
  ani_matrix[ref, qry] <- ani_val
  ani_matrix[qry, ref] <- ani_val  # Make symmetric
}

# Set diagonal to 100
diag(ani_matrix) <- 100

# Convert to distance matrix (for tree building)
dist_matrix <- as.dist(1 - ani_matrix)

# Build tree
hc <- hclust(dist_matrix, method = "average") #drep uses hierarchical clustering
tree <- as.phylo(hc)

ggtree(tree) + 
  geom_tiplab(size = 2) +
  theme_tree2()
  
}
ndb <- read_csv("data/mags_metadata/phocaeicola_vulgatus_Ndb.csv") 
Rows: 651250 Columns: 5
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): reference, querry
dbl (3): ani, alignment_coverage, primary_cluster

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
recreate_drep_tree(ndb)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'

11.2.2 Drep final tree, matrix and plots

# helper: strip file extensions so IDs match metadata
clean_label <- function(x) {
  x <- basename(x)
  sub("\\.(fna|fa|fasta)(\\.gz)?$", "", x, ignore.case = TRUE)
}


##### Drep tree metadata ---------------------------------------------------------------------------------
drep_tree_metadata <- function(genome_metadata = genome_metadata, ndb =ndb, drop_ids = character(0)){
  metadata <- genome_metadata
  id_col   <- "ID" 
  
  fastani <- ndb %>%
  dplyr::select(reference, querry, ani) %>%
  dplyr::filter(!is.na(reference), !is.na(querry), reference != querry)

  # Clean IDs to remove extensions
  fastani <- fastani %>%
    dplyr::mutate(
      reference = clean_label(reference),
      querry     = clean_label(querry)
    ) %>%
    dplyr::filter(reference != querry)
  
  # Remove duplicate pairs (keep the max ANI per pair)
  fastani <- fastani %>%
    dplyr::group_by(reference, querry) %>%
    dplyr::summarise(ani = max(ani), .groups = "drop")

  # Optionally drop genomes (IDs without suffix)
  if (length(drop_ids) > 0) {
    fastani <- fastani %>%
      dplyr::filter(!reference %in% drop_ids, !querry %in% drop_ids)
  }
  
  # Build a symmetric ANI matrix with diag = 1 (dedup-safe)
    genomes <- sort(unique(c(fastani$reference, fastani$querry)))
  
    both <- dplyr::bind_rows(
      fastani %>% dplyr::transmute(reference, querry, ani),
      fastani %>% dplyr::transmute(reference = querry, querry = reference, ani)
    ) %>%
      dplyr::distinct(reference, querry, .keep_all = TRUE)
  
    ani_mat <- both %>%
      tidyr::complete(reference = genomes, querry = genomes) %>%
      tidyr::pivot_wider(
        names_from  = querry,
        values_from = ani,
        values_fn   = max  # <-- ensure no duplicate cells cause errors
      ) %>%
      tibble::column_to_rownames("reference") %>%
      as.matrix()
  
    diag(ani_mat) <- 1

    
    # Build the tree (UPGMA/average as in dRep)
      dist_mat <- 1 - ani_mat
      hc <- hclust(as.dist(dist_mat), method = "average")
      tree <- ape::as.phylo(hc)

    # De-duplicate metadata by ID and clean ID if needed
      metadata_dedup <- metadata %>%
      dplyr::mutate(!!id_col := clean_label(.data[[id_col]])) %>%  # safe if already clean
      dplyr::distinct(.data[[id_col]], .keep_all = TRUE)
    
    # Add metadata to tips (tree labels are already clean IDs)
       stopifnot(id_col %in% names(metadata_dedup))
      
        tip_df <- tibble::tibble(label = tree$tip.label) %>%
        dplyr::left_join(metadata_dedup, by = setNames(id_col, "label")) %>%
        dplyr::mutate(label_clean = label) 
        
        
       return(list(
      tree = tree,
      tip_df = tip_df,
      ani_matrix = ani_mat
    ))
  
}

  

####-----------------------------------------------------------------------------------------

# Helper for ANI axis labels (unchanged)
ani_axis <- function(p, tree, show_threshold = NULL) {
  df <- ggtree::fortify(tree)
  max_x <- max(df$x[df$isTip])
  p <- p +
    scale_x_continuous(
      labels = function(x) round(100 * (1 - (max_x - x)), 1)
    ) +
    coord_cartesian(xlim = c(0, max_x + 0.005)) +
    labs(x = "Average Nucleotide Identity (ANI, %)") +
    theme_tree2()
  if (!is.null(show_threshold)) {
    thr_delta <- 1 - (show_threshold / 100)
    p <- p + geom_vline(xintercept = max_x - thr_delta,
                        linetype = "dashed", color = "red", size = 0.8)
  }
  p
}


### --------------PLOTS ----------------------------------------
# - BASIC PLOT --
plot_tree_basic <- function(tree, tip_df, color_by = NULL, label_tips = TRUE,
                            point_size = 2.5, show_threshold = NULL) {
  # ensure completeness numeric for continuous gradients
  if ("completeness" %in% names(tip_df)) {
    tip_df <- tip_df %>% dplyr::mutate(completeness = as.numeric(completeness))
  }

  p <- ggtree(tree, size = 0.8)
  p <- p %<+% tip_df

  if (!is.null(color_by) && color_by %in% names(tip_df)) {
    p <- p + geom_tippoint(aes(color = !!rlang::sym(color_by)), size = point_size)

    # Palette/scale logic
    if (identical(color_by, "source") && exists("source_colors")) {
      p <- p + scale_color_manual(values = source_colors, name = "Source", drop = FALSE)
    } else if (identical(color_by, "host_order") && exists("host_order_colors")) {
      p <- p + scale_color_manual(values = host_order_colors, name = "Host order", drop = FALSE)
    } else if (is.numeric(tip_df[[color_by]])) {
      p <- p + scale_color_gradient(low = "white", high = "#08306B",
                                    name = paste0(color_by, " (%)"))
    }
  } else {
    p <- p + geom_tippoint(size = point_size)
  }

  if (label_tips) {
    lab_col <- if ("label_clean" %in% names(tip_df)) "label_clean" else "label"
    p <- p + geom_tiplab(size = 3, hjust = -0.1, aes(label = .data[[lab_col]]))
  }

  ani_axis(p, tree, show_threshold = show_threshold)
}
sp <- "phocaeicola_vulgatus"
res <- process_one_species(sp)
Warning in process_one_species(sp): [phocaeicola_vulgatus] Missing gene_annotations; returning metadata only.
genome_metadata <- res$metadata
phocaeicola_results <- drep_tree_metadata(genome_metadata, ndb, drop_ids = c("GCA_040912025.1", "GCF_048453085.1", "GCA_048453085.1"))

tree <- phocaeicola_results$tree
tip_df <- phocaeicola_results$tip_df
ani_matrix <- phocaeicola_results$ani_matrix


p_host_order <- plot_tree_basic(tree, tip_df, color_by = "host_order",
                          label_tips = TRUE, show_threshold = 99.5)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'
p_host_order

11.2.3 PERMANOVA, PcoA

#-------------1) Filter out unkown host type ------------------
filter_metadata <- function(tip_df) {
  tip_df %>%
    dplyr::filter(!is.na(host_type) & host_type != "")
}

#-------------2) Subset and align (same order) ------------------
subset_and_align_ani <- function(ani_matrix, meta_df) {
  keep_ids <- intersect(rownames(ani_matrix), meta_df$label)
  
  ani_mat_f <- ani_matrix[keep_ids, keep_ids, drop = FALSE]
  
  meta_f <- meta_df %>%
    dplyr::filter(label %in% keep_ids) %>%
    dplyr::arrange(match(label, rownames(ani_mat_f)))
  
  stopifnot(identical(meta_f$label, rownames(ani_mat_f)))
  
  list(ani = ani_mat_f, meta = meta_f)
}

#-------------3) Run PERMANOVA ------------------
run_permanova <- function(dist_mat, meta_f) {
  set.seed(1)
  ad <- vegan::adonis2(dist_mat ~ host_type, data = meta_f, permutations = 999)
  bd <- vegan::betadisper(dist_mat, meta_f$host_type)
  list(adonis = ad, dispersion = anova(bd))
}

#-------------4) Run PCOA ------------------
run_pcoa <- function(dist_mat, meta_f) {
  pcoa_res <- ape::pcoa(dist_mat)
  data.frame(
    PC1 = pcoa_res$vectors[, 1],
    PC2 = pcoa_res$vectors[, 2],
    host_type = meta_f$host_type,
    host_order = meta_f$host_order
  )
}

#-------------5) Plots PCOA ------------------
plot_pcoa_host_type <- function(pcoa_df, host_type_colors) {
  ggplot2::ggplot(pcoa_df, ggplot2::aes(PC1, PC2, color = host_type)) +
    ggplot2::geom_point(size = 2, alpha = 0.9) +
    ggplot2::scale_color_manual(values = host_type_colors, name = "Host type") +
    ggplot2::theme_minimal(base_size = 12) +
    ggplot2::labs(title = "PCoA of ANI distances (filtered to known host_type)")
}

plot_pcoa_host_order <- function(pcoa_df, host_order_colors) {
  ggplot2::ggplot(pcoa_df, ggplot2::aes(PC1, PC2, color = host_order)) +
    ggplot2::geom_point(size = 2, alpha = 0.9) +
    ggplot2::scale_color_manual(values = host_order_colors, name = "Host order") +
    ggplot2::theme_minimal(base_size = 12) +
    ggplot2::labs(title = "PCoA of ANI distances")
}

#-------------5) Pairwise ANI ------------------
pairwise_ani <- function(ani_mat_f, meta_f) {
  ani_mat_f %>%
    as.data.frame() %>%
    tibble::rownames_to_column("ref") %>%
    tidyr::pivot_longer(-ref, names_to = "qry", values_to = "ani") %>%
    dplyr::filter(ref < qry) %>%  # unique pairs
    dplyr::mutate(
      ref_host = meta_f$host_type[match(ref, meta_f$label)],
      qry_host = meta_f$host_type[match(qry, meta_f$label)],
      pair_type = dplyr::case_when(
        ref_host == "human"  & qry_host == "human"  ~ "human–human",
        ref_host == "animal" & qry_host == "animal" ~ "animal–animal",
        TRUE ~ "animal–human"
      )
    )
}


#-------------6) Plot pairwise ANI boxplot ------------------
plot_pairwise_boxplot <- function(ani_long_f) {
  ggplot2::ggplot(ani_long_f, ggplot2::aes(pair_type, ani, fill = pair_type)) +
    ggplot2::geom_boxplot(outlier_size = 0.5, width = 0.7) +
    ggplot2::scale_fill_viridis_d(guide = "none") +
    ggplot2::theme_minimal(base_size = 12) +
    ggplot2::labs(x = NULL, y = "ANI", title = "Within- and between-group ANI (known host_type only)")
}


#-------------7) Wilcoxon tests ------------------
wilcox_within <- function(ani_long_f) {
  subset_within <- ani_long_f %>%
    dplyr::filter(pair_type %in% c("animal–animal", "human–human"))

  wt <- stats::wilcox.test(ani ~ pair_type, data = subset_within, exact = FALSE)

  summary_df <- subset_within %>%
    dplyr::group_by(pair_type) %>%
    dplyr::summarise(n = dplyr::n(), median_ani = stats::median(ani), .groups = "drop")

  list(test = wt, summary = summary_df)
}

wilcox_cross_vs_within <- function(ani_long_f) {
  # Compare cross-host to pooled within (animal–animal + human–human)
  df <- ani_long_f %>%
    dplyr::mutate(is_cross = ifelse(pair_type == "animal–human", "cross", "within"))

  wt <- stats::wilcox.test(ani ~ is_cross, data = df, exact = FALSE, alternative = "less")
  # 'less' tests if cross < within (lower ANI between hosts)

  summary_df <- df %>%
    dplyr::group_by(is_cross) %>%
    dplyr::summarise(n = dplyr::n(), median_ani = stats::median(ani), .groups = "drop")

  list(test = wt, summary = summary_df)
}



#----------------------------------------------------#

#-------------- DREP STATS ANALYSIS ------------------
# combines the functions above into a big one


drep_stat_analysis <- function(tip_df, ani_matrix, host_type_colors, host_order_colors) {
  tip_df_f <- tip_df %>%
    dplyr::filter(!is.na(host_type) & host_type != "")

  aligned <- subset_and_align_ani(ani_matrix, tip_df_f)
  ani_mat_f <- aligned$ani
  meta_f <- aligned$meta

  if (is.factor(meta_f$host_type)) meta_f$host_type <- droplevels(meta_f$host_type)

  dist_mat_f <- stats::as.dist(1 - ani_mat_f)

  permanova_results <- run_permanova(dist_mat_f, meta_f)

  pcoa_df <- run_pcoa(dist_mat_f, meta_f)
  pcoa_host_type_plot <- plot_pcoa_host_type(pcoa_df, host_type_colors)
  pcoa_host_order_plot <- plot_pcoa_host_order(pcoa_df, host_order_colors)

  ani_long_f <- pairwise_ani(ani_mat_f, meta_f)
  boxplot_pairs <- plot_pairwise_boxplot(ani_long_f)

  w_within <- wilcox_within(ani_long_f)
  w_cross  <- wilcox_cross_vs_within(ani_long_f)

  list(
    filtered_metadata = meta_f,
    ani = ani_mat_f,
    dist = dist_mat_f,
    permanova = permanova_results,
    pcoa = list(data = pcoa_df,
                by_host_type = pcoa_host_type_plot,
                by_host_order = pcoa_host_order_plot),
    pairwise = list(data = ani_long_f,
                    boxplot = boxplot_pairs,
                    wilcox_within = w_within,
                    wilcox_cross_vs_within = w_cross)
  )
}
res <- drep_stat_analysis(
  tip_df = tip_df,
  ani_matrix = ani_matrix,
  host_type_colors = host_type_colors,
  host_order_colors = host_order_colors
)
Warning in ggplot2::geom_boxplot(outlier_size = 0.5, width = 0.7): Ignoring unknown parameters: `outlier_size`
# Print stats
res$permanova$adonis
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999

vegan::adonis2(formula = dist_mat ~ host_type, data = meta_f, permutations = 999)
          Df SumOfSqs      R2      F Pr(>F)    
Model      1 0.000562 0.00922 7.4576  0.001 ***
Residual 801 0.060367 0.99078                  
Total    802 0.060929 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
res$permanova$dispersion
Analysis of Variance Table

Response: Distances
           Df     Sum Sq    Mean Sq F value Pr(>F)
Groups      1 0.00000405 4.0517e-06  1.1564 0.2825
Residuals 801 0.00280654 3.5038e-06               
res$pairwise$wilcox_within$test

    Wilcoxon rank sum test with continuity correction

data:  ani by pair_type
W = 298130877, p-value = 2.073e-05
alternative hypothesis: true location shift is not equal to 0
res$pairwise$wilcox_cross_vs_within$test

    Wilcoxon rank sum test with continuity correction

data:  ani by is_cross
W = 6415885806, p-value < 2.2e-16
alternative hypothesis: true location shift is less than 0
# Plot
print(res$pcoa$by_host_type)

print(res$pcoa$by_host_order)

print(res$pairwise$boxplot)

# Save plots:
# ggplot2::ggsave("pcoa_host_type.png", res$pcoa$by_host_type, width = 6, height = 5, dpi = 300)

11.3 FUNCTIONAL ANALYSIS: KEGG, AMR, VF, CAZY, DEFENSE, GIFTS

11.3.1 Create annotation matrices

#testing with hafnia
sp <- "citrobacter_braakii"
res <- process_one_species(sp)

hp <- res$metadata
genome_annotations <- res$annotations
genome_metadata <- hp



make_annotation_matrix <- function(genome_annotations, column) {
  genome_annotations %>%
    filter(!is.na(.data[[column]])) %>%
    count(genome, annotation = .data[[column]]) %>%
    tidyr::pivot_wider(
      names_from = annotation,
      values_from = n,
      values_fill = 0
    )
}


#testing
kegg_mat <- make_annotation_matrix(genome_annotations, "kegg")
amr_mat  <- make_annotation_matrix(genome_annotations, "resistance_target")
vf_mat   <- make_annotation_matrix(genome_annotations, "vf")

11.3.2 Run Fisher tests on proportions

run_fisher_tests <- function(mat, metadata, annotation_name = "feature") {
  # mat: wide matrix with columns: genome, <annotation columns...>
  # metadata: data frame with columns ID, host_type
  # annotation_name: name for the long column (e.g., "kegg", "amr")

  # checks
  if (!"genome" %in% names(mat)) {
    stop("Input 'mat' must contain a 'genome' column.")
  }
  if (!all(c("ID", "host_type") %in% names(metadata))) {
    stop("Input 'metadata' must contain columns 'ID' and 'host_type'.")
  }

  # Presence/absence conversion (keep genome as-is; non-NA and non-zero -> 1 else 0)
  pa <- mat %>%
    dplyr::mutate(
      dplyr::across(
        -genome,
        ~ as.integer(!is.na(.) & . != 0)
      )
    )

  # Wide -> long
  pa_long <- pa %>%
    tidyr::pivot_longer(
      cols = -genome,
      names_to = annotation_name,   # creates a column with this exact name
      values_to = "present"
    ) %>%
    dplyr::filter(present == 1L) %>%
    dplyr::select(genome, dplyr::all_of(annotation_name)) %>%  # keep genome + annotation
    dplyr::distinct()

  # Attach host_type
  pa_with_host <- pa_long %>%
    dplyr::left_join(
      metadata %>% dplyr::select(ID, host_type),
      by = c("genome" = "ID")
    )

  # Count how many MAGs present in each annotation by host_type
  pa_mag_counts <- pa_with_host %>%
    dplyr::group_by(host_type, .data[[annotation_name]]) %>%
    dplyr::summarise(n_mags = dplyr::n(), .groups = "drop")

  # Totals per host_type (denominator)
  total_mags_per_host_type <- metadata %>%
    dplyr::group_by(host_type) %>%
    dplyr::summarise(total_mags = dplyr::n_distinct(ID), .groups = "drop")

  # Proportions + absences
  pa_mag_proportions <- pa_mag_counts %>%
    dplyr::left_join(total_mags_per_host_type, by = "host_type") %>%
    dplyr::mutate(
      proportion = n_mags / total_mags,
      absent     = total_mags - n_mags
    )

  # Build the wide counts matrix (animal vs human)
  pa_matrix <- pa_mag_proportions %>%
    dplyr::select(dplyr::all_of(annotation_name), host_type, n_mags, absent) %>%
    tidyr::pivot_wider(
      names_from  = host_type,
      values_from = c(n_mags, absent),
      values_fill = 0
    )

  # Sanity check for expected host groups (animal/human)
  required_cols <- c(
    annotation_name,
    "n_mags_animal", "absent_animal",
    "n_mags_human",  "absent_human"
  )
  if (!all(required_cols %in% names(pa_matrix))) {
    stop(
      "Expected columns missing in pa_matrix: ",
      paste(setdiff(required_cols, names(pa_matrix)), collapse = ", "),
      "\nCheck that metadata$host_type contains 'animal' and 'human' and that both have genomes."
    )
  }

  # Fisher exact test per row (2x2)
  results <- pa_matrix %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      ft = list(stats::fisher.test(
        matrix(
          c(n_mags_animal, absent_animal,
            n_mags_human,  absent_human),
          nrow = 2, byrow = TRUE
        )
      )),
      p_value    = ft$p.value,
      odds_ratio = unname(ft$estimate),
      log2_or = log2(odds_ratio),
      conf_low   = ft$conf.int[1],
      conf_high  = ft$conf.int[2]
    ) %>%
    dplyr::ungroup() %>%
    dplyr::select(-ft) %>%
    dplyr::mutate(
      p_adj       = stats::p.adjust(p_value, method = "BH"),
      prop_animal = n_mags_animal / (n_mags_animal + absent_animal),
      prop_human  = n_mags_human  / (n_mags_human  + absent_human),
      diff_prop   = prop_human - prop_animal
    )

  return(results)
}
kegg_fisher <- run_fisher_tests(kegg_mat, genome_metadata, annotation_name="kegg")
amr_fisher <- run_fisher_tests(amr_mat, genome_metadata, annotation_name="amr")

11.3.3 Volcano plot

plot_volcano <- function(df, feature_col) {
  df %>%
    mutate(
      p_adj_capped = pmax(p_adj, 1e-300),
      nl10 = -log10(p_adj_capped),
      sig = p_adj < 0.05
    ) %>%
    ggplot(aes(x = log2_or, y = nl10)) +
    geom_point(aes(color = sig), size = 2) +
    scale_color_manual(values = c("grey70", "blue")) +
    geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
    ggrepel::geom_text_repel(
      data = . %>% filter(sig) %>% slice_max(nl10, n = 10),
      aes(label = .data[[feature_col]])
    ) +
    theme_minimal()
}

plot_volcano(kegg_fisher, "kegg")

plot_volcano(amr_fisher, "amr")

run_pca <- function(mat) {
  m <- mat %>% column_to_rownames("genome")
  m <- m / rowSums(m)
  m <- m[, apply(m, 2, sd) > 0]
  
  prcomp(m, scale. = TRUE)
}

pca_kegg <- run_pca(kegg_mat)
pca_amr  <- run_pca(amr_mat)

11.3.4 PERMANOVA on presence/absence matrix´

To check if the groups (animal vs human-sourced MAGs) have differences overall.

pa_permanova <- function(mat, metadata) {
  mat <- mat %>%
  column_to_rownames("genome")

  pa <- (mat > 0) * 1

  
  # remove zero-variance KOs
  pa_nz <- pa[, colSums(pa) > 0 & colSums(pa) < nrow(pa)]
  
  # Determine the common genomes
  common_ids <- base::intersect(rownames(pa_nz), metadata$ID)
  
  # Report what will be kept/dropped
  message("# common: ", length(common_ids))
  message("# in KEGG only: ", length(setdiff(rownames(pa_nz), metadata$ID)))
  message("# in metadata only: ", length(setdiff(metadata$ID, rownames(pa_nz))))
  
  # Subset to the intersection (and keep order identical)
  pa_nz <- pa_nz[common_ids, , drop = FALSE]
  
  meta <- metadata %>%
    dplyr::filter(ID %in% common_ids) %>%
    dplyr::distinct(ID, .keep_all = TRUE) %>%
    tibble::column_to_rownames("ID") %>%
    .[common_ids, , drop = FALSE]
  
  stopifnot(identical(rownames(meta), rownames(pa_nz)))
  
  # Prepare variables for PERMANOVA
  required_vars <- c("genome_size", "completeness", "host_type")
  
  # Coerce types as needed
  meta <- meta %>%
    dplyr::mutate(
      genome_size  = as.numeric(genome_size),
      completeness = as.numeric(completeness),
      host_type    = as.factor(host_type)
    )
  
  # Align on complete cases (adonis2 drops NAs otherwise)
  ok <- stats::complete.cases(meta[, required_vars, drop = FALSE])
  pa_nz <- pa_nz[ok, , drop = FALSE]
  meta       <- meta[ok, , drop = FALSE]
  stopifnot(identical(rownames(meta), rownames(pa_nz)))
  
  # Distance, dispersion, PERMANOVA
  dist_pa <- vegan::vegdist(pa_nz, method = "jaccard", binary = TRUE)
  
  betadisp_results <- vegan::betadisper(dist_pa, meta$host_type)
  
  
 permanova_results <- vegan::adonis2(
    dist_pa ~ genome_size + completeness + host_type,
    data = meta,
    permutations = 999,
    by = "margin"
  )
 
 results <- list(
    dist           = dist_pa,
    betadisper     = betadisp_results,
    permanova      = permanova_results,
    pa_nz = pa_nz)
 
  return(results)
}

results <-pa_permanova(kegg_mat, genome_metadata)
# common: 30
# in KEGG only: 0
# in metadata only: 0
results$permanova
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999

vegan::adonis2(formula = dist_pa ~ genome_size + completeness + host_type, data = meta, permutations = 999, by = "margin")
             Df SumOfSqs      R2      F Pr(>F)  
genome_size   1  0.11642 0.05898 2.0238  0.039 *
completeness  1  0.20836 0.10555 3.6221  0.021 *
host_type     1  0.09819 0.04974 1.7070  0.063 .
Residual     26  1.49567 0.75769                
Total        29  1.97399 1.00000                
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

11.4 Define species list

# ALL SPECIES
species_list <- c(
  "lactococcus_lactis",
  "parabacteroides_goldsteinii",
  "enterococcus_faecalis",
  "bacteroides_uniformis",
  "phocaeicola_vulgatus",
  "hafnia_paralvei",
  "citrobacter_braakii",
  "enterococcus_hirae",
  "akkermansia_muciniphila",
  "bacteroides_fragilis"
)


mags_to_remove_list <- list(
  lactococcus_lactis = c(
    "GCA_018369575.1.fna","GCA_947063445.1.fna","GCA_947101685.1.fna",
    "GCA_948698275.1.fna","GCA_937910935.1.fna","GCA_947072755.1.fna",
    "GCA_948655095.1.fna","GCA_948703095.1.fna","GCA_947041925.1.fna",
    "GCA_947073355.1.fna","GCA_948675165.1.fna","GCA_948718815.1.fna"
  ),
  parabacteroides_goldsteinii = character(0), # none to remove
  enterococcus_faecalis = character(0),
  hafnia_paralvei = character(0),
  bacteroides_uniformis = character(0),
  phocaeicola_vulgatus = c("GCA_040912025.1", "GCF_048453085.1", "GCA_048453085.1"),
  citrobacter_braakii = character(0),
  enterococcus_hirae = character(0),
  akkermansia_muciniphila = character(0),
  bacteroides_fragilis = character(0)
)

# pg <- results[["parabacteroides_goldsteinii"]]$metadata
# eh <- results[["enterococcus_hirae"]]$metadata
# pv <- results[["phocaeicola_vulgatus"]]$metadata
# am <- results[["akkermansia_muciniphila"]]$metadata
# bf <- results[["bacteroides_fragilis"]]$metadata
# bu <- results[["bacteroides_uniformis"]]$metadata
# ll <- results[["lactococcus_lactis"]]$metadata
 #hp <- results[["hafnia_paralvei"]]$metadata
species_list <- c(
  "citronacter_braakii",
  "hafnia_paralvei"
)

mags_to_remove_list <- list(
  citronacter_braakii = character(0),
  hafnia_paralvei = character(0)
)


# ---- Run for all species ----
results <- setNames(vector("list", length(species_list)), species_list)

for (sp in species_list) {
  results[[sp]] <- process_one_species(
    species        = sp,
    mags_to_remove = mags_to_remove_list[[sp]]
  )
}
Warning in process_one_species(species = sp, mags_to_remove = mags_to_remove_list[[sp]]): [citronacter_braakii]
Missing contig_map and gene_annotations; returning metadata only.
# Examples:
# results[["lactococcus_lactis"]]$metadata
# results[["lactococcus_lactis"]]$annotations

11.5 Testing functions with one species

sp <- "citrobacter_braakii"
res <- process_one_species(sp)

genome_metadata <- res$metadata
gene_annotations <- res$annotations

ndb <- read_csv("data/mags_metadata/citrobacter_braakii_Ndb.csv") #include in process one species
Rows: 1681 Columns: 5
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): reference, querry
dbl (3): ani, alignment_coverage, primary_cluster

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
recreate_drep_tree(ndb)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'

drep_results <- drep_tree_metadata(genome_metadata, ndb)

tree <- drep_results$tree
tip_df <- drep_results$tip_df
ani_matrix <- drep_results$ani_matrix


p_host_order <- plot_tree_basic(tree, tip_df, color_by = "host_order",
                          label_tips = TRUE, show_threshold = 99.5)
Warning in fortify(data, ...): Arguments in `...` must be used.
✖ Problematic arguments:
• as.Date = as.Date
• yscale_mapping = yscale_mapping
• hang = hang
• size = 0.8
ℹ Did you misspell an argument name?
Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
Also defined by 'tidytree'
p_host_order

# Build matrices
kegg_mat <- make_annotation_matrix(genome_annotations, "kegg")
amr_mat  <- make_annotation_matrix(genome_annotations, "resistance_type")

# Fisher + volcano
kegg_fisher <- run_fisher_tests(kegg_mat, genome_metadata, "kegg")
amr_fisher  <- run_fisher_tests(amr_mat, genome_metadata, "amr")

plot_volcano(kegg_fisher, "kegg")

plot_volcano(amr_fisher, "amr")

# PCA
pca_kegg <- run_pca(kegg_mat)
pca_amr  <- run_pca(amr_mat)

# PERMANOVA
results <- pa_permanova(kegg_mat, genome_metadata)
# common: 30
# in KEGG only: 0
# in metadata only: 0
results$dist
                  EHM013856   EHM016687   EHM018416   EHM019588   EHM025899   EHM034508   EHM036539   EHM042977
EHM016687       0.322901849                                                                                    
EHM018416       0.307922272 0.167241379                                                                        
EHM019588       0.257142857 0.250759878 0.247619048                                                            
EHM025899       0.398097826 0.343283582 0.328616352 0.343884892                                                
EHM034508       0.573643411 0.527738265 0.561863173 0.528043776 0.574380165                                    
EHM036539       0.491612903 0.441926346 0.473913043 0.441256831 0.491083676 0.132969035                        
EHM042977       0.347119645 0.226351351 0.089523810 0.300623053 0.361934477 0.595100865 0.507890961            
EHM043923       0.293948127 0.210272873 0.202020202 0.070116861 0.318250377 0.524822695 0.437057992 0.259075908
EHM050060       0.320781032 0.221003135 0.242326333 0.239520958 0.343065693 0.525730181 0.441828255 0.274478331
EHM055692       0.286501377 0.131955485 0.253869969 0.277620397 0.388203018 0.556430446 0.475130890 0.300304878
EHM066268       0.314084507 0.212361331 0.227868852 0.257824143 0.318926975 0.529329609 0.442896936 0.277419355
EHM079210       0.617500000 0.635526316 0.649386085 0.605433376 0.668806162 0.660167131 0.571627260 0.663474692
EHM081080       0.256302521 0.255192878 0.257716049 0.225947522 0.359832636 0.539787798 0.457671958 0.288343558
EHM082134       0.415422886 0.441138422 0.441453567 0.386010363 0.466921120 0.426183844 0.337969402 0.475366178
EHM082624       0.283159463 0.244732577 0.232081911 0.241001565 0.368975904 0.574257426 0.486600846 0.254266212
GCF_014230105.1 0.323258870 0.263309353 0.291605302 0.311231394 0.366711773 0.546854942 0.467349552 0.337681159
GCF_018138525.1 0.277074543 0.272047833 0.288580247 0.255102041 0.336690647 0.545087483 0.459677419 0.321592649
GCF_030131705.1 0.259970458 0.187908497 0.143356643 0.165869219 0.312024353 0.507936508 0.420977011 0.212947189
GCF_030357945.1 0.335164835 0.268181818 0.276729560 0.266471449 0.341074020 0.552845528 0.470985155 0.318322981
GCF_036957665.1 0.380890052 0.380027739 0.377713459 0.364005413 0.450928382 0.623425693 0.545226131 0.401154401
GCF_046518655.1 0.379076087 0.296969697 0.279552716 0.326149425 0.195826645 0.555096419 0.469780220 0.311111111
GCF_046555535.1 0.295389049 0.237341772 0.236363636 0.250000000 0.301829268 0.540730337 0.455944056 0.269293924
GCF_046555695.1 0.296402878 0.238546603 0.237623762 0.251134644 0.302891933 0.541374474 0.456703911 0.270491803
GCF_046555735.1 0.333793103 0.250384025 0.258373206 0.297258297 0.382102273 0.532413793 0.444903581 0.311424100
GCF_046559175.1 0.325681492 0.231884058 0.221658206 0.276853253 0.338880484 0.575208914 0.493074792 0.240677966
GCF_047313245.1 0.347578348 0.257188498 0.254180602 0.277016743 0.311145511 0.536023055 0.449067432 0.318107667
GCF_050782755.1 0.329113924 0.220985692 0.179966044 0.276374443 0.366423358 0.563961486 0.478737997 0.192832765
GCF_054185985.1 0.394946809 0.355491329 0.356927711 0.348739496 0.423822715 0.619480519 0.542635659 0.378947368
GCF_054186025.1 0.364806867 0.313971743 0.305280528 0.325825826 0.382530120 0.584985836 0.503516174 0.340425532
                  EHM043923   EHM050060   EHM055692   EHM066268   EHM079210   EHM081080   EHM082134   EHM082624
EHM016687                                                                                                      
EHM018416                                                                                                      
EHM019588                                                                                                      
EHM025899                                                                                                      
EHM034508                                                                                                      
EHM036539                                                                                                      
EHM042977                                                                                                      
EHM043923                                                                                                      
EHM050060       0.218750000                                                                                    
EHM055692       0.284883721 0.278335725                                                                        
EHM066268       0.229687500 0.250379363 0.280979827                                                            
EHM079210       0.639686684 0.613577023 0.639506173 0.649167734                                                
EHM081080       0.226726727 0.261939219 0.261904762 0.259475219 0.616060226                                    
EHM082134       0.424479167 0.414948454 0.434032059 0.455808081 0.498673740 0.426980198                        
EHM082624       0.216393443 0.276827372 0.314243759 0.249602544 0.658698539 0.152733119 0.468140442            
GCF_014230105.1 0.307262570 0.279329609 0.271739130 0.250000000 0.641916168 0.284366577 0.455516014 0.328611898
GCF_018138525.1 0.254135338 0.273391813 0.297071130 0.247761194 0.644555695 0.278873239 0.440149626 0.268404908
GCF_030131705.1 0.127731092 0.191387560 0.268436578 0.188102894 0.616822430 0.211567732 0.398936170 0.190635452
GCF_030357945.1 0.244648318 0.292397661 0.322222222 0.261976048 0.654088050 0.311452514 0.464019851 0.303951368
GCF_036957665.1 0.390946502 0.394347241 0.360372340 0.381991814 0.643122677 0.376315789 0.487425150 0.424033149
GCF_046518655.1 0.304804805 0.315249267 0.348611111 0.272314675 0.636835279 0.326300985 0.431145431 0.335356601
GCF_046555535.1 0.223809524 0.244992296 0.303597122 0.185897436 0.624834875 0.241430700 0.431906615 0.232520325
GCF_046555695.1 0.225039620 0.246153846 0.304597701 0.187200000 0.625329815 0.242559524 0.432642487 0.233766234
GCF_046555735.1 0.284857571 0.254872564 0.313725490 0.225460123 0.635083227 0.288352273 0.437974684 0.294478528
GCF_046559175.1 0.257503949 0.280245023 0.284660767 0.264385692 0.656618611 0.267857143 0.460645161 0.264181524
GCF_047313245.1 0.243589744 0.296183206 0.322206096 0.277950311 0.651655629 0.296023564 0.433289300 0.305732484
GCF_050782755.1 0.249221184 0.242703533 0.291486291 0.220657277 0.645077720 0.270072993 0.451530612 0.261146497
GCF_054185985.1 0.357758621 0.400275103 0.385234899 0.367231638 0.665413534 0.394666667 0.490820073 0.406025825
GCF_054186025.1 0.311424100 0.362146051 0.363505747 0.327718224 0.670650730 0.354978355 0.479843953 0.369158879
                GCF_014230105.1 GCF_018138525.1 GCF_030131705.1 GCF_030357945.1 GCF_036957665.1 GCF_046518655.1
EHM016687                                                                                                      
EHM018416                                                                                                      
EHM019588                                                                                                      
EHM025899                                                                                                      
EHM034508                                                                                                      
EHM036539                                                                                                      
EHM042977                                                                                                      
EHM043923                                                                                                      
EHM050060                                                                                                      
EHM055692                                                                                                      
EHM066268                                                                                                      
EHM079210                                                                                                      
EHM081080                                                                                                      
EHM082134                                                                                                      
EHM082624                                                                                                      
GCF_014230105.1                                                                                                
GCF_018138525.1     0.276478680                                                                                
GCF_030131705.1     0.279600571     0.223076923                                                                
GCF_030357945.1     0.303693570     0.246312684     0.232198142                                                
GCF_036957665.1     0.333333333     0.402631579     0.379166667     0.421813403                                
GCF_046518655.1     0.331967213     0.309248555     0.261609907     0.323188406     0.394021739                
GCF_046555535.1     0.252525253     0.231707317     0.190243902     0.243491577     0.392318244     0.267281106
GCF_046555695.1     0.253602305     0.232876712     0.191558442     0.244648318     0.393150685     0.268404908
GCF_046555735.1     0.259154930     0.272594752     0.241112828     0.279001468     0.358024691     0.316788321
GCF_046559175.1     0.319268636     0.223602484     0.224919094     0.235569423     0.407713499     0.270606532
GCF_047313245.1     0.290647482     0.279878971     0.204283361     0.232704403     0.397489540     0.281493002
GCF_050782755.1     0.265335235     0.314037627     0.211200000     0.308370044     0.369806094     0.311475410
GCF_054185985.1     0.381389253     0.353268428     0.342565598     0.366806137     0.336578581     0.365957447
GCF_054186025.1     0.380359613     0.325859492     0.277688604     0.327794562     0.368876081     0.307931571
                GCF_046555535.1 GCF_046555695.1 GCF_046555735.1 GCF_046559175.1 GCF_047313245.1 GCF_050782755.1
EHM016687                                                                                                      
EHM018416                                                                                                      
EHM019588                                                                                                      
EHM025899                                                                                                      
EHM034508                                                                                                      
EHM036539                                                                                                      
EHM042977                                                                                                      
EHM043923                                                                                                      
EHM050060                                                                                                      
EHM055692                                                                                                      
EHM066268                                                                                                      
EHM079210                                                                                                      
EHM081080                                                                                                      
EHM082134                                                                                                      
EHM082624                                                                                                      
GCF_014230105.1                                                                                                
GCF_018138525.1                                                                                                
GCF_030131705.1                                                                                                
GCF_030357945.1                                                                                                
GCF_036957665.1                                                                                                
GCF_046518655.1                                                                                                
GCF_046555535.1                                                                                                
GCF_046555695.1     0.001785714                                                                                
GCF_046555735.1     0.222395023     0.223602484                                                                
GCF_046559175.1     0.222580645     0.223832528     0.252713178                                                
GCF_047313245.1     0.239549839     0.240770465     0.252730109     0.270833333                                
GCF_050782755.1     0.220285261     0.221518987     0.241960184     0.278382582     0.291925466                
GCF_054185985.1     0.366237482     0.367142857     0.351694915     0.336779911     0.322338831     0.377652051
GCF_054186025.1     0.312989045     0.314062500     0.316030534     0.298076923     0.273322422     0.344036697
                GCF_054185985.1
EHM016687                      
EHM018416                      
EHM019588                      
EHM025899                      
EHM034508                      
EHM036539                      
EHM042977                      
EHM043923                      
EHM050060                      
EHM055692                      
EHM066268                      
EHM079210                      
EHM081080                      
EHM082134                      
EHM082624                      
GCF_014230105.1                
GCF_018138525.1                
GCF_030131705.1                
GCF_030357945.1                
GCF_036957665.1                
GCF_046518655.1                
GCF_046555535.1                
GCF_046555695.1                
GCF_046555735.1                
GCF_046559175.1                
GCF_047313245.1                
GCF_050782755.1                
GCF_054185985.1                
GCF_054186025.1     0.256329114
results$permanova
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999

vegan::adonis2(formula = dist_pa ~ genome_size + completeness + host_type, data = meta, permutations = 999, by = "margin")
             Df SumOfSqs      R2      F Pr(>F)  
genome_size   1  0.11642 0.05898 2.0238  0.037 *
completeness  1  0.20836 0.10555 3.6221  0.022 *
host_type     1  0.09819 0.04974 1.7070  0.058 .
Residual     26  1.49567 0.75769                
Total        29  1.97399 1.00000                
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pcoa_res <- ape::pcoa(results$dist, correction = "cailliez")

pcoa_df <- data.frame(
  ID = rownames(results$pa_nz),
  PC1 = pcoa_res$vectors[, 1],
  PC2 = pcoa_res$vectors[, 2],
  host_type = genome_metadata$host_type,
  host_order = genome_metadata$host_order
)

var_exp <- round(100 * pcoa_res$values$Rel_corr_eig[1:2], 1)

pcoa_kegg_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = host_type)) +
  geom_point(size = 2) +
  scale_color_manual(values = host_type_colors, name = "Host type")+
  theme_minimal() +
  labs(
    title = "PCoA of KEGG presence/absence matrix across MAGs (colored by host type)",
    x = paste0("PC1 (",round(var_exp[1], 1), "%)"),
    y = paste0("PC2 (", round(var_exp[2], 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = host_order)) +
  geom_point(size = 2) +
  scale_color_manual(values = host_order_colors, name = "Host order")+
  theme_minimal() +
  labs(
    title = "PCoA of KEGG presence/absence matrix across MAGs (colored by host order)",
    x = paste0("PC1 (",round(var_exp[1], 1), "%)"),
    y = paste0("PC2 (", round(var_exp[2], 1), "%)")
  )