Chapter 4 NCBI MAGs exploration
4.1 NCBI MAGs metadata
gut_keywords <- c("ceaca", "caeca", "excrement", "excrements", "rectal", "feces",
"stool", "stools", "gut", "cloaca", "cloacal", "fecal", "intestines",
"intestine", "intestinal", "cecum", "ileum", "jejunum", "colon", "rectum", "rumen",
"anal", "anus", "manure", "faeces")
# Wrap each keyword in word boundaries: \bkeyword\b --otherwise artisanal passes the filter
pattern <- paste0("\\b(", paste(gut_keywords, collapse = "|"), ")\\b")
# Filter the data
#gut_samples <- metadata_wide %>%
# filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))Download metadata for each taxa
# Trying to download ncbi metadata for mags
conda activate drakkar_env
conda install -c conda-forge ncbi-datasets-cli
## CITROBACTER BRAAKII
datasets summary genome taxon 57706 \
--as-json-lines > genomes.jsonl
#filtering by contamination and completeness
jq -c '
select(
.checkm_info.completeness >= 90 and
.checkm_info.contamination <= 2.5
)
' genomes.jsonl > high_quality_genomes.jsonl
#put metadata in tsv
jq -r '
.accession as $asm
| .checkm_info.completeness as $comp
| .checkm_info.contamination as $cont
| .assembly_stats.total_sequence_length as $genome_size
| .assembly_stats.gc_percent as $gc
| .assembly_info.biosample.attributes[]?
| [$asm, $comp, $cont, $genome_size, $gc, .name, .value]
| @tsv
' high_quality_genomes.jsonl > citrobacter_biosample_attributes.tsv4.1.1 Citrobacter braakii
c_braki_metadata <- read_tsv("data/mags_metadata/citrobacter_braakii_biosample_attributes.tsv",
col_names = c(
"accession",
"completeness",
"contamination",
"genome_size",
"gc_percent",
"name",
"value"
)
)
wide <- c_braki_metadata %>%
pivot_wider(
id_cols = c(
accession,
completeness,
contamination,
genome_size,
gc_percent
),
names_from = name,
values_from = value
)
source_counts <- wide %>%
filter(!is.na(isolation_source)) %>%
dplyr::count(isolation_source, sort = TRUE)
source_counts
host_counts <- wide %>%
filter(!is.na(host)) %>%
dplyr::count(host, sort = TRUE)
host_counts
wide %>%
filter(host == "Homo sapiens")
citrobacter_gut <- wide %>%
filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))
citrobacter_gut %>%tt()
## HUMAN
summary_counts <- citrobacter_gut %>%
dplyr::summarize(
total_rows = n(),
human_samples = sum(
host == "Homo sapiens" |
stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)),
na.rm = TRUE
)
)
print(summary_counts)
## OTHER HOSTS
other_hosts <- citrobacter_gut %>%
dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
dplyr::count(host, sort = TRUE)
print(other_hosts)4.1.2 Lactococcus lactis
## LACTOCOCCUS LACTIS
datasets summary genome taxon 1358 \
--as-json-lines > lactococcus_genomes.jsonl
#filtering by contamination and completeness
jq -c '
select(
.checkm_info.completeness >= 90 and
.checkm_info.contamination <= 2.5
)
' lactococcus_genomes.jsonl > lactococcus_high_quality_genomes.jsonl
#put metadata in tsv
jq -r '
.accession as $asm
| .checkm_info.completeness as $comp
| .checkm_info.contamination as $cont
| .assembly_stats.total_sequence_length as $genome_size
| .assembly_stats.gc_percent as $gc
| .assembly_info.biosample.attributes[]?
| [$asm, $comp, $cont, $genome_size, $gc, .name, .value]
| @tsv
' lactococcus_high_quality_genomes.jsonl > lactococcus_biosample_attributes.tsvlactococcus_metadata <- read_tsv("data/mags_metadata/lactococcus_lactis_biosample_attributes.tsv", col_names = c("assembly", "attribute", "value"))
metadata_wide <- lactococcus_metadata %>%
filter(attribute != "assembly") %>% # assembly is in some attribute column, causing issues
pivot_wider(
id_cols = assembly,
names_from = attribute,
values_from = value,
values_fn = list(value = ~ paste(.x, collapse = "; "))
)
lactococcus_gut <- metadata_wide %>%
filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))
## HUMAN
summary_counts <- lactococcus_gut %>%
dplyr::summarize(
total_rows = n(),
human_samples = sum(
host == "Homo sapiens" |
stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)),
na.rm = TRUE
)
)
print(summary_counts)
## OTHER HOSTS
other_hosts <- lactococcus_gut %>%
dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
dplyr::count(host, sort = TRUE)
print(other_hosts)4.1.3 Hafnia paralvei
## HAFNIA PARALVEI
datasets summary genome taxon 546367 \
--as-json-lines > hafnia_genomes.jsonl
#filtering by contamination and completeness
jq -c '
select(
.checkm_info.completeness >= 90 and
.checkm_info.contamination <= 2.5
)
' hafnia_genomes.jsonl > hafnia_high_quality_genomes.jsonl
#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' hafnia_high_quality_genomes.jsonl > hafnia_biosample_attributes.tsvhafnia_metadata <- read_tsv("data/ncbi/hafnia_paralvei_biosample_attributes.tsv", col_names = c("assembly", "attribute", "value"))
metadata_wide <- hafnia_metadata %>%
pivot_wider(
id_cols = assembly,
names_from = attribute,
values_from = value,
values_fn = function(x) x[1]
)
hafnia_gut <- metadata_wide %>%
filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))
## HUMAN
summary_counts <- hafnia_gut %>%
dplyr::summarize(
total_rows = n(),
human_samples = sum(
host == "Homo sapiens" |
stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)),
na.rm = TRUE
)
)
print(summary_counts)
## OTHER HOSTS
other_hosts <- hafnia_gut %>%
dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
dplyr::count(host, sort = TRUE)
print(other_hosts)4.1.4 Enterococcus faecalis
## ENTEROCOCCUS FAECALIS
datasets summary genome taxon 1351 \
--as-json-lines > enterococcus_faecalis_genomes.jsonl
#filtering by contamination and completeness
jq -c '
select(
.checkm_info.completeness >= 90 and
.checkm_info.contamination <= 2.5
)
' enterococcus_faecalis_genomes.jsonl > enterococcus_faecalis_high_quality_genomes.jsonl
#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' enterococcus_faecalis_high_quality_genomes.jsonl > enterococcus_faecalis_biosample_attributes.tsventerococcus_faecalis_metadata <- read_tsv("data/ncbi/enterococcus_faecalis_biosample_attributes.tsv", col_names = c("assembly", "attribute", "value"))
metadata_wide <- enterococcus_faecalis_metadata %>%
pivot_wider(
id_cols = assembly,
names_from = attribute,
values_from = value,
values_fn = function(x) x[1]
)
enterococcus_faecalis_gut <- metadata_wide %>%
filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))
## HUMAN
summary_counts <- enterococcus_faecalis_gut %>%
dplyr::summarize(
total_rows = n(),
human_samples = sum(
host == "Homo sapiens" |
stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)),
na.rm = TRUE
)
)
print(summary_counts)
## OTHER HOSTS
other_hosts <- enterococcus_faecalis_gut %>%
dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
dplyr::count(host, sort = TRUE)
print(other_hosts)4.1.5 Enterococcus hirae
## ENTEROCOCCUS HIRAE
datasets summary genome taxon 1354 \
--as-json-lines > enterococcus_genomes.jsonl
#filtering by contamination and completeness
jq -c '
select(
.checkm_info.completeness >= 90 and
.checkm_info.contamination <= 2.5
)
' enterococcus_genomes.jsonl > enterococcus_high_quality_genomes.jsonl
#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' enterococcus_high_quality_genomes.jsonl > enterococcus_biosample_attributes.tsventerococcus_metadata <- read_tsv("data/ncbi/enterococcus_hirae_biosample_attributes.tsv", col_names = c("assembly", "attribute", "value"))
metadata_wide <- enterococcus_metadata %>%
pivot_wider(
id_cols = assembly,
names_from = attribute,
values_from = value,
values_fn = function(x) x[1]
)
enterococcus_gut <- metadata_wide %>%
filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))
## HUMAN
summary_counts <- enterococcus_gut %>%
dplyr::summarize(
total_rows = n(),
human_samples = sum(
host == "Homo sapiens" |
stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)),
na.rm = TRUE
)
)
print(summary_counts)
## OTHER HOSTS
other_hosts <- enterococcus_gut %>%
dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
dplyr::count(host, sort = TRUE)
print(other_hosts)4.1.6 Bacteroides uniformis
## BACTEROIDES UNIFORMIS
datasets summary genome taxon 820 \
--as-json-lines > b_uniformis_genomes.jsonl
#filtering by contamination and completeness
jq -c '
select(
.checkm_info.completeness >= 90 and
.checkm_info.contamination <= 2.5
)
' b_uniformis_genomes.jsonl > b_uniformis_high_quality_genomes.jsonl
#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' b_uniformis_high_quality_genomes.jsonl > b_uniformis_biosample_attributes.tsvb_uniformis_metadata <- read_tsv("data/ncbi/bacteroides_uniformis_biosample_attributes.tsv", col_names = c("assembly", "attribute", "value"))
metadata_wide <- b_uniformis_metadata %>%
filter(attribute != "assembly") %>%
pivot_wider(
id_cols = assembly,
names_from = attribute,
values_from = value,
values_fn = function(x) x[1]
)
b_uniformis_gut <- metadata_wide %>%
filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))
## HUMAN
summary_counts <- b_uniformis_gut %>%
dplyr::summarize(
total_rows = n(),
human_samples = sum(
host == "Homo sapiens" |
stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)),
na.rm = TRUE
)
)
print(summary_counts)
## OTHER HOSTS
other_hosts <- b_uniformis_gut %>%
dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
dplyr::count(host, sort = TRUE)
print(other_hosts)4.1.7 Phocaeicola vulgatus
## PHOCAEICOLA VULGATUS
datasets summary genome taxon 821 \
--as-json-lines > p_vulgatus_genomes.jsonl
#filtering by contamination and completeness
jq -c '
select(
.checkm_info.completeness >= 90 and
.checkm_info.contamination <= 2.5
)
' p_vulgatus_genomes.jsonl > p_vulgatus_high_quality_genomes.jsonl
#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' p_vulgatus_high_quality_genomes.jsonl > p_vulgatus_biosample_attributes.tsvp_vulgatus_metadata <- read_tsv("data/ncbi/p_vulgatus_biosample_attributes.tsv", col_names = c("assembly", "attribute", "value"))
metadata_wide <- p_vulgatus_metadata %>%
filter(attribute != "assembly") %>%
pivot_wider(
id_cols = assembly,
names_from = attribute,
values_from = value,
values_fn = function(x) x[1]
)
p_vulgatus_gut <- metadata_wide %>%
filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))
## HUMAN
summary_counts <- p_vulgatus_gut %>%
dplyr::summarize(
total_rows = n(),
human_samples = sum(
host == "Homo sapiens" |
stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)),
na.rm = TRUE
)
)
print(summary_counts)
## OTHER HOSTS
other_hosts <- p_vulgatus_gut %>%
dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
dplyr::count(host, sort = TRUE)
print(other_hosts)4.1.8 Parabacteroides goldsteinii
## PARABACTEROIDES GOLDSTEINII
datasets summary genome taxon 328812 \
--as-json-lines > p_goldsteinii_genomes.jsonl
#filtering by contamination and completeness
jq -c '
select(
.checkm_info.completeness >= 90 and
.checkm_info.contamination <= 2.5
)
' p_goldsteinii_genomes.jsonl > p_goldsteinii_high_quality_genomes.jsonl
#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' p_goldsteinii_high_quality_genomes.jsonl > p_goldsteinii_biosample_attributes.tsvp_goldsteinii_metadata <- read_tsv("data/ncbi/parabacteroides_goldsteinii_biosample_attributes.tsv", col_names = c("assembly", "attribute", "value"))
metadata_wide <- p_goldsteinii_metadata %>%
filter(attribute != "assembly") %>%
pivot_wider(
id_cols = assembly,
names_from = attribute,
values_from = value,
values_fn = function(x) x[1]
)
p_goldsteinii_gut <- metadata_wide %>%
filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))
## HUMAN
summary_counts <- p_goldsteinii_gut %>%
dplyr::summarize(
total_rows = n(),
human_samples = sum(
host == "Homo sapiens" |
stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)),
na.rm = TRUE
)
)
print(summary_counts)
## OTHER HOSTS
other_hosts <- p_goldsteinii_gut %>%
dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
dplyr::count(host, sort = TRUE)
print(other_hosts)4.1.9 Akkermansia municiphila
## AKKERMANSIA MUNICIPHILA
datasets summary genome taxon 239935 \
--as-json-lines > akkermansia_genomes.jsonl
#filtering by contamination and completeness
jq -c '
select(
.checkm_info.completeness >= 90 and
.checkm_info.contamination <= 2.5
)
' akkermansia_genomes.jsonl > akkermansia_high_quality_genomes.jsonl
#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' akkermansia_high_quality_genomes.jsonl > akkermansia_biosample_attributes.tsvakkermansia_metadata <- read_tsv("data/ncbi/akkermansia_muciniphila_biosample_attributes.tsv", col_names = c("assembly", "attribute", "value"))
metadata_wide <- akkermansia_metadata %>%
filter(attribute != "assembly") %>%
pivot_wider(
id_cols = assembly,
names_from = attribute,
values_from = value,
values_fn = function(x) x[1]
)
akkermansia_gut <- metadata_wide %>%
filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))
## HUMAN
summary_counts <- akkermansia_gut %>%
dplyr::summarize(
total_rows = n(),
human_samples = sum(
host == "Homo sapiens" |
stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)),
na.rm = TRUE
)
)
print(summary_counts)
## OTHER HOSTS
other_hosts <- akkermansia_gut %>%
dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
dplyr::count(host, sort = TRUE)
print(other_hosts)4.1.10 Bacteroides fragilis
## BACTEROIDES FRAGILIS
datasets summary genome taxon 817 \
--as-json-lines > b_fragilis_genomes.jsonl
#filtering by contamination and completeness
jq -c '
select(
.checkm_info.completeness >= 90 and
.checkm_info.contamination <= 2.5
)
' b_fragilis_genomes.jsonl > b_fragilis_high_quality_genomes.jsonl
#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' b_fragilis_high_quality_genomes.jsonl > b_fragilis_biosample_attributes.tsvb_fragilis_metadata <- read_tsv("data/ncbi/bacteroides_fragilis_biosample_attributes.tsv", col_names = c("assembly", "attribute", "value"))
metadata_wide <- b_fragilis_metadata %>%
filter(attribute != "assembly") %>%
pivot_wider(
id_cols = assembly,
names_from = attribute,
values_from = value,
values_fn = function(x) x[1]
)
b_fragilis_gut <- metadata_wide %>%
filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))
## HUMAN
summary_counts <- b_fragilis_gut %>%
dplyr::summarize(
total_rows = n(),
human_samples = sum(
host == "Homo sapiens" |
stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)),
na.rm = TRUE
)
)
print(summary_counts)
## OTHER HOSTS
other_hosts <- b_fragilis_gut %>%
dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
dplyr::count(host, sort = TRUE)
print(other_hosts)4.1.11 Providencia rettgeri
## PROVIDENCIA RETTGERI
datasets summary genome taxon 587 \
--as-json-lines > providencia_genomes.jsonl
#filtering by contamination and completeness
jq -c '
select(
.checkm_info.completeness >= 90 and
.checkm_info.contamination <= 2.5
)
' providencia_genomes.jsonl > providencia_high_quality_genomes.jsonl
#put metadata in tsv
jq -r '
.accession as $asm |
.assembly_info.biosample.attributes[]? |
[$asm, .name, .value] | @tsv
' providencia_high_quality_genomes.jsonl > providencia_biosample_attributes.tsvprovidencia_metadata <- read_tsv("data/ncbi/providencia_rettgeri_biosample_attributes.tsv", col_names = c("assembly", "attribute", "value"))
metadata_wide <- providencia_metadata %>%
filter(attribute != "assembly") %>%
pivot_wider(
id_cols = assembly,
names_from = attribute,
values_from = value,
values_fn = function(x) x[1]
)
providencia_gut <- metadata_wide %>%
filter(str_detect(isolation_source, regex(pattern, ignore_case = TRUE)))
## HUMAN
summary_counts <- providencia_gut %>%
dplyr::summarize(
total_rows = n(),
human_samples = sum(
host == "Homo sapiens" |
stringr::str_detect(isolation_source, stringr::regex("human", ignore_case = TRUE)),
na.rm = TRUE
)
)
print(summary_counts)
## OTHER HOSTS
other_hosts <- providencia_gut %>%
dplyr::filter(host != "Homo sapiens" | is.na(host)) %>%
dplyr::count(host, sort = TRUE)
print(other_hosts)species <- c(
"lactococcus_lactis",
"hafnia_paralvei",
"enterococcus_faecalis",
"enterococcus_hirae",
"bacteroides_uniformis",
"phocaeicola_vulgatus",
"parabacteroides_goldsteinii",
"akkermansia_muciniphila",
"bacteroides_fragilis",
"providencia_rettgeri",
"citrobacter_braakii"
)
for (sp in species) {
message("Processing ", sp)
infile <- file.path(
"data/mags_metadata",
paste0(sp, "_biosample_attributes.tsv")
)
metadata_long <- read_tsv(
infile,
col_names = c(
"accession",
"completeness",
"contamination",
"genome_size",
"gc_percent",
"name",
"value"
),
show_col_types = FALSE
)
metadata_wide <- metadata_long %>%
pivot_wider(
id_cols = c(
accession,
completeness,
contamination,
genome_size,
gc_percent
),
names_from = name,
values_from = value,
values_fn = ~ dplyr::first(na.omit(.x)),
names_repair = "unique"
)
## filter gut-associated genomes
if ("isolation_source" %in% colnames(metadata_wide)) {
gut_metadata <- metadata_wide %>%
filter(
str_detect(
isolation_source,
regex(pattern, ignore_case = TRUE)
)
)
} else {
warning(sp, " has no isolation_source column")
gut_metadata <- metadata_wide[0, ]
}
## save ONLY the gut-filtered metadata
saveRDS(
gut_metadata,
file.path(
"data/mags_metadata",
paste0(sp, "_ncbi_metadata.rds")
)
)
selected_accessions <- gut_metadata %>%
pull(accession) %>%
unique()
selected_accessions <- selected_accessions[!is.na(selected_accessions)]
writeLines(
selected_accessions,
file.path(
"data/mags_metadata",
paste0(sp, "_ncbi_selected_accessions.txt")
)
)
}4.2 Downloading the MAGs after filtering
Download the MAGs when you finish selecting them
### DOWNLOAD THE MAGS
datasets download genome accession --inputfile ../bacteroides_unifmormis_ncbi_selected_accessions.txt --include genome --filename hafnia_paralvei_selected_genomes.zip4.3 Data preparation
4.3.1 EHI MAGs
Rows: 8846 Columns: 52
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (36): ID, mag_name, link_to_assembly, AB_batch, eha_number, GTDB_version, GTDB_release, domain, phylum, ...
dbl (15): fastani_ani, closest_placement_ani, closest_placement_af, completeness, contamination, size, N50, ...
lgl (1): annotated
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
4.3.2 Prepare color scheme
AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.
ehi_mags_p <- ehi_mags %>%
mutate(phylum=str_remove_all(phylum, "p__"))
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
mutate(phylum=str_remove_all(phylum, "p__")) %>%
right_join(ehi_mags_p, by=join_by(phylum == phylum)) %>%
dplyr::select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
pull(colors, name=phylum)source_colors <- c(EHI = "#8BC63F", GTDB = "#2D522D", NCBI= "#20558A")
host_type_colors <- c(human = "#A90D00" , animal = "darkgreen")
host_order_colors <- c(
"Accipitriformes" = "#143AC2", # Aves
"Carnivora" = "#633C2C", # Mammalia
"Chiroptera" = "#1B2021", # Mammalia
"Primates" = "#CC0000", # Mammalia
"Rodentia" = "#FDA991", # Mammalia
"Diptera" = "#FFAE2B", # Insecta
"Isoptera" = "#E27500", # Insecta
"Lepidoptera" = "#C8591D", # Insecta
"Squamata" = "#1D921B" # Reptilia
)4.3.3 Sample metadata
Rows: 129 Columns: 45
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (28): ID, source, species, gtdb_taxonomy, host_species, host_order, host_class, isolation_source, host, ...
dbl (8): completeness, contamination, genome_size, GC, N50, contigs, latitude, longitude
lgl (9): host_summary, gtdb_representative, mimag_high_quality, mimag_medium_quality, host_status, disease,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Helper across the pipeline
is_blank <- function(x) is.na(x) | x == ""
# 1) Recreate host_text (as you had)
genome_metadata <- genome_metadata %>%
mutate(
host_text = str_to_lower(
str_squish(
paste(
coalesce(host_species, ""),
coalesce(host, ""),
coalesce(isolation_source, ""),
coalesce(env_broad, ""),
coalesce(env_local_context, ""),
sep = " ; "
)
)
)
)
# 2) Infer host_species ONLY when missing/empty — now with your new species
genome_metadata <- genome_metadata %>%
mutate(
host_species = dplyr::case_when(
# keep explicit host_species if present
!is_blank(host_species) ~ host_species,
# Human
str_detect(host_text, "homo sapiens|\\bhuman\\b|patient|adult|child|infant") ~ "Homo sapiens",
# Mouse
str_detect(host_text, "mus musculus|\\bmouse\\b|\\bmice\\b|murine") ~ "Mus musculus",
# Rat
str_detect(host_text, "rattus|\\brat\\b") ~ "Rattus norvegicus",
# Pig / swine
str_detect(host_text, "sus scrofa|porcine|\\bpig\\b|swine") ~ "Sus scrofa",
# Cow / bovine
str_detect(host_text, "bos taurus|bovine|\\bcow\\b|cattle") ~ "Bos taurus",
# Dog / canine (normalize variants)
str_detect(host_text, "canis lupus familiaris|canis familiaris|\\bdog\\b|canine") ~ "Canis lupus familiaris",
# Chicken / poultry
str_detect(host_text, "gallus gallus|\\bchicken\\b|poultry") ~ "Gallus gallus",
# Silkworm
str_detect(host_text, "bombyx mori|silkworm") ~ "Bombyx mori",
# Drosophila / fruit fly
str_detect(host_text, "drosophila|fruit fly") ~ "Drosophila melanogaster",
# Termite
str_detect(host_text, "\\btermite\\b") ~ "Termite",
# Zebrafish
str_detect(host_text, "danio rerio|zebrafish") ~ "Danio rerio",
# Generic fish (also catch “fishbut”-style glued tokens)
str_detect(host_text, "\\bfish\\w*\\b") ~ "Actinopterygii (fish)",
# Bats and other taxa (from your counts)
str_detect(host_text, "barbastella barbastellus") ~ "Barbastella barbastellus",
str_detect(host_text, "glauconycteris\\s*sp\\.?") ~ "Glauconycteris sp",
str_detect(host_text, "hipposideros ruber") ~ "Hipposideros ruber",
str_detect(host_text, "rhinolophus capensis") ~ "Rhinolophus capensis",
# Vulture
str_detect(host_text, "cathartes aura") ~ "Cathartes aura",
# Grasshopper
str_detect(host_text, "\\bgrasshopper\\b") ~ "Grasshopper",
# New primate + fish species from your list
str_detect(host_text, "macaca fascicularis") ~ "Macaca fascicularis",
str_detect(host_text, "oncorhynchus mykiss") ~ "Oncorhynchus mykiss",
str_detect(host_text, "oreochromis niloticus") ~ "Oreochromis niloticus",
TRUE ~ host_species
)
)
# 3) Normalize existing host_species text so class/order rules match reliably
genome_metadata <- genome_metadata %>%
mutate(
hs_lower = str_to_lower(host_species),
host_species = dplyr::case_when(
# canonicalize common variants
str_detect(hs_lower, "^canis(\\s+lupus)?\\s+familiaris$") ~ "Canis lupus familiaris",
str_detect(hs_lower, "^drosophila melanogaster$") ~ "Drosophila melanogaster",
str_detect(hs_lower, "^bombyx mori$") ~ "Bombyx mori",
str_detect(hs_lower, "^danio rerio$") ~ "Danio rerio",
str_detect(hs_lower, "^homo sapiens$") ~ "Homo sapiens",
# catch glued fish tokens (e.g., "fishbut", "fishsomething")
str_detect(hs_lower, "^fish\\w*$") ~ "Actinopterygii (fish)",
TRUE ~ host_species
)
) %>%
dplyr::select(-hs_lower)
# 4) Fill host_class / host_order ONLY when missing — extended with your species
genome_metadata <- genome_metadata %>%
mutate(
# CLASS
host_class = dplyr::case_when(
!is_blank(host_class) ~ host_class,
str_detect(coalesce(host_species, ""), "Homo sapiens|Mus musculus|Rattus norvegicus|Sus scrofa|Bos taurus|Canis lupus familiaris|Barbastella barbastellus|Glauconycteris sp|Hipposideros ruber|Rhinolophus capensis|Macaca fascicularis") ~ "Mammalia",
str_detect(coalesce(host_species, ""), "Gallus gallus|Cathartes aura") ~ "Aves",
str_detect(coalesce(host_species, ""), "Drosophila melanogaster|Bombyx mori|Termite|Grasshopper") ~ "Insecta",
str_detect(coalesce(host_species, ""), "Danio rerio|Oncorhynchus mykiss|Oreochromis niloticus|Actinopterygii") ~ "Actinopterygii",
TRUE ~ host_class
),
# ORDER
host_order = dplyr::case_when(
!is_blank(host_order) ~ host_order,
# Mammals
str_detect(coalesce(host_species, ""), "Homo sapiens|Macaca fascicularis") ~ "Primates",
str_detect(coalesce(host_species, ""), "Mus musculus|Rattus norvegicus") ~ "Rodentia",
str_detect(coalesce(host_species, ""), "Sus scrofa|Bos taurus") ~ "Artiodactyla",
str_detect(coalesce(host_species, ""), "Canis lupus familiaris") ~ "Carnivora",
str_detect(coalesce(host_species, ""), "Barbastella barbastellus|Glauconycteris sp|Hipposideros ruber|Rhinolophus capensis") ~ "Chiroptera",
# Birds
str_detect(coalesce(host_species, ""), "Gallus gallus") ~ "Galliformes",
# Choose one scheme and be consistent; using Accipitriformes here
str_detect(coalesce(host_species, ""), "Cathartes aura") ~ "Accipitriformes",
# Insects
str_detect(coalesce(host_species, ""), "Drosophila melanogaster") ~ "Diptera",
str_detect(coalesce(host_species, ""), "Bombyx mori") ~ "Lepidoptera",
str_detect(coalesce(host_species, ""), "Termite") ~ "Isoptera",
str_detect(coalesce(host_species, ""), "Grasshopper") ~ "Orthoptera",
# Fish (species-level)
str_detect(coalesce(host_species, ""), "Danio rerio") ~ "Cypriniformes",
str_detect(coalesce(host_species, ""), "Oncorhynchus mykiss") ~ "Salmoniformes",
str_detect(coalesce(host_species, ""), "Oreochromis niloticus") ~ "Cichliformes",
# Generic fish remains NA (unknown order)
str_detect(coalesce(host_species, ""), "Actinopterygii \\(fish\\)") ~ NA_character_,
TRUE ~ host_order
)
) %>%
# Drop helper column
dplyr::select(-host_text)genome_metadata %>%
filter(is.na(host_class) | is.na(host_order) | host_class == "" | host_order == "") %>%
dplyr::count(host_species, sort = TRUE)# A tibble: 3 × 2
host_species n
<chr> <int>
1 Actinopterygii (fish) 3
2 Cabbage kimchi 1
3 <NA> 1
genome_metadata %>%
filter(host_species %in% c("Barbastella barbastellus", "Hipposideros ruber", "Glauconycteris sp",
"Cathartes aura", "Actinopterygii (fish)", "Grasshopper")) %>%
dplyr::count(host_species, host_class, host_order, sort = TRUE)# A tibble: 5 × 4
host_species host_class host_order n
<chr> <chr> <chr> <int>
1 Cathartes aura Aves Accipitriformes 4
2 Actinopterygii (fish) Actinopterygii <NA> 3
3 Hipposideros ruber Mammalia Chiroptera 2
4 Barbastella barbastellus Mammalia Chiroptera 1
5 Grasshopper Insecta Orthoptera 1
library(countrycode)
# 1) Simplify country strings like "Country: Region"
genome_metadata <- genome_metadata %>%
mutate(
country_simple = str_trim(str_split_fixed(coalesce(country, ""), ":", 2)[, 1]),
country_simple = na_if(country_simple, "")
)
# 2) Normalization map for common ENA/NCBI variants
.country_map <- c(
# US variants
"usa" = "United States",
"u\\.s\\." = "United States",
"u\\.s\\.a\\." = "United States",
"united states of america" = "United States",
# UK variants
"uk" = "United Kingdom",
"u\\.k\\." = "United Kingdom",
"great britain" = "United Kingdom",
"england" = "United Kingdom",
"scotland" = "United Kingdom",
"wales" = "United Kingdom",
"northern ireland" = "United Kingdom",
# Korea variants
"korea,? republic of" = "South Korea",
"republic of korea" = "South Korea",
"south korea" = "South Korea",
"korea,? \\(south\\)" = "South Korea",
# China variants
"pr china" = "China",
"p\\.r\\. china" = "China",
# Russia variants
"russian federation" = "Russia",
# Czech variants
"czech republic" = "Czechia",
# Eswatini / Swaziland; Côte d’Ivoire; DRC
"swaziland" = "Eswatini",
"cote d['’]ivoire" = "Côte d’Ivoire",
"democratic republic of the congo" = "DR Congo",
# Others commonly seen in ENA
"viet nam" = "Vietnam",
"myanmar \\(burma\\)" = "Myanmar",
"bolivia \\(plurinational state of\\)" = "Bolivia",
"iran,? islamic republic of" = "Iran",
"syrian arab republic" = "Syria",
"moldova,? republic of" = "Moldova",
"lao people'?s democratic republic" = "Laos",
"macedonia,? the former yugoslav republic of" = "North Macedonia",
"palestine,? state of" = "Palestine",
"hong kong" = "Hong Kong",
"macau|macao" = "Macau"
)
# 3) Apply normalization: lowercase → map → title‑case fallback
normalize_country <- function(x) {
y <- str_trim(tolower(coalesce(x, "")))
y[y == ""] <- NA_character_
# Apply regex replacements from the map (left to right)
for (pat in names(.country_map)) {
repl <- .country_map[[pat]]
y <- ifelse(!is.na(y), str_replace_all(y, paste0("^", pat, "$"), repl), y)
}
# General cleanups for accents/spacing variants
y <- ifelse(!is.na(y), str_replace_all(y, "\\s+", " "), y)
y <- ifelse(!is.na(y), str_replace_all(y, "^people's republic of china$", "China"), y)
# If still lowercased plain words (no mapping hit), title‑case them
# (won’t fix every case, but keeps things readable)
y <- ifelse(!is.na(y), str_to_title(y), y)
# Final tidy: replace common leftover patterns
y <- ifelse(!is.na(y), str_replace_all(y, " And ", " and "), y) # cosmetic
y
}
genome_metadata <- genome_metadata %>%
dplyr::mutate(
country_normalized = normalize_country(country_simple),
continent = countrycode::countrycode(
sourcevar = country_normalized,
origin = "country.name",
destination = "continent",
warn = TRUE
)
)Warning: There was 1 warning in `dplyr::mutate()`.
ℹ In argument: `continent = countrycode::countrycode(...)`.
Caused by warning:
! Some values were not matched unambiguously: None
genome_metadata <- genome_metadata %>%
# Normalize casing/whitespace for host_species
mutate(
host_species_norm = str_squish(host_species),
host_species_norm = if_else(
str_detect(str_to_lower(coalesce(host_species_norm, "")), "^homo\\s+sapiens$"),
"Homo sapiens",
# Title-case other species names to keep them tidy (optional)
str_to_title(host_species_norm)
)
) %>%
# Human vs animal classification
mutate(
host_type = case_when(
host_species_norm == "Homo sapiens" ~ "human",
!is.na(host_species_norm) & host_species_norm != "" ~ "animal",
TRUE ~ NA_character_
)
) %>%
# Keep your normalized species as the main column (optional)
mutate(host_species = host_species_norm) %>%
dplyr::select(-host_species_norm)# Read master index
master_index <- read_tsv("data/mags_metadata/master_mag_index_lactococcus_lactis.tsv") %>%
mutate(
# Extract the actual genome identifier from out_path
genome_identifier = case_when(
# For EHI: extract EHA00531_bin.1 from the filename
str_detect(out_path, "EHA") ~ str_extract(out_path, "EHA[0-9]+_bin\\.[0-9]+"),
# For NCBI: use the ID as is (GCA/GCF number)
TRUE ~ ID
)
)Rows: 116 Columns: 4
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (4): species, ID, MAG_url, out_path
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1] "Master index mapping:"
# A tibble: 10 × 3
ID genome_identifier out_path
<chr> <chr> <chr>
1 EHM013277 EHA00364_bin.25 data/lactococcus_lactis/mags/EHA00364_bin.25.fa.gz
2 EHM015868 EHA00350_bin.27 data/lactococcus_lactis/mags/EHA00350_bin.27.fa.gz
3 EHM016594 EHA00567_bin.9 data/lactococcus_lactis/mags/EHA00567_bin.9.fa.gz
4 EHM017294 EHA00776_bin.9 data/lactococcus_lactis/mags/EHA00776_bin.9.fa.gz
5 EHM019886 EHA00211_bin.10 data/lactococcus_lactis/mags/EHA00211_bin.10.fa.gz
6 EHM020035 EHA00169_bin.40 data/lactococcus_lactis/mags/EHA00169_bin.40.fa.gz
7 EHM025059 EHA01453_bin.48 data/lactococcus_lactis/mags/EHA01453_bin.48.fa.gz
8 EHM025679 EHA01477_bin.63 data/lactococcus_lactis/mags/EHA01477_bin.63.fa.gz
9 EHM027656 EHA01634_bin.31 data/lactococcus_lactis/mags/EHA01634_bin.31.fa.gz
10 EHM028200 EHA01709_bin.8 data/lactococcus_lactis/mags/EHA01709_bin.8.fa.gz
# Read contig-to-genome mapping
contig_to_genome <- read_tsv("data/mags_metadata/lactococcus_lactis_contig_to_mag.tsv",
col_names = c("contig", "genome_filename"))Rows: 10206 Columns: 2
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): contig, genome_filename
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
4.3.4 Genome annotations
# Read annotations and add IDs
genome_annotations <- read_tsv("data/mags_metadata/lactococcus_lactis_gene_annotations.tsv.xz") %>%
mutate(contig = sub("_[^_]*$", "", gene)) %>%
left_join(contig_to_genome, by = "contig") %>%
mutate(genome= genome_filename)%>%
filter(!is.na(genome))Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)
Rows: 365396 Columns: 17
── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (13): gene, strand, kegg, ec, pfam, cazy, resistance_type, resistance_target, vf, vf_type, signalp, defe...
dbl (2): start, end
lgl (2): antidefense, antidefense_type
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
4.3.4.1 REMOVE MAGs that do NOT belong to the species!! (specific lactococcus, rest should be fine)
# Cleaner to normalize identifiers
clean_label <- function(x) {
x <- basename(x)
sub("\\.(fna|fa|fasta)(\\.gz)?$", "", x, ignore.case = TRUE)
}
# List to remove (MAGs that belong to other species <95% ANI)
mags_to_remove <- c(
"GCA_018369575.1.fna","GCA_947063445.1.fna","GCA_947101685.1.fna","GCA_948698275.1.fna",
"GCA_937910935.1.fna","GCA_947072755.1.fna","GCA_948655095.1.fna","GCA_948703095.1.fna",
"GCA_947041925.1.fna","GCA_947073355.1.fna","GCA_948675165.1.fna","GCA_948718815.1.fna"
)
mags_to_remove_clean <- clean_label(mags_to_remove)
# ---------- genome_metadata: remove + dedup (keep best per ID) ----------
stopifnot("ID" %in% names(genome_metadata))
gm0 <- genome_metadata %>%
mutate(ID_clean = clean_label(ID))
n_before_meta <- nrow(gm0)
n_to_remove_meta <- sum(gm0$ID_clean %in% mags_to_remove_clean)
genome_metadata_filtered <- gm0 %>%
filter(!ID_clean %in% mags_to_remove_clean)
has_comp <- "completeness" %in% names(genome_metadata_filtered)
has_cont <- "contamination" %in% names(genome_metadata_filtered)
has_contigs <- "contigs" %in% names(genome_metadata_filtered)
if (has_comp || has_cont || has_contigs) {
genome_metadata_dedup <- genome_metadata_filtered %>%
mutate(
completeness = if (has_comp) as.numeric(completeness) else NA_real_,
contamination = if (has_cont) as.numeric(contamination) else NA_real_,
contigs = if (has_contigs) as.numeric(contigs) else NA_real_
) %>%
arrange(
ID_clean,
desc(if (has_comp) completeness else 0),
(if (has_cont) contamination else 0),
(if (has_contigs) contigs else Inf)
) %>%
distinct(ID_clean, .keep_all = TRUE)
} else {
genome_metadata_dedup <- genome_metadata_filtered %>%
distinct(ID_clean, .keep_all = TRUE)
}
genome_metadata_dedup <- genome_metadata_dedup %>%
select(-ID_clean)
# ---------- genome_annotations: ONLY filter (NO dedup) ----------
stopifnot("genome" %in% names(genome_annotations))
ga0 <- genome_annotations %>%
mutate(genome_clean = clean_label(genome))
n_before_ann <- nrow(ga0)
n_to_remove_ann <- sum(ga0$genome_clean %in% mags_to_remove_clean)
# Remove problematic genomes; keep all remaining genes
genome_annotations_filtered <- ga0 %>%
filter(!genome_clean %in% mags_to_remove_clean) %>%
select(-genome_clean)
# ---------- report ----------
cat("genome_metadata:\n")genome_metadata:
- rows before: 129
- removed by species list: 0
- rows after remove: 129
- rows after dedup: 129
genome_annotations:
- rows before: 335828
- removed by species list: 0
- rows after remove: 335828
4.3.5 Distill annotations into GIFTs
genome_gifts <- distill(genome_annotations,GIFT_db,genomecol= 19, annotcol=c(5,6,7,8), verbosity = F)
Identifiers in the annotation table: 1293
Identifiers in the database: 1547
Identifiers in both: 148
Percentage of annotation table identifiers used for distillation: 11.45%
Percentage of database identifiers used for distillation: 9.57%