Chapter 7 Functional Analysis
7.1 Functional overview
# A tibble: 6 × 2
genome n_genes
<chr> <int>
1 EHM013277 2223
2 EHM015868 2322
3 EHM016594 2131
4 EHM017294 2516
5 EHM019886 2410
6 EHM020035 2296
7.1.0.2 Number of annotated genes and percentages
#How many genes have at least 1 annotation
genome_annota <- genome_annotations %>%
filter(if_any(c(kegg, pfam, cazy), ~ !is.na(.))) %>%
nrow()
cat(genome_annota)281627
[1] 83.86049
7.1.0.3 Number of KEGG annotatated genes and percentages
# KEGG annotation
kegg_annota <- genome_annotations %>%
filter(!is.na(kegg)) %>%
nrow()
cat(kegg_annota)235205
[1] 83.5165
# AMR annotation
amr_annota <- genome_annotations %>%
filter(resistance_type == "AMR") %>%
nrow()
cat(amr_annota)24377
[1] 8.655775
62759
[1] 22.28444
n_pred_genes <- genome_annotations %>%
group_by(genome) %>%
summarize(n_genes = n()) %>%
left_join(
genome_metadata %>% dplyr::select(ID, host_type),
by = c("genome" = "ID")
)
annotated_genes <- genome_annotations %>%
filter(if_any(c(kegg, pfam, cazy), ~ !is.na(.))) %>%
group_by(genome) %>%
summarize(n_annotated_genes = n()) %>%
left_join(
genome_metadata %>% dplyr::select(ID, host_type),
by = c("genome" = "ID")
)ggplot(n_pred_genes, aes(x= host_type, y = n_genes, fill = host_type))+
scale_fill_manual(values = host_type_colors)+
geom_boxplot()+
geom_point()+
theme_classic()+
labs(y = "Gene Number", x = "Host Type")ggplot(annotated_genes, aes(x= host_type, y = n_annotated_genes, fill = host_type))+
scale_fill_manual(values = host_type_colors)+
geom_boxplot()+
geom_point()+
theme_classic()+
labs(y = "Annotated Gene Number", x = "Host Type")Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot compute exact p-value with ties
# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 111 0.156 Wilcoxon rank sum test with continuity correction two.sided
Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot compute exact p-value with ties
# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 97 0.0720 Wilcoxon rank sum test with continuity correction two.sided
7.2 KEGG
7.2.1 Nº of MAGs with KOs
#Proportion of MAGs belonging to each group per KEGG
#I want to know for example 5% of MAGs from anials have this KEGG, but 40 % of MAGs from humans have it
# KEGG presence/absence
kegg_presence <- genome_annotations %>%
filter(!is.na(kegg)) %>%
dplyr::select(genome, kegg) %>%
distinct()
#Add the host_type info
kegg_with_host_type <- kegg_presence %>%
left_join(
genome_metadata %>% dplyr::select(ID, host_type),
by = c("genome" = "ID")
)
# Count how many MAGs in each KEGG
kegg_mag_counts <- kegg_with_host_type %>%
group_by(host_type, kegg) %>%
summarise(
n_mags = n(),
.groups = "drop"
)
#Count total MAGs per host_type (except the outlier)
total_mags_per_host_type <- genome_metadata %>%
group_by(host_type) %>%
summarise(
total_mags = n_distinct(ID),
.groups = "drop"
)
#Calculate proportions of MAGs from each host_type in each KEGG
kegg_mag_proportions <- kegg_mag_counts %>%
left_join(total_mags_per_host_type, by = "host_type") %>%
mutate(
proportion = n_mags / total_mags,
absent = total_mags - n_mags
)7.2.1.1 Plotting KEGG MAG proportions
# A tibble: 2,692 × 6
host_type kegg n_mags total_mags proportion absent
<chr> <chr> <int> <int> <dbl> <int>
1 animal K00003 40 84 0.476 44
2 animal K00004 38 84 0.452 46
3 animal K00009 39 84 0.464 45
4 animal K00013 41 84 0.488 43
5 animal K00014 41 84 0.488 43
6 animal K00016 41 84 0.488 43
7 animal K00020 41 84 0.488 43
8 animal K00030 41 84 0.488 43
9 animal K00033 41 84 0.488 43
10 animal K00036 41 84 0.488 43
# ℹ 2,682 more rows
7.2.1.2 Statistical testing of MAG proportions
fisher_results <- kegg_mag_proportions %>%
dplyr::select(kegg, host_type, n_mags, absent) %>%
pivot_wider(
names_from = host_type,
values_from = c(n_mags, absent),
values_fill = 0
) %>%
rowwise() %>%
mutate(
p_value = fisher.test(
matrix(
c(n_mags_animal, absent_animal,
n_mags_human, absent_human),
nrow = 2,
byrow = TRUE
)
)$p.value
) %>%
ungroup() %>%
mutate(p_adj = p.adjust(p_value, method = "BH"))
fisher_results <- fisher_results %>%
mutate(
prop_animal = n_mags_animal / (n_mags_animal + absent_animal),
prop_human = n_mags_human / (n_mags_human + absent_human),
diff_prop = prop_human - prop_animal
)volcano_df <- fisher_results %>%
# Keep rows with non-missing p_adj and diff_prop
filter(!is.na(p_adj), !is.na(diff_prop)) %>%
# Avoid Inf y-values if p_adj == 0 by flooring at 1e-300 (or your choice)
mutate(
p_adj_capped = pmax(p_adj, 1e-300),
nl10 = -log10(p_adj_capped),
sig = p_adj < 0.05
)
to_label <- volcano_df %>%
filter(sig) %>%
arrange(desc(nl10)) %>%
dplyr::slice(1:10)
ggplot(volcano_df, aes(x = diff_prop, y = nl10)) +
geom_point(aes(color = sig), alpha = 0.8, size = 2) +
geom_text_repel(
data = to_label,
aes(label = kegg), # or `Name`
max.overlaps = 20,
size = 3,
segment.color = "grey70"
) +
scale_color_manual(values = c(`TRUE` = "#1f77b4", `FALSE` = "grey70"), name ="p_adj < 0.05") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey50") +
theme_minimal() +
labs(
x = "Effect size (diff_prop)",
y = expression(-log[10](p[adj])),
title = "Volcano plot: differential KEGG presence"
)# Ensure host_type has the intended order
genome_metadata <- genome_metadata %>%
mutate(host_type = factor(host_type, levels = c("animal", "human")))
# 1) KEGG presence (unique genome–kegg)
kegg_presence <- genome_annotations %>%
dplyr::select(genome, kegg) %>%
distinct() %>%
filter(!is.na(kegg))
# 2) Add host type
kegg_with_host <- kegg_presence %>%
left_join(
genome_metadata %>% dplyr::select(ID, host_type),
by = c("genome" = "ID")
)
# 3) Count present per (kegg, host_type)
kegg_counts <- kegg_with_host %>%
group_by(kegg, host_type) %>%
summarise(n_present = n_distinct(genome), .groups = "drop")
# 4) Totals per host_type
totals <- genome_metadata %>%
group_by(host_type) %>%
summarise(total_mags = n_distinct(ID), .groups = "drop")
# 5) Complete grid and compute absent correctly
kegg_counts_complete <- kegg_counts %>%
tidyr::complete(kegg, host_type, fill = list(n_present = 0)) %>%
left_join(totals, by = "host_type") %>%
mutate(n_absent = total_mags - n_present)
# 6) Pivot to 2×2 format
wide <- kegg_counts_complete %>%
dplyr::select(kegg, host_type, n_present, n_absent) %>%
pivot_wider(
id_cols = kegg,
names_from = host_type,
values_from = c(n_present, n_absent),
values_fill = 0
)
# Sanity: ensure expected columns exist (adjust names if your levels differ)
stopifnot(all(c("n_present_animal","n_absent_animal","n_present_human","n_absent_human") %in% names(wide)))
wide <- wide %>%
mutate(
n_animal = n_present_animal,
a_animal = n_absent_animal,
n_human = n_present_human,
a_human = n_absent_human
)
# 7) SAFE P-VALUE: robust computation avoiding FEXACT 40
safe_fisher_p <- function(n_animal, a_animal, n_human, a_human) {
m <- matrix(c(n_animal, a_animal, n_human, a_human), nrow = 2, byrow = TRUE)
# If any row or column sums to zero, Fisher is fine (returns 1) or undefined;
# handle degenerate cases: if all zero or all present, return p=1
if (any(rowSums(m) == 0) || any(colSums(m) == 0)) {
return(1.0)
}
# 1) Try exact Fisher with large workspace
p1 <- tryCatch(
fisher.test(m, workspace = 2e8)$p.value,
error = function(e) NA_real_
)
if (!is.na(p1)) return(p1)
# 2) Fall back to simulated Fisher
p2 <- tryCatch(
fisher.test(m, simulate.p.value = TRUE, B = 1e6)$p.value,
error = function(e) NA_real_
)
if (!is.na(p2)) return(p2)
# 3) Last resort: chi-square with Yates correction
p3 <- tryCatch(
suppressWarnings(chisq.test(m, correct = TRUE)$p.value),
error = function(e) NA_real_
)
if (!is.na(p3)) return(p3)
# If everything fails, return NA
return(NA_real_)
}
# 8) Compute stats rowwise with robust p-values
fisher_df <- wide %>%
rowwise() %>%
mutate(
# p-value
p_value = safe_fisher_p(n_animal, a_animal, n_human, a_human),
# proportions and difference
prop_animal = ifelse((n_animal + a_animal) > 0, n_animal / (n_animal + a_animal), NA_real_),
prop_human = ifelse((n_human + a_human) > 0, n_human / (n_human + a_human), NA_real_),
diff_prop = prop_human - prop_animal,
# Haldane–Anscombe for OR if any zero cell (effect size only)
needs_ha = (n_animal == 0L) | (a_animal == 0L) | (n_human == 0L) | (a_human == 0L),
n_animal_ha = ifelse(needs_ha, n_animal + 0.5, n_animal),
a_animal_ha = ifelse(needs_ha, a_animal + 0.5, a_animal),
n_human_ha = ifelse(needs_ha, n_human + 0.5, n_human),
a_human_ha = ifelse(needs_ha, a_human + 0.5, a_human),
or = (n_human_ha * a_animal_ha) / (a_human_ha * n_animal_ha),
log2_or = log2(or),
# Relative risk with light Laplace smoothing (fold-change of proportions)
rr = ((n_human + 0.5) / (n_human + a_human + 1)) /
((n_animal + 0.5) / (n_animal + a_animal + 1)),
log2_rr = log2(rr)
) %>%
ungroup() %>%
mutate(p_adj = p.adjust(p_value, method = "BH"))
# 9) Optional prevalence filter to stabilize estimates
fisher_df <- fisher_df %>%
mutate(total_present = n_animal + n_human) %>%
filter(
total_present >= 5 |
prop_animal >= 0.03 | prop_human >= 0.03
)
# 10) Volcano (log2 OR)
volcano_df <- fisher_df %>%
filter(!is.na(p_adj), !is.na(log2_or)) %>%
mutate(
p_adj_capped = pmax(p_adj, 1e-300),
nl10 = -log10(p_adj_capped),
sig = p_adj < 0.05
)
to_label <- volcano_df %>%
filter(sig) %>%
arrange(desc(nl10)) %>%
dplyr::slice(1:10)
ggplot(volcano_df, aes(x = log2_or, y = nl10)) +
geom_point(aes(color = sig), alpha = 0.8, size = 2) +
geom_text_repel(
data = to_label,
aes(label = kegg),
max.overlaps = 20,
size = 3,
segment.color = "grey70"
) +
scale_color_manual(values = c(`TRUE` = "#1f77b4", `FALSE` = "grey70"), name = "FDR < 0.05") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey50") +
geom_vline(xintercept = c(-1, 1), linetype = "dotted", color = "grey60") + # OR ~0.5 and ~2
theme_minimal() +
labs(
x = "Effect size (log2 odds ratio, human vs animal)",
y = expression(-log[10](FDR)),
title = "Volcano plot: differential KEGG presence (human vs animal)"
)# 11) Top 20 by significance then effect size
top20 <- fisher_df %>%
arrange(p_adj, desc(abs(log2_or))) %>%
dplyr::select(
kegg,
n_human, a_human, prop_human,
n_animal, a_animal, prop_animal,
diff_prop, log2_rr, log2_or, p_value, p_adj
) %>%
dplyr::slice(1:20)
top20# A tibble: 20 × 12
kegg n_human a_human prop_human n_animal a_animal prop_animal diff_prop log2_rr log2_or p_value p_adj
<chr> <int> <int> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 K12268 0 42 0 19 65 0.226 -0.226 -4.30 -4.66 0.000323 0.00356
2 K12270 0 42 0 19 65 0.226 -0.226 -4.30 -4.66 0.000323 0.00356
3 K18967 1 41 0.0238 20 64 0.238 -0.214 -2.79 -3.68 0.00179 0.00356
4 K20485 3 39 0.0714 33 51 0.393 -0.321 -2.28 -3.07 0.000119 0.00356
5 K07114 4 38 0.0952 39 45 0.464 -0.369 -2.15 -3.04 0.0000222 0.00356
6 K10242 3 39 0.0714 30 54 0.357 -0.286 -2.14 -2.85 0.000467 0.00356
7 K13789 4 38 0.0952 34 50 0.405 -0.310 -1.96 -2.69 0.000363 0.00356
8 K06075 5 37 0.119 39 45 0.464 -0.345 -1.86 -2.68 0.000127 0.00356
9 K13730 4 38 0.0952 33 51 0.393 -0.298 -1.91 -2.62 0.000398 0.00356
10 K07001 5 37 0.119 38 46 0.452 -0.333 -1.82 -2.61 0.000145 0.00356
11 K01235 4 38 0.0952 32 52 0.381 -0.286 -1.87 -2.55 0.000712 0.00356
12 K08217 4 38 0.0952 32 52 0.381 -0.286 -1.87 -2.55 0.000712 0.00356
13 K15531 4 38 0.0952 32 52 0.381 -0.286 -1.87 -2.55 0.000712 0.00356
14 K01854 5 37 0.119 36 48 0.429 -0.310 -1.75 -2.47 0.000502 0.00356
15 K09155 5 37 0.119 36 48 0.429 -0.310 -1.75 -2.47 0.000502 0.00356
16 K16028 5 37 0.119 36 48 0.429 -0.310 -1.75 -2.47 0.000502 0.00356
17 K07005 6 36 0.143 40 44 0.476 -0.333 -1.66 -2.45 0.000196 0.00356
18 K21739 6 36 0.143 40 44 0.476 -0.333 -1.66 -2.45 0.000196 0.00356
19 K22980 6 36 0.143 39 45 0.464 -0.321 -1.62 -2.38 0.000351 0.00356
20 K20374 5 37 0.119 34 50 0.405 -0.286 -1.67 -2.33 0.000993 0.00356
library(KEGGREST)
# Function to get info from KEGG
get_kegg_info <- function(ko_id) {
query <- keggGet(ko_id)[[1]]
# Extract Name and Definition
name <- ifelse(!is.null(query$NAME), query$NAME, "N/A")
definition <- ifelse(!is.null(query$DEFINITION), query$DEFINITION, "N/A")
# Extract Pathways (often multiple)
pathways <- if(!is.null(query$PATHWAY)) {
paste(names(query$PATHWAY), query$PATHWAY, sep = ": ", collapse = "; ")
} else {
"No pathway assigned"
}
return(c(KO = ko_id, Name = name, Definition = definition, Pathways = pathways))
}ko_list <- lapply(top20$kegg, get_kegg_info)
ko_summary <- as.data.frame(do.call(rbind, ko_list))
print(ko_summary) KO Name Definition
1 K12268 accessory secretory protein Asp1 N/A
2 K12270 accessory secretory protein Asp3 N/A
3 K18967 diguanylate cyclase [EC:2.7.7.65] N/A
4 K20485 ATP-binding cassette, subfamily B, bacterial NisT/SpaT N/A
5 K07114 Ca-activated chloride channel homolog N/A
6 K10242 cellobiose transport system permease protein N/A
7 K13789 geranylgeranyl diphosphate synthase, type II [EC:2.5.1.1 2.5.1.10 2.5.1.29] N/A
8 K06075 MarR family transcriptional regulator, transcriptional regulator for hemolysin N/A
9 K13730 internalin A N/A
10 K07001 NTE family protein N/A
11 K01235 alpha-glucuronidase [EC:3.2.1.139] N/A
12 K08217 MFS transporter, DHA3 family, macrolide efflux protein N/A
13 K15531 oligosaccharide reducing-end xylanase [EC:3.2.1.156] N/A
14 K01854 UDP-galactopyranose mutase [EC:5.4.99.9] N/A
15 K09155 uncharacterized protein N/A
16 K16028 O-methyltransferase N/A
17 K07005 uncharacterized protein N/A
18 K21739 probable pyridine nucleotide-disulfide oxidoreductase N/A
19 K22980 capsule synthesis positive regulator AcpB N/A
20 K20374 HTH-type transcriptional regulator, SHP3-responsive repressor N/A
Pathways
1 No pathway assigned
2 No pathway assigned
3 No pathway assigned
4 map02020: Two-component system; map02024: Quorum sensing
5 No pathway assigned
6 map02010: ABC transporters
7 map00900: Terpenoid backbone biosynthesis; map01100: Metabolic pathways; map01110: Biosynthesis of secondary metabolites
8 No pathway assigned
9 map05100: Bacterial invasion of epithelial cells
10 No pathway assigned
11 No pathway assigned
12 No pathway assigned
13 No pathway assigned
14 map00052: Galactose metabolism; map00520: Amino sugar and nucleotide sugar metabolism; map01100: Metabolic pathways; map01250: Biosynthesis of nucleotide sugars
15 No pathway assigned
16 map01051: Biosynthesis of ansamycins; map01052: Type I polyketide structures
17 No pathway assigned
18 No pathway assigned
19 No pathway assigned
20 map02024: Quorum sensing
fish_results_names <- left_join(top20, ko_summary, join_by(kegg== KO))
heatmap_data <- fish_results_names %>%
dplyr::select(Name, prop_animal, prop_human)%>%
pivot_longer(!Name, names_to = "host_type", values_to= "proportion" ) %>%
mutate(
host_type = case_when(
host_type == "prop_animal" ~ "animal",
host_type == "prop_human" ~ "human",
TRUE ~ host_type
)
)# 1) Reduce to one row per Name × host_type
heatmap_summary <- heatmap_data %>%
group_by(Name, host_type) %>%
summarise(
proportion = mean(proportion, na.rm = TRUE), # or median(), max(), first()
.groups = "drop"
)
# 2) Compute diff = animal - human in wide form
heatmap_wide <- heatmap_summary %>%
filter(host_type %in% c("animal", "human")) %>% # just in case there are other types
tidyr::pivot_wider(
names_from = host_type,
values_from = proportion
) %>%
mutate(
diff = animal - human
)
# 3) Reorder Name by diff and bring back to long for plotting
heatmap_plot <- heatmap_wide %>%
mutate(Name = forcats::fct_reorder(Name, diff)) %>%
tidyr::pivot_longer(
cols = c(animal, human),
names_to = "host_type",
values_to = "proportion"
)
# 4) Plot — heatmap
ggplot(heatmap_plot,
aes(x = host_type, y = Name, fill = proportion)) +
geom_tile() +
geom_text(aes(label = scales::number(proportion, accuracy = 0.01)),
size = 6) +
scale_fill_viridis_c(
name = "Proportion of MAGs",
limits = c(0, 1)
) +
theme_minimal() +
labs(
x = "host_type",
y = "KEGG ortholog",
title = "Differential KEGG presence between animal and human"
)table_data <- fish_results_names %>%
transmute(
`KEGG ortholog` = Name,
`Proportion in animal` = prop_animal,
`Proportion in human` = prop_human,
`Δ (human - animal)` = prop_human - prop_animal ,
`Log2(Odds Ratio)` = log2_or ,
`Fisher p-value` = p_adj
) %>%
arrange(desc(abs(`Δ (human - animal)`)))
table_data# A tibble: 20 × 6
`KEGG ortholog` `Proportion in animal` `Proportion in human` `Δ (human - animal)` `Log2(Odds Ratio)`
<chr> <dbl> <dbl> <dbl> <dbl>
1 Ca-activated chloride c… 0.464 0.0952 -0.369 -3.04
2 MarR family transcripti… 0.464 0.119 -0.345 -2.68
3 NTE family protein 0.452 0.119 -0.333 -2.61
4 uncharacterized protein 0.476 0.143 -0.333 -2.45
5 probable pyridine nucle… 0.476 0.143 -0.333 -2.45
6 capsule synthesis posit… 0.464 0.143 -0.321 -2.38
7 ATP-binding cassette, s… 0.393 0.0714 -0.321 -3.07
8 geranylgeranyl diphosph… 0.405 0.0952 -0.310 -2.69
9 UDP-galactopyranose mut… 0.429 0.119 -0.310 -2.47
10 uncharacterized protein 0.429 0.119 -0.310 -2.47
11 O-methyltransferase 0.429 0.119 -0.310 -2.47
12 internalin A 0.393 0.0952 -0.298 -2.62
13 cellobiose transport sy… 0.357 0.0714 -0.286 -2.85
14 alpha-glucuronidase [EC… 0.381 0.0952 -0.286 -2.55
15 MFS transporter, DHA3 f… 0.381 0.0952 -0.286 -2.55
16 oligosaccharide reducin… 0.381 0.0952 -0.286 -2.55
17 HTH-type transcriptiona… 0.405 0.119 -0.286 -2.33
18 accessory secretory pro… 0.226 0 -0.226 -4.66
19 accessory secretory pro… 0.226 0 -0.226 -4.66
20 diguanylate cyclase [EC… 0.238 0.0238 -0.214 -3.68
# ℹ 1 more variable: `Fisher p-value` <dbl>
Warning: package 'gt' was built under R version 4.4.3
Adjuntando el paquete: 'gt'
The following object is masked from 'package:ape':
where
table_data %>%
gt() %>%
fmt_number(
columns = c(`Proportion in animal`, `Proportion in human`, `Δ (human - animal)`, `Log2(Odds Ratio)` ),
decimals = 2
) %>%
fmt_scientific(
columns = `Fisher p-value`,
decimals = 2
) %>%
tab_header(
title = "KEGG orthologs differing between animal and human MAGs"
) %>%
cols_align(
align = "center",
everything()
)%>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels(everything())
)| KEGG orthologs differing between animal and human MAGs | |||||
| KEGG ortholog | Proportion in animal | Proportion in human | Δ (human - animal) | Log2(Odds Ratio) | Fisher p-value |
|---|---|---|---|---|---|
| Ca-activated chloride channel homolog | 0.46 | 0.10 | −0.37 | −3.04 | 3.56 × 10−3 |
| MarR family transcriptional regulator, transcriptional regulator for hemolysin | 0.46 | 0.12 | −0.35 | −2.68 | 3.56 × 10−3 |
| NTE family protein | 0.45 | 0.12 | −0.33 | −2.61 | 3.56 × 10−3 |
| uncharacterized protein | 0.48 | 0.14 | −0.33 | −2.45 | 3.56 × 10−3 |
| probable pyridine nucleotide-disulfide oxidoreductase | 0.48 | 0.14 | −0.33 | −2.45 | 3.56 × 10−3 |
| capsule synthesis positive regulator AcpB | 0.46 | 0.14 | −0.32 | −2.38 | 3.56 × 10−3 |
| ATP-binding cassette, subfamily B, bacterial NisT/SpaT | 0.39 | 0.07 | −0.32 | −3.07 | 3.56 × 10−3 |
| geranylgeranyl diphosphate synthase, type II [EC:2.5.1.1 2.5.1.10 2.5.1.29] | 0.40 | 0.10 | −0.31 | −2.69 | 3.56 × 10−3 |
| UDP-galactopyranose mutase [EC:5.4.99.9] | 0.43 | 0.12 | −0.31 | −2.47 | 3.56 × 10−3 |
| uncharacterized protein | 0.43 | 0.12 | −0.31 | −2.47 | 3.56 × 10−3 |
| O-methyltransferase | 0.43 | 0.12 | −0.31 | −2.47 | 3.56 × 10−3 |
| internalin A | 0.39 | 0.10 | −0.30 | −2.62 | 3.56 × 10−3 |
| cellobiose transport system permease protein | 0.36 | 0.07 | −0.29 | −2.85 | 3.56 × 10−3 |
| alpha-glucuronidase [EC:3.2.1.139] | 0.38 | 0.10 | −0.29 | −2.55 | 3.56 × 10−3 |
| MFS transporter, DHA3 family, macrolide efflux protein | 0.38 | 0.10 | −0.29 | −2.55 | 3.56 × 10−3 |
| oligosaccharide reducing-end xylanase [EC:3.2.1.156] | 0.38 | 0.10 | −0.29 | −2.55 | 3.56 × 10−3 |
| HTH-type transcriptional regulator, SHP3-responsive repressor | 0.40 | 0.12 | −0.29 | −2.33 | 3.56 × 10−3 |
| accessory secretory protein Asp1 | 0.23 | 0.00 | −0.23 | −4.66 | 3.56 × 10−3 |
| accessory secretory protein Asp3 | 0.23 | 0.00 | −0.23 | −4.66 | 3.56 × 10−3 |
| diguanylate cyclase [EC:2.7.7.65] | 0.24 | 0.02 | −0.21 | −3.68 | 3.56 × 10−3 |
7.2.2 Functional Ordination
PCA of KEGG annotations:
kegg_counts <- genome_annotations %>%
filter(!is.na(kegg)) %>%
dplyr::count(genome, kegg) %>%
pivot_wider(
names_from = kegg,
values_from = n,
values_fill = 0
)
# Normalization
kegg_rel <- kegg_counts %>%
column_to_rownames("genome")
kegg_rel <- kegg_rel / rowSums(kegg_rel) # Each row sums to 1
# Remove zero variance
kegg_rel_nz <- kegg_rel[, apply(kegg_rel, 2, sd) > 0]
# PCA with scaling
pca <- prcomp(kegg_rel_nz, scale. = TRUE)
# Check variance explained
summary(pca)$importance[,1:5] PC1 PC2 PC3 PC4 PC5
Standard deviation 19.10047 11.37402 8.655536 7.950973 7.575608
Proportion of Variance 0.16977 0.06020 0.034860 0.029420 0.026710
Cumulative Proportion 0.16977 0.22997 0.264830 0.294250 0.320950
country_palette <- c(
# Southern Europe
"Spain" = "#1B9E77",
"Italy" = "#33A02C",
"Greece" = "#66C2A5",
"Portugal" = "#2CA25F",
"Malta" = "#99D8C9",
# Northern/Central Europe
"Germany" = "#1F78B4",
"United Kingdom" = "#4A90E2",
"Ireland" = "#6BAED6",
# East Asia
"Japan" = "#E31A1C",
"South Korea" = "#FB6A4A",
"China" = "#CB181D",
# North America
"USA" = "#756BB1",
"Canada" = "#9E9AC8",
# Distinct
"Australia" = "#FFD92F",
"Greenland" = "#A6CEE3",
"none" = "grey70"
)scores <- as.data.frame(pca$x)
scores$ID <- rownames(scores)
scores <- scores %>%
left_join(genome_metadata, by = "ID")
ggplot(scores, aes(PC1, PC2, color = host_type, label = ID)) +
geom_point(size = 2) +
scale_color_manual(values = host_type_colors)+
theme_minimal() +
labs(
title = "PCA of KEGG annotations across MAGs",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
) + geom_text_repel()Warning: ggrepel: 139 unlabeled data points (too many overlaps). Consider increasing max.overlaps
##PCA again without outlier
kegg_counts <- genome_annotations %>%
filter(genome != "EHM071389") %>%
filter(!is.na(kegg)) %>%
dplyr::count(genome, kegg) %>%
pivot_wider(
names_from = kegg,
values_from = n,
values_fill = 0
)
# Normalization
kegg_rel <- kegg_counts %>%
column_to_rownames("genome")
kegg_rel <- kegg_rel / rowSums(kegg_rel) # Each row sums to 1
# Remove zero variance
kegg_rel_nz <- kegg_rel[, apply(kegg_rel, 2, sd) > 0]
# PCA with scaling
pca <- prcomp(kegg_rel_nz, scale. = TRUE)
# Check variance explained
summary(pca)$importance[,1:5] PC1 PC2 PC3 PC4 PC5
Standard deviation 19.54513 8.762091 8.060842 7.660639 7.307655
Proportion of Variance 0.18384 0.036950 0.031270 0.028240 0.025700
Cumulative Proportion 0.18384 0.220780 0.252050 0.280290 0.305990
scores_filtered <- scores %>% filter(ID != "EHM071389")
ggplot(scores_filtered, aes(PC1, PC2, color = host_type)) +
geom_point(size = 2) +
scale_color_manual(values = host_type_colors)+
theme_minimal() +
labs(
title = "PCA of KEGG annotations across MAGs",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
) ggplot(scores_filtered, aes(PC2, PC3, color = host_type)) +
scale_color_manual(values = host_type_colors)+
geom_point(size = 2) +
theme_minimal() +
labs(
title = "P2 vs PC3 of KEGG annotations across MAGs",
x = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)"),
y = paste0("PC3 (", round(summary(pca)$importance[2,3] * 100, 1), "%)")
)ggplot(scores_filtered, aes(PC3, PC4, color = host_type)) +
scale_color_manual(values = host_type_colors)+
geom_point(size = 2) +
theme_minimal() +
labs(
title = "P3 vs PC4 of KEGG annotations across MAGs",
x = paste0("PC3 (", round(summary(pca)$importance[2,3] * 100, 1), "%)"),
y = paste0("PC4 (", round(summary(pca)$importance[2,4] * 100, 1), "%)")
)ggplot(scores_filtered, aes(PC1, PC2, color = genome_size)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "KEGG PCA colored by genome size",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores_filtered, aes(PC1, PC2, color = completeness)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "KEGG PCA colored by completeness",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores_filtered, aes(PC1, PC2, color = host_order)) +
scale_color_manual(values = host_order_colors)+
geom_point(size = 2) +
theme_minimal() +
labs(
title = "KEGG PCA colored by host order ",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores_filtered, aes(PC1, PC2, color = country_normalized)) +
geom_point(size = 2) +
theme_minimal() +
scale_color_manual(values = country_palette, na.value = "grey70")+
labs(
title = "KEGG PCA colored by country ",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores_filtered, aes(PC1, PC2, color = continent)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "KEGG PCA colored by continent",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)loadings <- pca$rotation
abs_loadings <- apply(abs(loadings[, 1:2]), 1, sum)
top_combined <- sort(abs_loadings, decreasing = TRUE)[1:10]
top_combined K01811 K21712 K20485 K25670 K00390 K02049 K15269 K06123 K21713 K25089
0.09747308 0.09689785 0.09352428 0.09181942 0.09112710 0.09036799 0.09021472 0.09020442 0.08818284 0.08645722
library(KEGGREST)
top_loadings <- loadings %>%
as.data.frame() %>%
rownames_to_column("KEGG") %>%
arrange(desc(abs(PC1)))
head(top_loadings) KEGG PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
1 K00079 -0.05079409 -0.009946912 0.00248532 0.003276063 -0.001132451 -0.002808841 -0.003852092 -0.002341786
2 K00036 -0.05079409 -0.009946912 0.00248532 0.003276063 -0.001132451 -0.002808841 -0.003852092 -0.002341786
3 K00133 -0.05079409 -0.009946912 0.00248532 0.003276063 -0.001132451 -0.002808841 -0.003852092 -0.002341786
4 K00215 -0.05079409 -0.009946912 0.00248532 0.003276063 -0.001132451 -0.002808841 -0.003852092 -0.002341786
5 K00243 -0.05079409 -0.009946912 0.00248532 0.003276063 -0.001132451 -0.002808841 -0.003852092 -0.002341786
6 K00567 -0.05079409 -0.009946912 0.00248532 0.003276063 -0.001132451 -0.002808841 -0.003852092 -0.002341786
PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16
1 0.0004521665 -0.002392353 -0.002480668 0.0009913822 -0.003778335 -0.0006078069 -0.002528725 0.001592632
2 0.0004521665 -0.002392353 -0.002480668 0.0009913822 -0.003778335 -0.0006078069 -0.002528725 0.001592632
3 0.0004521665 -0.002392353 -0.002480668 0.0009913822 -0.003778335 -0.0006078069 -0.002528725 0.001592632
4 0.0004521665 -0.002392353 -0.002480668 0.0009913822 -0.003778335 -0.0006078069 -0.002528725 0.001592632
5 0.0004521665 -0.002392353 -0.002480668 0.0009913822 -0.003778335 -0.0006078069 -0.002528725 0.001592632
6 0.0004521665 -0.002392353 -0.002480668 0.0009913822 -0.003778335 -0.0006078069 -0.002528725 0.001592632
PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24
1 -0.003360407 0.001340981 -0.0002272723 -0.003768608 -0.002291973 9.500263e-05 0.002216337 -0.004224191
2 -0.003360407 0.001340981 -0.0002272723 -0.003768608 -0.002291973 9.500263e-05 0.002216337 -0.004224191
3 -0.003360407 0.001340981 -0.0002272723 -0.003768608 -0.002291973 9.500263e-05 0.002216337 -0.004224191
4 -0.003360407 0.001340981 -0.0002272723 -0.003768608 -0.002291973 9.500263e-05 0.002216337 -0.004224191
5 -0.003360407 0.001340981 -0.0002272723 -0.003768608 -0.002291973 9.500263e-05 0.002216337 -0.004224191
6 -0.003360407 0.001340981 -0.0002272723 -0.003768608 -0.002291973 9.500263e-05 0.002216337 -0.004224191
PC25 PC26 PC27 PC28 PC29 PC30 PC31 PC32
1 -0.0001631202 0.0002876036 0.003178559 -0.003037596 -0.0001121851 0.0009708875 -0.00193514 0.0004799773
2 -0.0001631202 0.0002876036 0.003178559 -0.003037596 -0.0001121851 0.0009708875 -0.00193514 0.0004799773
3 -0.0001631202 0.0002876036 0.003178559 -0.003037596 -0.0001121851 0.0009708875 -0.00193514 0.0004799773
4 -0.0001631202 0.0002876036 0.003178559 -0.003037596 -0.0001121851 0.0009708875 -0.00193514 0.0004799773
5 -0.0001631202 0.0002876036 0.003178559 -0.003037596 -0.0001121851 0.0009708875 -0.00193514 0.0004799773
6 -0.0001631202 0.0002876036 0.003178559 -0.003037596 -0.0001121851 0.0009708875 -0.00193514 0.0004799773
PC33 PC34 PC35 PC36 PC37 PC38 PC39 PC40
1 0.00219996 -0.0003941481 0.000202696 0.001647772 0.0001366408 -0.0007984333 0.001261048 0.0009959284
2 0.00219996 -0.0003941481 0.000202696 0.001647772 0.0001366408 -0.0007984333 0.001261048 0.0009959284
3 0.00219996 -0.0003941481 0.000202696 0.001647772 0.0001366408 -0.0007984333 0.001261048 0.0009959284
4 0.00219996 -0.0003941481 0.000202696 0.001647772 0.0001366408 -0.0007984333 0.001261048 0.0009959284
5 0.00219996 -0.0003941481 0.000202696 0.001647772 0.0001366408 -0.0007984333 0.001261048 0.0009959284
6 0.00219996 -0.0003941481 0.000202696 0.001647772 0.0001366408 -0.0007984333 0.001261048 0.0009959284
PC41 PC42 PC43 PC44 PC45 PC46 PC47 PC48
1 0.00029952 -0.002102056 0.000196593 -0.001264468 0.0006562265 0.001189624 -0.001023567 0.0006217118
2 0.00029952 -0.002102056 0.000196593 -0.001264468 0.0006562265 0.001189624 -0.001023567 0.0006217118
3 0.00029952 -0.002102056 0.000196593 -0.001264468 0.0006562265 0.001189624 -0.001023567 0.0006217118
4 0.00029952 -0.002102056 0.000196593 -0.001264468 0.0006562265 0.001189624 -0.001023567 0.0006217118
5 0.00029952 -0.002102056 0.000196593 -0.001264468 0.0006562265 0.001189624 -0.001023567 0.0006217118
6 0.00029952 -0.002102056 0.000196593 -0.001264468 0.0006562265 0.001189624 -0.001023567 0.0006217118
PC49 PC50 PC51 PC52 PC53 PC54 PC55 PC56
1 -0.00137164 -0.0007412628 0.0001437467 0.0009806099 0.001945774 0.0001546252 0.001761751 -0.0005399553
2 -0.00137164 -0.0007412628 0.0001437467 0.0009806099 0.001945774 0.0001546252 0.001761751 -0.0005399553
3 -0.00137164 -0.0007412628 0.0001437467 0.0009806099 0.001945774 0.0001546252 0.001761751 -0.0005399553
4 -0.00137164 -0.0007412628 0.0001437467 0.0009806099 0.001945774 0.0001546252 0.001761751 -0.0005399553
5 -0.00137164 -0.0007412628 0.0001437467 0.0009806099 0.001945774 0.0001546252 0.001761751 -0.0005399553
6 -0.00137164 -0.0007412628 0.0001437467 0.0009806099 0.001945774 0.0001546252 0.001761751 -0.0005399553
PC57 PC58 PC59 PC60 PC61 PC62 PC63 PC64
1 -0.0006939236 -0.0008296102 0.0007104475 0.0005156834 5.873391e-05 0.002228717 0.001405275 -0.003018754
2 -0.0006939236 -0.0008296102 0.0007104475 0.0005156834 5.873391e-05 0.002228717 0.001405275 -0.003018754
3 -0.0006939236 -0.0008296102 0.0007104475 0.0005156834 5.873391e-05 0.002228717 0.001405275 -0.003018754
4 -0.0006939236 -0.0008296102 0.0007104475 0.0005156834 5.873391e-05 0.002228717 0.001405275 -0.003018754
5 -0.0006939236 -0.0008296102 0.0007104475 0.0005156834 5.873391e-05 0.002228717 0.001405275 -0.003018754
6 -0.0006939236 -0.0008296102 0.0007104475 0.0005156834 5.873391e-05 0.002228717 0.001405275 -0.003018754
PC65 PC66 PC67 PC68 PC69 PC70 PC71 PC72
1 -0.001450016 0.001010715 0.0007081953 9.925565e-05 -0.0003884143 -0.001329291 -0.0005705215 0.0001924911
2 -0.001450016 0.001010715 0.0007081953 9.925565e-05 -0.0003884143 -0.001329291 -0.0005705215 0.0001924911
3 -0.001450016 0.001010715 0.0007081953 9.925565e-05 -0.0003884143 -0.001329291 -0.0005705215 0.0001924911
4 -0.001450016 0.001010715 0.0007081953 9.925565e-05 -0.0003884143 -0.001329291 -0.0005705215 0.0001924911
5 -0.001450016 0.001010715 0.0007081953 9.925565e-05 -0.0003884143 -0.001329291 -0.0005705215 0.0001924911
6 -0.001450016 0.001010715 0.0007081953 9.925565e-05 -0.0003884143 -0.001329291 -0.0005705215 0.0001924911
PC73 PC74 PC75 PC76 PC77 PC78 PC79 PC80
1 0.0009102062 -0.0008839761 0.003878626 0.0005542666 -0.0004380038 0.0005507793 0.001078171 0.0005159429
2 0.0009102062 -0.0008839761 0.003878626 0.0005542666 -0.0004380038 0.0005507793 0.001078171 0.0005159429
3 0.0009102062 -0.0008839761 0.003878626 0.0005542666 -0.0004380038 0.0005507793 0.001078171 0.0005159429
4 0.0009102062 -0.0008839761 0.003878626 0.0005542666 -0.0004380038 0.0005507793 0.001078171 0.0005159429
5 0.0009102062 -0.0008839761 0.003878626 0.0005542666 -0.0004380038 0.0005507793 0.001078171 0.0005159429
6 0.0009102062 -0.0008839761 0.003878626 0.0005542666 -0.0004380038 0.0005507793 0.001078171 0.0005159429
PC81 PC82 PC83 PC84 PC85 PC86 PC87 PC88
1 -0.0004333185 0.00128074 0.0006903886 -0.001080251 0.001396304 -0.001135049 -0.0001929594 5.368472e-05
2 -0.0004333185 0.00128074 0.0006903886 -0.001080251 0.001396304 -0.001135049 -0.0001929594 5.368472e-05
3 -0.0004333185 0.00128074 0.0006903886 -0.001080251 0.001396304 -0.001135049 -0.0001929594 5.368472e-05
4 -0.0004333185 0.00128074 0.0006903886 -0.001080251 0.001396304 -0.001135049 -0.0001929594 5.368472e-05
5 -0.0004333185 0.00128074 0.0006903886 -0.001080251 0.001396304 -0.001135049 -0.0001929594 5.368472e-05
6 -0.0004333185 0.00128074 0.0006903886 -0.001080251 0.001396304 -0.001135049 -0.0001929594 5.368472e-05
PC89 PC90 PC91 PC92 PC93 PC94 PC95 PC96
1 -0.0008440112 0.001382843 0.0001964582 0.0004420025 3.101101e-05 0.0008610302 0.000197127 -0.000367249
2 -0.0008440112 0.001382843 0.0001964582 0.0004420025 3.101101e-05 0.0008610302 0.000197127 -0.000367249
3 -0.0008440112 0.001382843 0.0001964582 0.0004420025 3.101101e-05 0.0008610302 0.000197127 -0.000367249
4 -0.0008440112 0.001382843 0.0001964582 0.0004420025 3.101101e-05 0.0008610302 0.000197127 -0.000367249
5 -0.0008440112 0.001382843 0.0001964582 0.0004420025 3.101101e-05 0.0008610302 0.000197127 -0.000367249
6 -0.0008440112 0.001382843 0.0001964582 0.0004420025 3.101101e-05 0.0008610302 0.000197127 -0.000367249
PC97 PC98 PC99 PC100 PC101 PC102 PC103 PC104
1 0.00019808 -0.0008727388 0.001128832 -0.0003967602 -0.0007866373 0.0006909247 0.0003198017 0.0002666803
2 0.00019808 -0.0008727388 0.001128832 -0.0003967602 -0.0007866373 0.0006909247 0.0003198017 0.0002666803
3 0.00019808 -0.0008727388 0.001128832 -0.0003967602 -0.0007866373 0.0006909247 0.0003198017 0.0002666803
4 0.00019808 -0.0008727388 0.001128832 -0.0003967602 -0.0007866373 0.0006909247 0.0003198017 0.0002666803
5 0.00019808 -0.0008727388 0.001128832 -0.0003967602 -0.0007866373 0.0006909247 0.0003198017 0.0002666803
6 0.00019808 -0.0008727388 0.001128832 -0.0003967602 -0.0007866373 0.0006909247 0.0003198017 0.0002666803
PC105 PC106 PC107 PC108 PC109 PC110 PC111 PC112
1 -0.0005473746 0.0002778072 0.0009608736 0.0003987572 -0.0001615991 -0.0006887918 0.0001626152 -0.0004177167
2 -0.0005473746 0.0002778072 0.0009608736 0.0003987572 -0.0001615991 -0.0006887918 0.0001626152 -0.0004177167
3 -0.0005473746 0.0002778072 0.0009608736 0.0003987572 -0.0001615991 -0.0006887918 0.0001626152 -0.0004177167
4 -0.0005473746 0.0002778072 0.0009608736 0.0003987572 -0.0001615991 -0.0006887918 0.0001626152 -0.0004177167
5 -0.0005473746 0.0002778072 0.0009608736 0.0003987572 -0.0001615991 -0.0006887918 0.0001626152 -0.0004177167
6 -0.0005473746 0.0002778072 0.0009608736 0.0003987572 -0.0001615991 -0.0006887918 0.0001626152 -0.0004177167
PC113 PC114 PC115 PC116 PC117 PC118 PC119 PC120
1 1.899141e-05 -0.0008511757 0.06010329 -0.002962319 0.016043795 -8.409942e-03 0.0037907507 0.000645923
2 1.899141e-05 -0.0008511757 -0.20640265 0.020078124 -0.015956337 -1.380522e-03 0.0065767892 -0.005709369
3 1.899141e-05 -0.0008511757 -0.07341287 0.015173034 -0.005053505 -3.825393e-04 0.0034825175 0.007271212
4 1.899141e-05 -0.0008511757 -0.20753487 0.022067260 -0.006439269 -2.867193e-03 0.0051834304 0.008300492
5 1.899141e-05 -0.0008511757 0.02580746 -0.009882745 0.006425397 -7.474454e-03 0.0025719783 -0.002264049
6 1.899141e-05 -0.0008511757 0.02770028 0.001502385 0.003978823 1.216774e-05 -0.0002001145 -0.001348026
PC121 PC122 PC123 PC124 PC125 PC126 PC127 PC128
1 0.0090693245 0.0028720788 -0.009482343 0.0020125342 0.003163279 7.754879e-03 -0.001428513 -0.013186625
2 -0.0471299972 -0.0008244673 -0.002573394 0.0042153069 0.019226739 9.021912e-05 0.016527766 0.038475110
3 -0.0174921198 0.0027971131 -0.006183392 0.0056262818 0.002260204 -7.543715e-04 0.009455457 0.018688058
4 -0.0383829844 -0.0059104017 -0.004231240 0.0070146526 0.022775278 7.154643e-03 0.020359758 0.039121884
5 -0.0009289484 -0.0047524705 -0.003141514 -0.0020352140 0.006428257 6.725003e-03 -0.003371201 -0.003652185
6 0.0049073681 0.0005776272 -0.006805929 0.0009515351 -0.005996603 5.281924e-03 -0.002810729 -0.004653305
PC129 PC130 PC131 PC132 PC133 PC134 PC135 PC136
1 0.0006466979 -7.079282e-05 -4.975959e-03 0.0001613765 -0.001505083 -0.020088846 -0.003996740 0.01430148
2 -0.0185976919 4.344553e-03 2.780033e-02 0.0101618662 0.017873468 0.008430215 -0.017623681 -0.01308025
3 0.0139939392 6.666773e-03 -6.742321e-05 -0.0034306545 0.022551929 -0.020373142 0.009075277 0.02991347
4 0.0071775061 -9.405859e-04 2.100842e-02 0.0449713216 0.047790991 -0.038150897 0.003659293 0.03408021
5 -0.0128980913 -3.651087e-02 1.142903e-02 0.0383076302 -0.056153130 0.070196385 -0.038312636 -0.06343444
6 0.0163624973 -9.080050e-03 3.849744e-02 0.0041563346 0.047051444 -0.071112250 0.791279027 -0.58917909
PC137 PC138 PC139
1 0.005103517 0.0107920497 -0.005259204
2 0.013994051 -0.0083018677 -0.003754833
3 -0.014973659 0.0002958576 -0.005487010
4 -0.066239694 -0.0059026199 -0.012544125
5 0.064759595 -0.0860226940 0.002026684
6 -0.089437226 -0.0002017451 0.036715505
top_kos <- head(top_loadings)%>%pull(KEGG)
ko_pathways <- lapply(top_kos, function(k) {
info <- keggGet(paste0("ko:", k))[[1]]
pathways <- info$PATHWAY
if(is.null(pathways)) pathways <- NA
return(pathways)
})
names(ko_pathways) <- top_kos
ko_pathways$K00079
map00590 map00790
"Arachidonic acid metabolism" "Folate biosynthesis"
map00980 map01100
"Metabolism of xenobiotics by cytochrome P450" "Metabolic pathways"
map05204 map05208
"Chemical carcinogenesis - DNA adducts" "Chemical carcinogenesis - reactive oxygen species"
$K00036
map00030 map00480
"Pentose phosphate pathway" "Glutathione metabolism"
map01100 map01110
"Metabolic pathways" "Biosynthesis of secondary metabolites"
map01120 map01200
"Microbial metabolism in diverse environments" "Carbon metabolism"
map05230 map05415
"Central carbon metabolism in cancer" "Diabetic cardiomyopathy"
$K00133
map00260 map00261
"Glycine, serine and threonine metabolism" "Monobactam biosynthesis"
map00270 map00300
"Cysteine and methionine metabolism" "Lysine biosynthesis"
map01100 map01110
"Metabolic pathways" "Biosynthesis of secondary metabolites"
map01120 map01210
"Microbial metabolism in diverse environments" "2-Oxocarboxylic acid metabolism"
map01230
"Biosynthesis of amino acids"
$K00215
map00261 map00300
"Monobactam biosynthesis" "Lysine biosynthesis"
map01100 map01110
"Metabolic pathways" "Biosynthesis of secondary metabolites"
map01120 map01230
"Microbial metabolism in diverse environments" "Biosynthesis of amino acids"
$K00243
[1] NA
$K00567
[1] NA
7.2.3 Testing the differences in KEGG presence/absence with PERMANOVA
kegg_counts_mat <- kegg_counts %>%
column_to_rownames("genome")
kegg_pa <- (kegg_counts_mat > 0) * 1
# remove zero-variance KOs
kegg_pa_nz <- kegg_pa[, colSums(kegg_pa) > 0 & colSums(kegg_pa) < nrow(kegg_pa)]
# Determine the common genomes
common_ids <- base::intersect(rownames(kegg_pa_nz), genome_metadata$ID)
# Report what will be kept/dropped
message("# common: ", length(common_ids))# common: 49
# in KEGG only: 90
# in metadata only: 78
# Subset to the intersection (and keep order identical)
kegg_pa_nz <- kegg_pa_nz[common_ids, , drop = FALSE]
meta <- genome_metadata %>%
dplyr::filter(ID %in% common_ids) %>%
dplyr::distinct(ID, .keep_all = TRUE) %>%
tibble::column_to_rownames("ID") %>%
.[common_ids, , drop = FALSE]
stopifnot(identical(rownames(meta), rownames(kegg_pa_nz)))
# Prepare variables for PERMANOVA
required_vars <- c("genome_size", "completeness", "host_type")
# Coerce types as needed
meta <- meta %>%
dplyr::mutate(
genome_size = as.numeric(genome_size),
completeness = as.numeric(completeness),
host_type = as.factor(host_type)
)
# Align on complete cases (adonis2 drops NAs otherwise)
ok <- stats::complete.cases(meta[, required_vars, drop = FALSE])
kegg_pa_nz <- kegg_pa_nz[ok, , drop = FALSE]
meta <- meta[ok, , drop = FALSE]
stopifnot(identical(rownames(meta), rownames(kegg_pa_nz)))
# Distance, dispersion, PERMANOVA
kegg_dist_pa <- vegan::vegdist(kegg_pa_nz, method = "jaccard", binary = TRUE)
disp_pa <- vegan::betadisper(kegg_dist_pa, meta$host_type)
anova(disp_pa)Analysis of Variance Table
Response: Distances
Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 0.000448 0.00044791 0.3263 0.5706
Residuals 46 0.063141 0.00137264
vegan::adonis2(
kegg_dist_pa ~ genome_size + completeness + host_type,
data = meta,
permutations = 999,
by = "margin"
)Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999
vegan::adonis2(formula = kegg_dist_pa ~ genome_size + completeness + host_type, data = meta, permutations = 999, by = "margin")
Df SumOfSqs R2 F Pr(>F)
genome_size 1 0.06534 0.05533 2.8009 0.001 ***
completeness 1 0.02762 0.02339 1.1841 0.238
host_type 1 0.05889 0.04988 2.5247 0.005 **
Residual 44 1.02639 0.86923
Total 47 1.18081 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pcoa_pa <- cmdscale(kegg_dist_pa, eig = TRUE, k = 2)
variance_explained <- pcoa_pa$eig / sum(pcoa_pa$eig)
pcoa_df <- data.frame(
ID = rownames(kegg_pa_nz),
PC1 = pcoa_pa$points[,1],
PC2 = pcoa_pa$points[,2],
host_type = meta$host_type
) %>% left_join(genome_metadata, by = "ID")
pcoa_kegg_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = host_type.x)) +
geom_point(size = 2) +
scale_color_manual(values = host_type_colors, name = "Host type")+
theme_minimal() +
labs(
title = "PCoA of KEGG presence/absence matrix across MAGs (colored by host type)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)
pcoa_kegg_papcoa_kegg_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = host_order)) +
geom_point(size = 2) +
scale_color_manual(values = host_order_colors, name = "Host order")+
theme_minimal() +
labs(
title = "PCoA of KEGG presence/absence matrix across MAGs (colored by host order)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)
pcoa_kegg_paggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG presence/absence matrix across MAGs (colored by completeness)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG presence/absence matrix across MAGs (colored by continent)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)library(patchwork)
p_main <- ggplot(pcoa_df, aes(PC1, PC2, color = host_type.x)) +
scale_color_manual(values = host_type_colors) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG annotations across MAGs",
x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
) +
theme(
legend.position = "right",
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
plot.margin = margin(5, 5, 0, 5)
)
cor_value <- cor(pcoa_df$PC1, pcoa_df$completeness, method = "spearman")
cor_pvalue <- cor.test(pcoa_df$PC1, pcoa_df$completeness,
method = "spearman")$p.valueWarning in cor.test.default(pcoa_df$PC1, pcoa_df$completeness, method = "spearman"): Cannot compute exact
p-value with ties
cor_text <- paste0("Spearman ρ = ", round(cor_value, 3),
"\np < ", format.pval(cor_pvalue, digits = 2))
p_completeness_annotated <- ggplot(pcoa_df, aes(x = PC1, y = completeness,
color = host_type.x)) +
scale_color_manual(values = host_type_colors) +
geom_point(alpha = 0.6, size = 2) +
geom_smooth(method = "lm", se = TRUE, color = "black", linewidth = 1) +
annotate("text",
x = min(pcoa_df$PC1) + 0.1 * diff(range(pcoa_df$PC1)),
y = max(pcoa_df$completeness) - 5,
label = cor_text,
hjust = 0,
size = 3, # Changed from 4 to 3 (smaller)
fontface = "plain") + # Changed from "bold" to "plain"
theme_minimal() +
labs(
y = "Completeness (%)",
x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)")
) +
theme(
legend.position = "none",
plot.margin = margin(0, 5, 5, 5),
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5) # Add border
)
p_main <- p_main +
theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5))
combined_annotated <- p_main / p_completeness_annotated +
plot_layout(heights = c(3, 1), guides = "collect")
print(combined_annotated)`geom_smooth()` using formula = 'y ~ x'
7.3 GIFTs
7.3.1 PCoA with Euclidean distances:
# Convert the GIFTs into a matrix of functional elements per genome (row)
gift_pcoa <- genome_gifts %>%
to.elements(., GIFT_db) %>%
as.data.frame() %>%
vegdist(method="euclidean") %>%
pcoa() #principal component analysis
#Extract eigenvalues (variance explained by first 10 axes)
gift_pcoa_rel_eigen <- gift_pcoa$values$Relative_eig[1:10]
# Get genome positions
gift_pcoa_vectors <- gift_pcoa$vectors %>% #extract vectors
as.data.frame() %>%
dplyr::select(Axis.1,Axis.2) # keep the first 2 axes
gift_pcoa_eigenvalues <- gift_pcoa$values$Eigenvalues[c(1,2)]
#For the black arrows: Functional group loadings
# covariance between each functional trait and pcoa axis scores
#scale with the eigenvectors
gift_pcoa_gifts <- cov(genome_gifts, scale(gift_pcoa_vectors)) %*%
diag((gift_pcoa_eigenvalues/(nrow(genome_gifts)-1))^(-0.5)) %>%
as.data.frame() %>%
dplyr::rename(Axis.1=1,Axis.2=2) %>%
rownames_to_column(var="label") %>%
#get function summary vectors
mutate(func=substr(label,1,3)) %>%
group_by(func) %>% #grouping by function
summarise(Axis.1=mean(Axis.1),
Axis.2=mean(Axis.2)) %>%
dplyr::rename(label=func) %>%
filter(!label %in% c("S01","S02","S03"))set.seed(101)
scale <- 20 # scale for vector loadings (to make arrows visible)
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
scale_color_manual(values=host_type_colors)+
geom_point(aes(x=Axis.1,y=Axis.2, color = host_type),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider increasing max.overlaps
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
geom_point(aes(x=Axis.1,y=Axis.2, color = completeness),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider increasing max.overlaps
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
geom_point(aes(x=Axis.1,y=Axis.2, color = continent),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider increasing max.overlaps
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
geom_point(aes(x=Axis.1,y=Axis.2, color = host_class),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider increasing max.overlaps
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
geom_point(aes(x=Axis.1,y=Axis.2, color = host_order),
alpha=0.9, shape=16) +
scale_color_manual(values = host_order_colors)+
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider increasing max.overlaps
centroids <- gift_pcoa_vectors %>%
rownames_to_column(var = "ID") %>%
left_join(genome_metadata, by = "ID") %>%
group_by(host_type) %>%
summarise(
Axis.1 = mean(Axis.1, na.rm = TRUE),
Axis.2 = mean(Axis.2, na.rm = TRUE),
.groups = "drop"
)
gift_pcoa_vectors %>%
rownames_to_column(var = "ID") %>%
left_join(genome_metadata, by = "ID") %>%
ggplot(aes(x = Axis.1, y = Axis.2, color = host_type)) +
geom_point(size = 3, alpha = 0.9) +
scale_color_manual(values = host_type_colors) +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1] * 100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2] * 100, 2), " %)")
) +
coord_cartesian(xlim = c(-0.6, 1), ylim = c(-1, 1)) +
theme_minimal(base_size = 16) +
theme(
axis.title = element_text(size = 14, face = "bold"),
axis.text = element_text(size = 14),
legend.title = element_text(size = 14),
legend.text = element_text(size = 14),
panel.grid.minor = element_blank()
)Using k-means to cluster the groups and check which MAGs cluster together:
coords <- gift_pcoa_vectors %>%
rownames_to_column("MAG") %>%
as_tibble()
set.seed(123)
km <- kmeans(coords[, c("Axis.1", "Axis.2")], centers = 2, nstart = 25, iter.max = 100)
coords <- coords %>% mutate(cluster = factor(km$cluster))# centroids as a tibble for plotting
centroids <- as_tibble(km$centers) %>% mutate(cluster = factor(1:nrow(km$centers)))
ggplot(coords, aes(x = Axis.1, y = Axis.2, color = cluster)) +
geom_point(size = 2, alpha = 0.8) +
geom_point(data = centroids, aes(x = Axis.1, y = Axis.2, color = cluster),
shape = 4, size = 5, stroke = 1.25) + # X marks centroids
theme_minimal() +
labs(title = paste0("PCoA colored by kmeans (k=2)"),
color = "cluster")$`1`
[1] "EHM066683" "EHM045744" "GCF_027676165.1" "GCA_024461935.1" "GCA_948850205.1" "GCA_015667075.1"
[7] "EHM057831" "GCA_015551225.1" "GCF_027676135.1" "GCA_039060945.1" "GCF_027672785.1" "GCF_050408585.1"
[13] "GCF_039061015.1" "GCF_027673305.1" "GCF_015667075.1" "GCF_027673155.1" "GCA_949462505.1" "GCF_027673085.1"
[19] "GCA_949470665.1" "GCF_027672695.1" "EHM066565" "GCA_948496745.1" "EHM079037" "GCF_027679985.1"
[25] "EHM019886" "GCA_949840595.1" "GCF_015551225.1" "GCF_039060945.1" "GCA_948619675.1" "GCA_050408585.1"
[31] "GCA_948514765.1" "GCF_027672665.1" "GCF_027672965.1" "GCF_024461915.1" "GCF_902364565.1" "GCF_003472365.1"
[37] "GCA_039061015.1" "GCA_024461915.1" "GCA_003472365.1" "GCA_902364565.1" "GCF_027672865.1" "EHM071389"
[43] "GCF_027673485.1" "GCF_027673225.1" "GCF_024461935.1" "EHM028422"
$`2`
[1] "EHM042508" "GCF_030545485.1" "EHM013277" "GCA_948644855.1" "GCF_014830215.1" "GCA_949460315.1"
[7] "GCA_948622745.1" "EHM045766" "GCA_958421465.1" "GCA_947347755.1" "GCF_027674325.1" "GCA_015667015.1"
[13] "EHM027656" "EHM041731" "GCA_018916825.1" "GCA_025128595.1" "GCA_948896515.1" "EHM056720"
[19] "GCF_025128595.1" "GCF_027674685.1" "EHM043079" "GCF_027672725.1" "EHM031001" "GCA_015666905.1"
[25] "GCF_027676035.1" "EHM046444" "EHM045845" "EHM056528" "EHM031663" "GCA_937924215.1"
[31] "EHM028346" "GCA_948672875.1" "EHM025679" "GCF_014334715.1" "GCF_009497135.1" "GCA_001672265.1"
[37] "GCF_021961665.1" "EHM041306" "GCA_020782925.1" "GCF_020782925.1" "GCA_018381915.1" "GCF_027674405.1"
[43] "GCA_020553675.1" "GCF_030480465.1" "GCA_030545485.1" "GCA_949028585.1" "GCF_001672265.1" "GCA_040516025.1"
[49] "GCF_015666905.1" "EHM031160" "GCF_027673065.1" "GCA_014334715.1" "EHM031128" "GCA_014830215.1"
[55] "GCA_948654415.1" "EHM034137" "GCA_030480465.1" "EHM040253" "GCF_015552555.1" "GCA_948544615.1"
[61] "GCA_948700895.1" "GCA_949428005.1" "EHM046343" "EHM057270" "GCA_949439745.1" "GCF_018916825.1"
[67] "GCA_948674145.1" "GCA_949006045.1" "GCF_027672645.1" "GCF_932750895.1" "GCA_009497135.1" "GCA_949444485.1"
[73] "GCF_040516025.1" "GCA_905210045.1" "EHM044953" "GCF_020553675.1" "GCF_027688475.1" "GCF_027661125.1"
[79] "GCF_015667015.1" "EHM056391" "GCA_948748415.1" "GCA_015552555.1" "EHM025059" "GCA_022794425.1"
[85] "GCA_948995465.1" "EHM016594" "EHM020035" "GCA_948741555.1" "GCF_027674205.1" "EHM015868"
[91] "EHM017294" "GCA_948777995.1" "EHM028354" "EHM028200"
7.3.2 Checking each cluster at once
# A tibble: 13 × 45
ID source species gtdb_taxonomy host_species host_order host_class isolation_source host host_summary
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <lgl>
1 EHM019886 EHI s__Lac… <NA> Plecotus Au… Chiroptera Mammalia host-associated <NA> NA
2 EHM028422 EHI s__Lac… <NA> Plecotus Au… Chiroptera Mammalia host-associated <NA> NA
3 EHM045744 EHI s__Lac… <NA> Plecotus Au… Chiroptera Mammalia host-associated <NA> NA
4 EHM057831 EHI s__Lac… <NA> Myotis Bech… Chiroptera Mammalia host-associated <NA> NA
5 EHM066565 EHI s__Lac… <NA> Plecotus Au… Chiroptera Mammalia host-associated <NA> NA
6 EHM066683 EHI s__Lac… <NA> Plecotus Au… Chiroptera Mammalia host-associated <NA> NA
7 EHM071389 EHI s__Lac… <NA> Podarcis Fi… Squamata Reptilia host-associated <NA> NA
8 EHM079037 EHI s__Lac… <NA> Myotis Bech… Chiroptera Mammalia host-associated <NA> NA
9 GCF_0155… NCBI <NA> <NA> Homo sapiens Primates Mammalia stool Homo… NA
10 GCF_0156… NCBI <NA> <NA> Homo sapiens Primates Mammalia stool Homo… NA
11 GCF_0390… NCBI <NA> <NA> Homo sapiens Primates Mammalia feces Homo… NA
12 GCF_0390… NCBI <NA> <NA> Homo sapiens Primates Mammalia feces Homo… NA
13 GCF_0504… NCBI <NA> <NA> Actinoptery… <NA> Actinopte… intestine fish NA
# ℹ 35 more variables: country <chr>, locality <chr>, completeness <dbl>, contamination <dbl>,
# genome_size <dbl>, GC <dbl>, N50 <dbl>, contigs <dbl>, collection_date <chr>, ncbi_biosample <chr>,
# mag_name <chr>, eha_number <chr>, gtdb_representative <lgl>, mimag_high_quality <lgl>,
# mimag_medium_quality <lgl>, common_name <chr>, sample_name <chr>, external_id <chr>, submitter_id <chr>,
# env_broad <chr>, env_medium <chr>, env_local <chr>, env_local_context <chr>, host_status <lgl>,
# disease <lgl>, diagnosis <lgl>, latitude <dbl>, longitude <dbl>, accession <chr>, host_disease <lgl>,
# host_age <lgl>, host_type <fct>, country_simple <chr>, country_normalized <chr>, continent <chr>
# A tibble: 37 × 45
ID source species gtdb_taxonomy host_species host_order host_class isolation_source host host_summary
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <lgl>
1 EHM013277 EHI s__Lac… <NA> Podarcis Mu… Squamata Reptilia host-associated <NA> NA
2 EHM015868 EHI s__Lac… <NA> Podarcis Mu… Squamata Reptilia host-associated <NA> NA
3 EHM016594 EHI s__Lac… <NA> Hipposidero… Chiroptera Mammalia host-associated <NA> NA
4 EHM017294 EHI s__Lac… <NA> Rhinolophus… Chiroptera Mammalia host-associated <NA> NA
5 EHM020035 EHI s__Lac… <NA> Plecotus Au… Chiroptera Mammalia host-associated <NA> NA
6 EHM025059 EHI s__Lac… <NA> Podarcis Mu… Squamata Reptilia host-associated <NA> NA
7 EHM025679 EHI s__Lac… <NA> Podarcis Mu… Squamata Reptilia host-associated <NA> NA
8 EHM027656 EHI s__Lac… <NA> Cathartes A… Accipitri… Aves host-associated <NA> NA
9 EHM028200 EHI s__Lac… <NA> Cathartes A… Accipitri… Aves host-associated <NA> NA
10 EHM028346 EHI s__Lac… <NA> Plecotus Au… Chiroptera Mammalia host-associated <NA> NA
# ℹ 27 more rows
# ℹ 35 more variables: country <chr>, locality <chr>, completeness <dbl>, contamination <dbl>,
# genome_size <dbl>, GC <dbl>, N50 <dbl>, contigs <dbl>, collection_date <chr>, ncbi_biosample <chr>,
# mag_name <chr>, eha_number <chr>, gtdb_representative <lgl>, mimag_high_quality <lgl>,
# mimag_medium_quality <lgl>, common_name <chr>, sample_name <chr>, external_id <chr>, submitter_id <chr>,
# env_broad <chr>, env_medium <chr>, env_local <chr>, env_local_context <chr>, host_status <lgl>,
# disease <lgl>, diagnosis <lgl>, latitude <dbl>, longitude <dbl>, accession <chr>, host_disease <lgl>, …
genome_metadata <- genome_metadata %>%
dplyr::mutate(
ID_clean = ID %>%
basename() %>%
# remove file extensions
sub("\\.(fna|fa|fasta)(\\.gz)?$", "", ., ignore.case = TRUE) %>%
# remove GB_ or RS_ at the beginning
sub("^(GB_|RS_)", "", ., ignore.case = TRUE) %>%
# extract the real GCA/GCF accession (robust)
sub(".*\\b(GC[AF]_[0-9]+\\.[0-9]+).*", "\\1", ., perl = TRUE)
)metadata_with_cluster <- genome_metadata %>%
left_join(coords %>% dplyr::select(MAG, cluster), by = c("ID_clean" = "MAG"))
metadata_with_cluster %>%
group_by(cluster, host_type) %>%
summarise(n = n(), .groups = "drop") %>%
arrange(cluster, desc(n))# A tibble: 7 × 3
cluster host_type n
<fct> <fct> <int>
1 1 human 21
2 1 animal 16
3 2 animal 58
4 2 human 20
5 2 <NA> 1
6 <NA> animal 10
7 <NA> human 1
# Group summaries
metadata_with_cluster %>%
group_by(cluster) %>%
summarise(mean_genome_size = mean(genome_size, na.rm = TRUE),
median_contamination = median(contamination, na.rm = TRUE),
.groups = "drop")# A tibble: 3 × 3
cluster mean_genome_size median_contamination
<fct> <dbl> <dbl>
1 1 2392085. 0.52
2 2 2415231. 0.35
3 <NA> 2367492. 0.52
# save(ehi_mags,
# phylum_colors,
# genome_annotations,
# genome_gifts,
# contig_to_genome,
# gtdb_metadata,
# ehi_metadata,
# master_index,
# genome_metadata,
# host_type_colors,
# getphylo_tree,
# metadata_with_cluster,
# file = "data/data.Rdata")Is the genome size different between clusters?
Kruskal-Wallis rank sum test
data: genome_size by cluster
Kruskal-Wallis chi-squared = 0.079169, df = 1, p-value = 0.7784
pairwise.wilcox.test(
x = metadata_with_cluster$genome_size,
g = metadata_with_cluster$cluster,
p.adjust.method = "BH" # FDR correction
)
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: metadata_with_cluster$genome_size and metadata_with_cluster$cluster
1
2 0.78
P value adjustment method: BH
Plots
ggplot(metadata_with_cluster, aes(x = genome_size, y = GC, col = cluster))+
geom_point()+
theme_bw()ggplot(metadata_with_cluster, aes(x = cluster, y = genome_size, col = cluster))+
geom_point()+
theme_bw()count_table <- metadata_with_cluster %>%
filter(!is.na(cluster))%>%
dplyr::count(cluster, host_type) %>%
pivot_wider(
names_from = host_type,
values_from = n,
values_fill = 0
)
count_table# A tibble: 2 × 4
cluster animal human `NA`
<fct> <int> <int> <int>
1 1 16 21 0
2 2 58 20 1
mat <- metadata_with_cluster %>%
filter(!is.na(cluster))%>%
dplyr::count(cluster, host_type) %>%
pivot_wider(names_from = host_type, values_from = n, values_fill = 0) %>%
column_to_rownames("cluster") %>%
as.matrix()
chisq.test(mat)$expectedWarning in stats::chisq.test(x, y, ...): Chi-squared approximation may be incorrect
animal human NA
1 23.60345 13.07759 0.3189655
2 50.39655 27.92241 0.6810345
Fisher's Exact Test for Count Data
data: mat
p-value = 0.002253
alternative hypothesis: two.sided
#Aggregate bundle-level GIFTs into the compound level
GIFTs_elements <- to.elements(genome_gifts,GIFT_db)
#Aggregate element-level GIFTs into the function level
GIFTs_functions <- to.functions(GIFTs_elements,GIFT_db)
#Aggregate function-level GIFTs into overall Biosynthesis, Degradation and Structural GIFTs
GIFTs_domains <- to.domains(GIFTs_functions,GIFT_db)7.3.3 GIFT community plots
GIFTs_elements %>%
as.data.frame() %>%
rownames_to_column(var="MAG")%>%
pivot_longer(!MAG,names_to="trait",values_to="gift") %>%
left_join(metadata_with_cluster, by = join_by(MAG == ID)) %>%
mutate(functionid = substr(trait, 1, 3)) %>%
mutate(trait = case_when(
trait %in% GIFT_db$Code_element ~ GIFT_db$Element[match(trait, GIFT_db$Code_element)],
TRUE ~ trait
)) %>%
mutate(functionid = case_when(
functionid %in% GIFT_db$Code_function ~ GIFT_db$Function[match(functionid, GIFT_db$Code_function)],
TRUE ~ functionid
)) %>%
mutate(trait=factor(trait,levels=unique(GIFT_db$Element))) %>%
mutate(functionid=factor(functionid,levels=unique(GIFT_db$Function))) %>%
ggplot(aes(x=MAG,y=trait,fill=gift)) +
geom_tile(colour="white", linewidth=0.2)+
scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
facet_grid(functionid ~ cluster, scales="free",space="free") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
axis.text.y = element_text(size=6),
strip.text.y = element_text(angle = 0)
) +
labs(y="Traits",x="Samples",fill="GIFT")p <- GIFTs_elements %>%
as.data.frame() %>%
rownames_to_column(var="MAG")%>%
pivot_longer(!MAG,names_to="trait",values_to="gift") %>%
left_join(metadata_with_cluster, by = join_by(MAG == ID)) %>%
mutate(functionid = substr(trait, 1, 3)) %>%
mutate(trait = case_when(
trait %in% GIFT_db$Code_element ~ GIFT_db$Element[match(trait, GIFT_db$Code_element)],
TRUE ~ trait
)) %>%
mutate(functionid = case_when(
functionid %in% GIFT_db$Code_function ~ GIFT_db$Function[match(functionid, GIFT_db$Code_function)],
TRUE ~ functionid
)) %>%
mutate(trait=factor(trait,levels=unique(GIFT_db$Element))) %>%
mutate(functionid=factor(functionid,levels=unique(GIFT_db$Function))) %>%
ggplot(aes(x=MAG,y=trait,fill=gift)) +
geom_tile(colour="white", linewidth=0.2)+
scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
facet_grid(functionid ~ host_type, scales="free",space="free") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
axis.text.y = element_text(size=6),
strip.text.y = element_text(angle = 0)
) +
labs(y="Traits",x="Samples",fill="GIFT")
pggplot2::ggsave(
filename = "plots/GIFT_heatmap_final.png",
plot = p,
width = 14,
height = 16,
units = "in",
dpi = 600,
bg = "white"
)Warning: package 'reshape2' was built under R version 4.4.3
Adjuntando el paquete: 'reshape2'
The following object is masked from 'package:tidyr':
smiths
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
filter(str_starts(Code_element, "D09")) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>%
# Ensure trait is a factor to maintain order
mutate(trait = factor(trait, levels = rev(unique(GIFT_db$Element)))) %>%
ggplot( aes(x = Genome, y = trait)) +
# Heatmap
geom_tile(aes(fill = GIFT), color = "white") +
scale_fill_gradientn(colours = brewer.pal(7, "YlGnBu"), name = "GIFT Score") +
# Start a new fill scale for the host_type bar
new_scale_fill() +
# Add the host_type bar at the very top or bottom
geom_tile(aes(y = -0.5, fill = host_type), height = 0.5) +
scale_fill_manual(values = host_type_colors) +
facet_grid(~ cluster, scales = "free_x", space = "free_x") +
theme_minimal(base_size = 8) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1121 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
7.3.4 Checking difference in GIFTs
7.3.4.1 Cluster-wise
Element GIFTs different between clusters of pcoa
# Get only the GIFT columns
gift_cols <- colnames(GIFTs_elements)[!(colnames(GIFTs_elements) %in% c("genome","cluster","host_type"))]
gift_dataframe <- GIFTs_elements %>%
as.data.frame() %>%
rownames_to_column(var = "ID")
gift_df_meta <- gift_dataframe %>%
left_join(metadata_with_cluster, by = "ID")
# Kruskal-Wallis for 4 groups
kruskal_results <- sapply(gift_cols, function(g) {
kruskal.test(as.formula(paste(g, "~ cluster")), data = gift_df_meta)$p.value
})
# Adjust for multiple testing
kruskal_results_adj <- p.adjust(kruskal_results, method = "BH")
# Combine into a table
kruskal_table <- data.frame(
GIFT = gift_cols,
p_value = kruskal_results,
p_adj = kruskal_results_adj
)
kruskal_table %>% filter(p_adj<0.05) GIFT p_value p_adj
B0601 B0601 5.011737e-04 9.021127e-03
D0908 D0908 2.559625e-12 9.214650e-11
pairwise_results <- lapply(gift_cols, function(g) {
pairwise.wilcox.test(
x = gift_df_meta[[g]],
g = gift_df_meta$cluster,
p.adjust.method = "BH"
)
})
names(pairwise_results) <- gift_cols
pairwise_sig_table <- lapply(names(pairwise_results), function(g) {
# Extract p-value matrix
pmat <- pairwise_results[[g]]$p.value
# Convert to long format
pmat_long <- as.data.frame(as.table(pmat)) %>%
filter(!is.na(Freq)) %>% # remove NA (diagonal / upper triangle)
dplyr::rename(group1 = Var1, group2 = Var2, p_adj = Freq) %>%
filter(p_adj < 0.05) %>%
mutate(GIFT = g)
return(pmat_long)
})
# Combine all GIFTs into one table
pairwise_sig_table <- bind_rows(pairwise_sig_table) %>%
dplyr::select(GIFT, group1, group2, p_adj)
pairwise_sig_table GIFT group1 group2 p_adj
1 B0601 2 1 5.470079e-04
2 B0704 2 1 2.246680e-02
3 D0507 2 1 1.442942e-02
4 D0908 2 1 2.839505e-12
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
filter(Code_element %in% pairwise_sig_table$GIFT) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
distinct(Genome, Code_element, .keep_all = TRUE) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>%
ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
new_scale_fill() +
# Add the host_type bar at the very top or bottom
geom_tile(aes(y = -0.5, fill = host_type), height = 0.3) +
scale_fill_manual(values = host_type_colors, name = "host_type") +
facet_grid(~ cluster, scales = "free_x", space = "free_x") +
theme(
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 4),
strip.text.x = element_text(face = "bold")
) +
labs(y = "Trait", x = "Genome", fill = "GIFT")Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1121 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
Code_bundle Code_element Code_function Domain Function Element
1 B060101 B0601 B06 Biosynthesis Organic anion biosynthesis Succinate
2 B060102 B0601 B06 Biosynthesis Organic anion biosynthesis Succinate
3 B060103 B0601 B06 Biosynthesis Organic anion biosynthesis Succinate
4 B060104 B0601 B06 Biosynthesis Organic anion biosynthesis Succinate
5 B060105 B0601 B06 Biosynthesis Organic anion biosynthesis Succinate
6 B060106 B0601 B06 Biosynthesis Organic anion biosynthesis Succinate
7 B070401 B0704 B07 Biosynthesis Vitamin biosynthesis Pantothenate (B5)
8 B070402 B0704 B07 Biosynthesis Vitamin biosynthesis Pantothenate (B5)
9 D050701 D0507 D05 Degradation Amino acid degradation Leucine
10 D090801 D0908 D09 Degradation Antibiotic degradation Macrolide
Definition
1 (K01647,K05942) (K01681,K01682) (K00031,K00030) ((((K00164+K00658),K01616)+K00382),(K00174+K00175)) ((K01902+K01903),(K01899+K01900),K18118) ((K00234+K00235+K00236+(K00237,K25801)),(K00239+K00240+K00241),(K00244+K00245+K00246)) (K01676,K01679,(K01677+K01678)) (K00026,K00025,K00024,K00116)
2 ((((K00164+K00658),K01616)+K00382),K00174) ((K01902+K01903),(K01899+K01900),K18118) ((K00234+K00235+K00236+(K00237,K25801)),(K00239+K00240+K00241),(K00244+K00245+K00246)) (K01676,K01679,(K01677+K01678)) (K00026,K00025,K00024,K00116)
3 K01580 (K13524,K07250,K00823,K16871) (K00135,K00139,K17761)
4 (K00169+K00170+K00171+K00172) K01007 K01595 K00024 (K01677+K01678) (K00239+K00240) (K01902+K01903) (K15038,K15017) K14465 (K14467,K18861) K14534 K15016 K00626
5 (K02160+K01961+K01962+K01963) K14468 K14469 K15052 K05606 (K01847,(K01848+K01849)) (K14471+K14472) (K00239+K00240+K00241) K01679 K08691 K14449 K14470 K09709
6 (K00169+K00170+K00171+K00172) (K01959+K01960) K00024 (K01677+K01678) (K18209+K18210) (K01902+K01903) (K00174+K00175+K00176+K00177)
7 ((K00826 K00606 K00077),K01579) (K01918,K13799)
8 ((K00606 K00077),(K13367 K00128)) K01918
9 K00826 (((K00166+K00167),K11381)+K09699+K00382) (K00253,K00249) (K01968+K01969) (K05607,K13766) K01640
10 K06979,K08217,K18230,K18231,K21251
library(rstatix)
library(dplyr)
library(purrr)
pairwise_sig_table <- map_df(gift_cols, function(g) {
# Check if there is variation in the GIFT values
# If all values are the same, skip this GIFT
if(length(unique(gift_df_meta[[g]])) < 2) return(NULL)
# Run the test safely
tryCatch({
results <- gift_df_meta %>%
wilcox_test(as.formula(paste0("`", g, "` ~ cluster")), p.adjust.method = "BH")
eff <- gift_df_meta %>%
wilcox_effsize(as.formula(paste0("`", g, "` ~ cluster")))
results %>%
left_join(eff, by = c("group1", "group2", ".y." = ".y.", "n1", "n2")) %>%
filter(p.adj < 0.05) %>%
mutate(GIFT = g)
}, error = function(e) return(NULL)) # If it still fails, just skip it
})
top_gifts <- pairwise_sig_table %>%
arrange(desc(effsize))
top_gifts# A tibble: 3 × 12
.y. group1 group2 n1 n2 statistic p p.adj p.adj.signif effsize magnitude GIFT
<chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr> <dbl> <ord> <chr>
1 B0601 1 2 13 37 166. 0.000547 0.000547 *** 0.492 moderate B0601
2 D0507 1 2 13 37 166 0.014 0.014 * 0.348 moderate D0507
3 B0704 1 2 13 37 192. 0.022 0.022 * 0.326 moderate B0704
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
filter(Code_element %in% pairwise_sig_table$GIFT) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
distinct(Genome, Code_element, .keep_all = TRUE) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>%
ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
facet_grid(~ cluster, scales = "free_x", space = "free_x") +
theme_grey(base_size = 8) +
theme(
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
strip.text.x = element_text(face = "bold")
) +
labs(y = "Trait", x = "Genome", fill = "GIFT")Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1121 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
7.3.4.2 host_type-wise
GIFTs that are different between EHI and GTDB
wilcox_results <- sapply(gift_cols, function(g) {
wilcox.test(as.formula(paste(g, "~ host_type")), data = gift_df_meta)$p.value
})
wilcox_results_adj <- p.adjust(wilcox_results, method = "BH")
data.frame(
GIFT = gift_cols,
p_value = wilcox_results,
p_adj = wilcox_results_adj
) GIFT p_value p_adj
B0101 B0101 0.553951606 0.7865086
B0102 B0102 0.454067596 0.7865086
B0103 B0103 NaN NaN
B0104 B0104 0.699118745 0.7865086
B0105 B0105 0.819762622 0.8679840
B0106 B0106 0.699118745 0.7865086
B0204 B0204 NaN NaN
B0205 B0205 0.699118745 0.7865086
B0206 B0206 NaN NaN
B0207 B0207 0.699118745 0.7865086
B0208 B0208 0.699118745 0.7865086
B0209 B0209 0.699118745 0.7865086
B0210 B0210 0.699118745 0.7865086
B0211 B0211 0.669187269 0.7865086
B0212 B0212 NaN NaN
B0213 B0213 NaN NaN
B0214 B0214 NaN NaN
B0215 B0215 0.315200758 0.7865086
B0216 B0216 NaN NaN
B0217 B0217 NaN NaN
B0218 B0218 NaN NaN
B0219 B0219 0.699118745 0.7865086
B0220 B0220 NaN NaN
B0221 B0221 NaN NaN
B0302 B0302 NaN NaN
B0303 B0303 NaN NaN
B0307 B0307 0.699118745 0.7865086
B0309 B0309 NaN NaN
B0310 B0310 NaN NaN
B0401 B0401 NaN NaN
B0402 B0402 NaN NaN
B0403 B0403 NaN NaN
B0601 B0601 0.377067975 0.7865086
B0602 B0602 NaN NaN
B0603 B0603 0.699118745 0.7865086
B0604 B0604 NaN NaN
B0605 B0605 NaN NaN
B0701 B0701 0.553951606 0.7865086
B0702 B0702 NaN NaN
B0703 B0703 NaN NaN
B0704 B0704 0.376743600 0.7865086
B0705 B0705 NaN NaN
B0706 B0706 NaN NaN
B0707 B0707 0.699118745 0.7865086
B0708 B0708 NaN NaN
B0709 B0709 NaN NaN
B0710 B0710 NaN NaN
B0711 B0711 NaN NaN
B0712 B0712 NaN NaN
B0801 B0801 NaN NaN
B0802 B0802 NaN NaN
B0803 B0803 NaN NaN
B0804 B0804 NaN NaN
B0805 B0805 NaN NaN
B0901 B0901 NaN NaN
B0902 B0902 NaN NaN
B0903 B0903 NaN NaN
B1004 B1004 NaN NaN
B1006 B1006 NaN NaN
B1008 B1008 NaN NaN
B1011 B1011 NaN NaN
B1012 B1012 NaN NaN
B1014 B1014 NaN NaN
B1021 B1021 NaN NaN
B1022 B1022 NaN NaN
B1024 B1024 NaN NaN
B1026 B1026 NaN NaN
B1028 B1028 0.466154044 0.7865086
B1029 B1029 NaN NaN
B1041 B1041 0.376743600 0.7865086
B1042 B1042 NaN NaN
D0101 D0101 0.699118745 0.7865086
D0102 D0102 0.699118745 0.7865086
D0103 D0103 NaN NaN
D0104 D0104 NaN NaN
D0201 D0201 NaN NaN
D0202 D0202 NaN NaN
D0203 D0203 NaN NaN
D0204 D0204 NaN NaN
D0205 D0205 NaN NaN
D0206 D0206 NaN NaN
D0207 D0207 NaN NaN
D0208 D0208 NaN NaN
D0209 D0209 NaN NaN
D0210 D0210 NaN NaN
D0211 D0211 NaN NaN
D0212 D0212 NaN NaN
D0213 D0213 NaN NaN
D0301 D0301 NaN NaN
D0302 D0302 NaN NaN
D0303 D0303 NaN NaN
D0304 D0304 NaN NaN
D0305 D0305 NaN NaN
D0306 D0306 NaN NaN
D0307 D0307 NaN NaN
D0308 D0308 NaN NaN
D0309 D0309 1.000000000 1.0000000
D0310 D0310 NaN NaN
D0401 D0401 NaN NaN
D0402 D0402 NaN NaN
D0403 D0403 NaN NaN
D0404 D0404 NaN NaN
D0405 D0405 NaN NaN
D0406 D0406 NaN NaN
D0407 D0407 NaN NaN
D0408 D0408 NaN NaN
D0501 D0501 NaN NaN
D0502 D0502 NaN NaN
D0503 D0503 NaN NaN
D0504 D0504 NaN NaN
D0505 D0505 NaN NaN
D0506 D0506 NaN NaN
D0507 D0507 0.733176720 0.7998291
D0508 D0508 0.006259432 0.2253395
D0509 D0509 0.699118745 0.7865086
D0510 D0510 NaN NaN
D0511 D0511 NaN NaN
D0512 D0512 NaN NaN
D0513 D0513 NaN NaN
D0516 D0516 NaN NaN
D0517 D0517 NaN NaN
D0518 D0518 NaN NaN
D0601 D0601 NaN NaN
D0602 D0602 NaN NaN
D0603 D0603 NaN NaN
D0604 D0604 NaN NaN
D0606 D0606 NaN NaN
D0607 D0607 NaN NaN
D0608 D0608 NaN NaN
D0609 D0609 NaN NaN
D0610 D0610 NaN NaN
D0611 D0611 NaN NaN
D0612 D0612 NaN NaN
D0613 D0613 NaN NaN
D0701 D0701 NaN NaN
D0702 D0702 NaN NaN
D0704 D0704 NaN NaN
D0705 D0705 NaN NaN
D0706 D0706 NaN NaN
D0708 D0708 NaN NaN
D0709 D0709 NaN NaN
D0801 D0801 NaN NaN
D0802 D0802 NaN NaN
D0804 D0804 0.027200351 0.4896063
D0805 D0805 NaN NaN
D0806 D0806 NaN NaN
D0807 D0807 0.393627743 0.7865086
D0808 D0808 NaN NaN
D0809 D0809 NaN NaN
D0810 D0810 NaN NaN
D0811 D0811 NaN NaN
D0812 D0812 NaN NaN
D0813 D0813 NaN NaN
D0814 D0814 NaN NaN
D0815 D0815 NaN NaN
D0816 D0816 0.061141985 0.5769099
D0817 D0817 NaN NaN
D0818 D0818 NaN NaN
D0819 D0819 NaN NaN
D0901 D0901 NaN NaN
D0902 D0902 NaN NaN
D0903 D0903 NaN NaN
D0904 D0904 NaN NaN
D0905 D0905 NaN NaN
D0906 D0906 0.553951606 0.7865086
D0907 D0907 NaN NaN
D0908 D0908 0.107578984 0.7745687
D0910 D0910 NaN NaN
D0911 D0911 NaN NaN
D0912 D0912 NaN NaN
S0101 S0101 NaN NaN
S0103 S0103 NaN NaN
S0104 S0104 NaN NaN
S0105 S0105 0.064101099 0.5769099
S0201 S0201 0.699118745 0.7865086
S0202 S0202 1.000000000 1.0000000
S0301 S0301 0.699118745 0.7865086
effect_size <- sapply(gift_cols, function(g) {
median(gift_df_meta[[g]][gift_df_meta$host_type=="human"]) -
median(gift_df_meta[[g]][gift_df_meta$host_type=="animal"])
})
wilcox_res_host_type <- data.frame(
GIFT = gift_cols,
p_value = wilcox_results,
p_adj = wilcox_results_adj,
effect = effect_size
)
wilcox_res_host_type %>% dplyr::select(GIFT, p_adj, effect) %>% filter(p_adj<0.05)[1] GIFT p_adj effect
<0 rows> (o 0- extensión row.names)
Code_bundle Code_element Code_function Domain Function Element
1 B021501 B0215 B02 Biosynthesis Amino acid biosynthesis Histidine
2 B070101 B0701 B07 Biosynthesis Vitamin biosynthesis Thiamine (B1)
3 B070102 B0701 B07 Biosynthesis Vitamin biosynthesis Thiamine (B1)
4 B070103 B0701 B07 Biosynthesis Vitamin biosynthesis Thiamine (B1)
5 B070104 B0701 B07 Biosynthesis Vitamin biosynthesis Thiamine (B1)
6 D050801 D0508 D05 Degradation Amino acid degradation Lysine
7 D050802 D0508 D05 Degradation Amino acid degradation Lysine
8 D050803 D0508 D05 Degradation Amino acid degradation Lysine
9 D050804 D0508 D05 Degradation Amino acid degradation Lysine
10 D050805 D0508 D05 Degradation Amino acid degradation Lysine
11 D050806 D0508 D05 Degradation Amino acid degradation Lysine
Definition
1 K00765 ((K01523 K01496),K11755,K14152) (K01814,K24017) ((K02501+K02500),K01663) ((K01693 K00817 (K04486,K05602,K18649)),(K01089 K00817)) (K00013,K14152)
2 (((K03148+K03154) K03151),(K03150 K03149)) K03147 ((K00941 K00788),K14153,K21219) K00946
3 (((K03148+K03154) K03151),(K03153 K03149 K10810)) K03147 K00941 K00788 K00946
4 (K22699,K03147) ((K00941 (K00788,K21220)),K21219) K00946
5 (K00941 K00788),K14153,K21219
6 4.1.1.18 2.6.1.82 1.2.1.19 2.6.1.48 1.2.1.20 1.14.11.64 1.1.5.13
7 1.13.12.2 3.5.1.30 1.6.1.48 1.2.1.20 2.8.3.13
8 2.6.1.36 1.2.1.31
9 4.1.1.18 2.6.1.82 1.2.1.19 2.6.1.48 1.2.1.20 2.8.3.13
10 K01582 K09251 K00137 K07250 K00135 K15737 K15736
11 K00468 K01506 (K14268,K07250) K00135 ((K15737 K15736),(K01041 K00252 (K01692,K01825,K01782) (K01825,K01782) K00626))
wilcox_res_host_type <- map_df(gift_cols, function(g) {
if(length(unique(gift_df_meta[[g]])) < 2) return(NULL)
tryCatch({
# Calculate p-value
res <- gift_df_meta %>%
wilcox_test(as.formula(paste0("`", g, "` ~ host_type")))
# Calculate effect size (Rank-Biserial Correlation)
eff <- gift_df_meta %>%
wilcox_effsize(as.formula(paste0("`", g, "` ~ host_type")))
# Combine results
res %>%
left_join(eff, by = c("group1", "group2", ".y." = ".y.", "n1", "n2")) %>%
mutate(GIFT = g)
}, error = function(e) return(NULL))
})
# Adjust P-values and filter
wilcox_res_host_type <- wilcox_res_host_type %>%
mutate(p_adj = p.adjust(p, method = "BH")) %>%
dplyr::select(GIFT, group1, group2, p_adj, effsize, magnitude) %>%
filter(p_adj < 0.05) %>%
arrange(desc(abs(effsize))) # Sort by strongest effect
# View the top results
print(wilcox_res_host_type)# A tibble: 0 × 6
# ℹ 6 variables: GIFT <chr>, group1 <chr>, group2 <chr>, p_adj <dbl>, effsize <dbl>, magnitude <ord>
host_class_colors <- c(
"Mammalia" = "#8B1E3F",
"Reptilia" = "#3A7D44",
"Aves" = "#E1B12C"
)
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
filter(Code_element %in% wilcox_res_host_type$GIFT) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
distinct(Genome, Code_element, .keep_all = TRUE) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>%
ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
new_scale_fill() +
# Add the host_type bar at the very top or bottom
geom_tile(aes(y = -0.5, fill = host_class), height = 0.5) +
facet_grid(~ host_type, scales = "free_x", space = "free_x") +
scale_fill_manual(values = host_class_colors, name = "Host class")
theme(
axis.text.x = element_blank()
) +
labs(y = "Trait", x = "Genome", fill = "GIFT")