Chapter 7 Functional Analysis

7.1 Functional overview

n_genes <- genome_annotations %>%
  group_by(genome) %>%
  summarize(n_genes = n())

head(n_genes)
# A tibble: 6 × 2
  genome    n_genes
  <chr>       <int>
1 EHM013277    2223
2 EHM015868    2322
3 EHM016594    2131
4 EHM017294    2516
5 EHM019886    2410
6 EHM020035    2296

7.1.0.1 Predicted genes

pred_genes <- genome_annotations %>%
  nrow()

cat(pred_genes)
335828

7.1.0.2 Number of annotated genes and percentages

#How many genes have at least 1 annotation
genome_annota <- genome_annotations %>%
  filter(if_any(c(kegg, pfam, cazy), ~ !is.na(.))) %>%
  nrow()

cat(genome_annota)
281627
#Percentage of predicted genes with at least 1 annotation
genome_annota*100/pred_genes
[1] 83.86049

7.1.0.3 Number of KEGG annotatated genes and percentages

# KEGG annotation
kegg_annota <- genome_annotations %>%
  filter(!is.na(kegg)) %>%
  nrow()
cat(kegg_annota)
235205
# KEGG annotation percentage
kegg_annota*100/genome_annota
[1] 83.5165
# AMR annotation
amr_annota <- genome_annotations %>%
  filter(resistance_type == "AMR") %>%
  nrow()
cat(amr_annota)
24377
# AMR annotation percentage
amr_annota*100/genome_annota
[1] 8.655775
# VF annotation
vf_annota <- genome_annotations %>%
  filter(!is.na(vf)) %>%
  nrow()
cat(vf_annota)
62759
# VF annotation percentage
vf_annota*100/genome_annota
[1] 22.28444
n_pred_genes <- genome_annotations %>%
  group_by(genome) %>%
  summarize(n_genes = n()) %>%
  left_join(
    genome_metadata %>% dplyr::select(ID, host_type),
    by = c("genome" = "ID")
  )

annotated_genes <- genome_annotations %>%
  filter(if_any(c(kegg, pfam, cazy), ~ !is.na(.))) %>%
  group_by(genome) %>%
  summarize(n_annotated_genes = n()) %>%
  left_join(
    genome_metadata %>% dplyr::select(ID, host_type),
    by = c("genome" = "ID")
  )
ggplot(n_pred_genes, aes(x= host_type, y = n_genes, fill = host_type))+
  scale_fill_manual(values = host_type_colors)+
  geom_boxplot()+ 
  geom_point()+
  theme_classic()+
  labs(y = "Gene Number", x = "Host Type")

ggplot(annotated_genes, aes(x= host_type, y = n_annotated_genes, fill = host_type))+
  scale_fill_manual(values = host_type_colors)+
  geom_boxplot()+ 
  geom_point()+
  theme_classic()+
  labs(y = "Annotated Gene Number", x = "Host Type")

wilcox.test(n_genes ~ host_type, data=n_pred_genes)  %>%
  tidy()
Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot compute exact p-value with ties
# A tibble: 1 × 4
  statistic p.value method                                            alternative
      <dbl>   <dbl> <chr>                                             <chr>      
1       111   0.156 Wilcoxon rank sum test with continuity correction two.sided  
wilcox.test(n_annotated_genes ~ host_type, data=annotated_genes)  %>%
  tidy()
Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot compute exact p-value with ties
# A tibble: 1 × 4
  statistic p.value method                                            alternative
      <dbl>   <dbl> <chr>                                             <chr>      
1        97  0.0720 Wilcoxon rank sum test with continuity correction two.sided  

7.2 KEGG

7.2.1 Nº of MAGs with KOs

#Proportion of MAGs belonging to each group per KEGG
#I want to know for example 5% of MAGs from anials have this KEGG, but 40 % of MAGs from humans have it

# KEGG presence/absence
kegg_presence <- genome_annotations %>%
  filter(!is.na(kegg)) %>%
  dplyr::select(genome, kegg) %>%
  distinct() 

#Add the host_type info
kegg_with_host_type <- kegg_presence %>%
  left_join(
    genome_metadata %>% dplyr::select(ID, host_type),
    by = c("genome" = "ID")
  )

# Count how many MAGs in each KEGG
kegg_mag_counts <- kegg_with_host_type %>%
  group_by(host_type, kegg) %>%
  summarise(
    n_mags = n(),
    .groups = "drop"
  )

#Count total MAGs per host_type (except the outlier)
total_mags_per_host_type <- genome_metadata %>%
  group_by(host_type) %>%
  summarise(
    total_mags = n_distinct(ID),
    .groups = "drop"
  )

#Calculate proportions of MAGs from each host_type in each KEGG
kegg_mag_proportions <- kegg_mag_counts %>%
  left_join(total_mags_per_host_type, by = "host_type") %>%
  mutate(
    proportion = n_mags / total_mags,
    absent = total_mags - n_mags
  )

7.2.1.1 Plotting KEGG MAG proportions

kegg_mag_proportions %>%
  filter(n_mags >= 20)
# A tibble: 2,692 × 6
   host_type kegg   n_mags total_mags proportion absent
   <chr>     <chr>   <int>      <int>      <dbl>  <int>
 1 animal    K00003     40         84      0.476     44
 2 animal    K00004     38         84      0.452     46
 3 animal    K00009     39         84      0.464     45
 4 animal    K00013     41         84      0.488     43
 5 animal    K00014     41         84      0.488     43
 6 animal    K00016     41         84      0.488     43
 7 animal    K00020     41         84      0.488     43
 8 animal    K00030     41         84      0.488     43
 9 animal    K00033     41         84      0.488     43
10 animal    K00036     41         84      0.488     43
# ℹ 2,682 more rows

7.2.1.2 Statistical testing of MAG proportions

fisher_results <- kegg_mag_proportions %>%
  dplyr::select(kegg, host_type, n_mags, absent) %>%
  pivot_wider(
    names_from = host_type,
    values_from = c(n_mags, absent),
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(
    p_value = fisher.test(
      matrix(
        c(n_mags_animal,  absent_animal,
          n_mags_human, absent_human),
        nrow = 2,
        byrow = TRUE
      )
    )$p.value
  ) %>%
  ungroup() %>%
  mutate(p_adj = p.adjust(p_value, method = "BH"))

fisher_results <- fisher_results %>%
  mutate(
    prop_animal  = n_mags_animal  / (n_mags_animal  + absent_animal),
    prop_human = n_mags_human / (n_mags_human + absent_human),
    diff_prop = prop_human - prop_animal
  )
volcano_df <- fisher_results %>%
  # Keep rows with non-missing p_adj and diff_prop
  filter(!is.na(p_adj), !is.na(diff_prop)) %>%
  # Avoid Inf y-values if p_adj == 0 by flooring at 1e-300 (or your choice)
  mutate(
    p_adj_capped = pmax(p_adj, 1e-300),
    nl10 = -log10(p_adj_capped),
    sig = p_adj < 0.05
  )


to_label <- volcano_df %>%
  filter(sig) %>%
  arrange(desc(nl10)) %>%
  dplyr::slice(1:10)

ggplot(volcano_df, aes(x = diff_prop, y = nl10)) +
  geom_point(aes(color = sig), alpha = 0.8, size = 2) +
  geom_text_repel(
    data = to_label,
    aes(label = kegg),        # or `Name`
    max.overlaps = 20,
    size = 3,
    segment.color = "grey70"
  ) +
  scale_color_manual(values = c(`TRUE` = "#1f77b4", `FALSE` = "grey70"), name ="p_adj < 0.05") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey50") +
  theme_minimal() +
  labs(
    x = "Effect size (diff_prop)",
    y = expression(-log[10](p[adj])),
    title = "Volcano plot: differential KEGG presence"
  )

# Ensure host_type has the intended order
genome_metadata <- genome_metadata %>%
  mutate(host_type = factor(host_type, levels = c("animal", "human")))

# 1) KEGG presence (unique genome–kegg)
kegg_presence <- genome_annotations %>%
  dplyr::select(genome, kegg) %>%
  distinct() %>%
  filter(!is.na(kegg))

# 2) Add host type
kegg_with_host <- kegg_presence %>%
  left_join(
    genome_metadata %>% dplyr::select(ID, host_type),
    by = c("genome" = "ID")
  )

# 3) Count present per (kegg, host_type)
kegg_counts <- kegg_with_host %>%
  group_by(kegg, host_type) %>%
  summarise(n_present = n_distinct(genome), .groups = "drop")

# 4) Totals per host_type
totals <- genome_metadata %>%
  group_by(host_type) %>%
  summarise(total_mags = n_distinct(ID), .groups = "drop")

# 5) Complete grid and compute absent correctly
kegg_counts_complete <- kegg_counts %>%
  tidyr::complete(kegg, host_type, fill = list(n_present = 0)) %>%
  left_join(totals, by = "host_type") %>%
  mutate(n_absent = total_mags - n_present)

# 6) Pivot to 2×2 format
wide <- kegg_counts_complete %>%
  dplyr::select(kegg, host_type, n_present, n_absent) %>%
  pivot_wider(
    id_cols = kegg,
    names_from = host_type,
    values_from = c(n_present, n_absent),
    values_fill = 0
  )

# Sanity: ensure expected columns exist (adjust names if your levels differ)
stopifnot(all(c("n_present_animal","n_absent_animal","n_present_human","n_absent_human") %in% names(wide)))

wide <- wide %>%
  mutate(
    n_animal = n_present_animal,
    a_animal = n_absent_animal,
    n_human  = n_present_human,
    a_human  = n_absent_human
  )

# 7) SAFE P-VALUE: robust computation avoiding FEXACT 40
safe_fisher_p <- function(n_animal, a_animal, n_human, a_human) {
    m <- matrix(c(n_animal, a_animal, n_human, a_human), nrow = 2, byrow = TRUE)

    # If any row or column sums to zero, Fisher is fine (returns 1) or undefined;
    # handle degenerate cases: if all zero or all present, return p=1
    if (any(rowSums(m) == 0) || any(colSums(m) == 0)) {
        return(1.0)
    }

    # 1) Try exact Fisher with large workspace
    p1 <- tryCatch(
        fisher.test(m, workspace = 2e8)$p.value,
        error = function(e) NA_real_
    )
    if (!is.na(p1)) return(p1)

    # 2) Fall back to simulated Fisher
    p2 <- tryCatch(
        fisher.test(m, simulate.p.value = TRUE, B = 1e6)$p.value,
        error = function(e) NA_real_
    )
    if (!is.na(p2)) return(p2)

    # 3) Last resort: chi-square with Yates correction
    p3 <- tryCatch(
        suppressWarnings(chisq.test(m, correct = TRUE)$p.value),
        error = function(e) NA_real_
    )
    if (!is.na(p3)) return(p3)

    # If everything fails, return NA
    return(NA_real_)
}

# 8) Compute stats rowwise with robust p-values
fisher_df <- wide %>%
  rowwise() %>%
  mutate(
    # p-value
    p_value = safe_fisher_p(n_animal, a_animal, n_human, a_human),

    # proportions and difference
    prop_animal = ifelse((n_animal + a_animal) > 0, n_animal / (n_animal + a_animal), NA_real_),
    prop_human  = ifelse((n_human  + a_human)  > 0, n_human  / (n_human  + a_human),  NA_real_),
    diff_prop   = prop_human - prop_animal,

    # Haldane–Anscombe for OR if any zero cell (effect size only)
    needs_ha    = (n_animal == 0L) | (a_animal == 0L) | (n_human == 0L) | (a_human == 0L),
    n_animal_ha = ifelse(needs_ha, n_animal + 0.5, n_animal),
    a_animal_ha = ifelse(needs_ha, a_animal + 0.5, a_animal),
    n_human_ha  = ifelse(needs_ha, n_human  + 0.5, n_human),
    a_human_ha  = ifelse(needs_ha, a_human  + 0.5, a_human),

    or      = (n_human_ha * a_animal_ha) / (a_human_ha * n_animal_ha),
    log2_or = log2(or),

    # Relative risk with light Laplace smoothing (fold-change of proportions)
    rr      = ((n_human + 0.5)  / (n_human + a_human + 1)) /
              ((n_animal + 0.5) / (n_animal + a_animal + 1)),
    log2_rr = log2(rr)
  ) %>%
  ungroup() %>%
  mutate(p_adj = p.adjust(p_value, method = "BH"))

# 9) Optional prevalence filter to stabilize estimates
fisher_df <- fisher_df %>%
  mutate(total_present = n_animal + n_human) %>%
  filter(
    total_present >= 5 |
      prop_animal >= 0.03 | prop_human >= 0.03
  )

# 10) Volcano (log2 OR)
volcano_df <- fisher_df %>%
  filter(!is.na(p_adj), !is.na(log2_or)) %>%
  mutate(
    p_adj_capped = pmax(p_adj, 1e-300),
    nl10 = -log10(p_adj_capped),
    sig = p_adj < 0.05
  )

to_label <- volcano_df %>%
  filter(sig) %>%
  arrange(desc(nl10)) %>%
  dplyr::slice(1:10)

ggplot(volcano_df, aes(x = log2_or, y = nl10)) +
  geom_point(aes(color = sig), alpha = 0.8, size = 2) +
  geom_text_repel(
    data = to_label,
    aes(label = kegg),
    max.overlaps = 20,
    size = 3,
    segment.color = "grey70"
  ) +
  scale_color_manual(values = c(`TRUE` = "#1f77b4", `FALSE` = "grey70"), name = "FDR < 0.05") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey50") +
  geom_vline(xintercept = c(-1, 1), linetype = "dotted", color = "grey60") + # OR ~0.5 and ~2
  theme_minimal() +
  labs(
    x = "Effect size (log2 odds ratio, human vs animal)",
    y = expression(-log[10](FDR)),
    title = "Volcano plot: differential KEGG presence (human vs animal)"
  )

# 11) Top 20 by significance then effect size
top20 <- fisher_df %>%
  arrange(p_adj, desc(abs(log2_or))) %>%
  dplyr::select(
    kegg,
    n_human, a_human, prop_human,
    n_animal, a_animal, prop_animal,
    diff_prop, log2_rr, log2_or, p_value, p_adj
  ) %>%
  dplyr::slice(1:20)

top20
# A tibble: 20 × 12
   kegg   n_human a_human prop_human n_animal a_animal prop_animal diff_prop log2_rr log2_or   p_value   p_adj
   <chr>    <int>   <int>      <dbl>    <int>    <int>       <dbl>     <dbl>   <dbl>   <dbl>     <dbl>   <dbl>
 1 K12268       0      42     0            19       65       0.226    -0.226   -4.30   -4.66 0.000323  0.00356
 2 K12270       0      42     0            19       65       0.226    -0.226   -4.30   -4.66 0.000323  0.00356
 3 K18967       1      41     0.0238       20       64       0.238    -0.214   -2.79   -3.68 0.00179   0.00356
 4 K20485       3      39     0.0714       33       51       0.393    -0.321   -2.28   -3.07 0.000119  0.00356
 5 K07114       4      38     0.0952       39       45       0.464    -0.369   -2.15   -3.04 0.0000222 0.00356
 6 K10242       3      39     0.0714       30       54       0.357    -0.286   -2.14   -2.85 0.000467  0.00356
 7 K13789       4      38     0.0952       34       50       0.405    -0.310   -1.96   -2.69 0.000363  0.00356
 8 K06075       5      37     0.119        39       45       0.464    -0.345   -1.86   -2.68 0.000127  0.00356
 9 K13730       4      38     0.0952       33       51       0.393    -0.298   -1.91   -2.62 0.000398  0.00356
10 K07001       5      37     0.119        38       46       0.452    -0.333   -1.82   -2.61 0.000145  0.00356
11 K01235       4      38     0.0952       32       52       0.381    -0.286   -1.87   -2.55 0.000712  0.00356
12 K08217       4      38     0.0952       32       52       0.381    -0.286   -1.87   -2.55 0.000712  0.00356
13 K15531       4      38     0.0952       32       52       0.381    -0.286   -1.87   -2.55 0.000712  0.00356
14 K01854       5      37     0.119        36       48       0.429    -0.310   -1.75   -2.47 0.000502  0.00356
15 K09155       5      37     0.119        36       48       0.429    -0.310   -1.75   -2.47 0.000502  0.00356
16 K16028       5      37     0.119        36       48       0.429    -0.310   -1.75   -2.47 0.000502  0.00356
17 K07005       6      36     0.143        40       44       0.476    -0.333   -1.66   -2.45 0.000196  0.00356
18 K21739       6      36     0.143        40       44       0.476    -0.333   -1.66   -2.45 0.000196  0.00356
19 K22980       6      36     0.143        39       45       0.464    -0.321   -1.62   -2.38 0.000351  0.00356
20 K20374       5      37     0.119        34       50       0.405    -0.286   -1.67   -2.33 0.000993  0.00356
library(KEGGREST)

# Function to get info from KEGG
get_kegg_info <- function(ko_id) {
  query <- keggGet(ko_id)[[1]]
  
  # Extract Name and Definition
  name <- ifelse(!is.null(query$NAME), query$NAME, "N/A")
  definition <- ifelse(!is.null(query$DEFINITION), query$DEFINITION, "N/A")
  
  # Extract Pathways (often multiple)
  pathways <- if(!is.null(query$PATHWAY)) {
    paste(names(query$PATHWAY), query$PATHWAY, sep = ": ", collapse = "; ")
  } else {
    "No pathway assigned"
  }
  
  return(c(KO = ko_id, Name = name, Definition = definition, Pathways = pathways))
}
ko_list <- lapply(top20$kegg, get_kegg_info)
ko_summary <- as.data.frame(do.call(rbind, ko_list))

print(ko_summary)
       KO                                                                           Name Definition
1  K12268                                               accessory secretory protein Asp1        N/A
2  K12270                                               accessory secretory protein Asp3        N/A
3  K18967                                              diguanylate cyclase [EC:2.7.7.65]        N/A
4  K20485                         ATP-binding cassette, subfamily B, bacterial NisT/SpaT        N/A
5  K07114                                          Ca-activated chloride channel homolog        N/A
6  K10242                                   cellobiose transport system permease protein        N/A
7  K13789    geranylgeranyl diphosphate synthase, type II [EC:2.5.1.1 2.5.1.10 2.5.1.29]        N/A
8  K06075 MarR family transcriptional regulator, transcriptional regulator for hemolysin        N/A
9  K13730                                                                   internalin A        N/A
10 K07001                                                             NTE family protein        N/A
11 K01235                                             alpha-glucuronidase [EC:3.2.1.139]        N/A
12 K08217                         MFS transporter, DHA3 family, macrolide efflux protein        N/A
13 K15531                           oligosaccharide reducing-end xylanase [EC:3.2.1.156]        N/A
14 K01854                                       UDP-galactopyranose mutase [EC:5.4.99.9]        N/A
15 K09155                                                        uncharacterized protein        N/A
16 K16028                                                            O-methyltransferase        N/A
17 K07005                                                        uncharacterized protein        N/A
18 K21739                          probable pyridine nucleotide-disulfide oxidoreductase        N/A
19 K22980                                      capsule synthesis positive regulator AcpB        N/A
20 K20374                  HTH-type transcriptional regulator, SHP3-responsive repressor        N/A
                                                                                                                                                           Pathways
1                                                                                                                                               No pathway assigned
2                                                                                                                                               No pathway assigned
3                                                                                                                                               No pathway assigned
4                                                                                                          map02020: Two-component system; map02024: Quorum sensing
5                                                                                                                                               No pathway assigned
6                                                                                                                                        map02010: ABC transporters
7                                          map00900: Terpenoid backbone biosynthesis; map01100: Metabolic pathways; map01110: Biosynthesis of secondary metabolites
8                                                                                                                                               No pathway assigned
9                                                                                                                  map05100: Bacterial invasion of epithelial cells
10                                                                                                                                              No pathway assigned
11                                                                                                                                              No pathway assigned
12                                                                                                                                              No pathway assigned
13                                                                                                                                              No pathway assigned
14 map00052: Galactose metabolism; map00520: Amino sugar and nucleotide sugar metabolism; map01100: Metabolic pathways; map01250: Biosynthesis of nucleotide sugars
15                                                                                                                                              No pathway assigned
16                                                                                     map01051: Biosynthesis of ansamycins; map01052: Type I polyketide structures
17                                                                                                                                              No pathway assigned
18                                                                                                                                              No pathway assigned
19                                                                                                                                              No pathway assigned
20                                                                                                                                         map02024: Quorum sensing
fish_results_names <- left_join(top20, ko_summary, join_by(kegg== KO))


heatmap_data <- fish_results_names %>%
  dplyr::select(Name, prop_animal, prop_human)%>%
  pivot_longer(!Name, names_to = "host_type", values_to= "proportion" ) %>%
   mutate(
    host_type = case_when(
      host_type == "prop_animal"  ~ "animal",
      host_type == "prop_human" ~ "human",
      TRUE ~ host_type
    )
  )
# 1) Reduce to one row per Name × host_type
heatmap_summary <- heatmap_data %>%
  group_by(Name, host_type) %>%
  summarise(
    proportion = mean(proportion, na.rm = TRUE),  # or median(), max(), first()
    .groups = "drop"
  )

# 2) Compute diff = animal - human in wide form
heatmap_wide <- heatmap_summary %>%
  filter(host_type %in% c("animal", "human")) %>%  # just in case there are other types
  tidyr::pivot_wider(
    names_from = host_type,
    values_from = proportion
  ) %>%
  mutate(
    diff = animal - human
  )

# 3) Reorder Name by diff and bring back to long for plotting
heatmap_plot <- heatmap_wide %>%
  mutate(Name = forcats::fct_reorder(Name, diff)) %>%
  tidyr::pivot_longer(
    cols = c(animal, human),
    names_to = "host_type",
    values_to = "proportion"
  )

# 4) Plot — heatmap
ggplot(heatmap_plot,
       aes(x = host_type, y = Name, fill = proportion)) +
  geom_tile() +
  geom_text(aes(label = scales::number(proportion, accuracy = 0.01)),
            size = 6) +
  scale_fill_viridis_c(
    name = "Proportion of MAGs",
    limits = c(0, 1)
  ) +
  theme_minimal() +
  labs(
    x = "host_type",
    y = "KEGG ortholog",
    title = "Differential KEGG presence between animal and human"
  )

table_data <- fish_results_names %>%
  transmute(
    `KEGG ortholog` = Name,
    `Proportion in animal`  = prop_animal,
    `Proportion in human` = prop_human,
    `Δ (human - animal)`  =  prop_human - prop_animal ,
    `Log2(Odds Ratio)`  =  log2_or ,
    `Fisher p-value`  = p_adj
  ) %>%
  arrange(desc(abs(`Δ (human - animal)`)))

table_data
# A tibble: 20 × 6
   `KEGG ortholog`          `Proportion in animal` `Proportion in human` `Δ (human - animal)` `Log2(Odds Ratio)`
   <chr>                                     <dbl>                 <dbl>                <dbl>              <dbl>
 1 Ca-activated chloride c…                  0.464                0.0952               -0.369              -3.04
 2 MarR family transcripti…                  0.464                0.119                -0.345              -2.68
 3 NTE family protein                        0.452                0.119                -0.333              -2.61
 4 uncharacterized protein                   0.476                0.143                -0.333              -2.45
 5 probable pyridine nucle…                  0.476                0.143                -0.333              -2.45
 6 capsule synthesis posit…                  0.464                0.143                -0.321              -2.38
 7 ATP-binding cassette, s…                  0.393                0.0714               -0.321              -3.07
 8 geranylgeranyl diphosph…                  0.405                0.0952               -0.310              -2.69
 9 UDP-galactopyranose mut…                  0.429                0.119                -0.310              -2.47
10 uncharacterized protein                   0.429                0.119                -0.310              -2.47
11 O-methyltransferase                       0.429                0.119                -0.310              -2.47
12 internalin A                              0.393                0.0952               -0.298              -2.62
13 cellobiose transport sy…                  0.357                0.0714               -0.286              -2.85
14 alpha-glucuronidase [EC…                  0.381                0.0952               -0.286              -2.55
15 MFS transporter, DHA3 f…                  0.381                0.0952               -0.286              -2.55
16 oligosaccharide reducin…                  0.381                0.0952               -0.286              -2.55
17 HTH-type transcriptiona…                  0.405                0.119                -0.286              -2.33
18 accessory secretory pro…                  0.226                0                    -0.226              -4.66
19 accessory secretory pro…                  0.226                0                    -0.226              -4.66
20 diguanylate cyclase [EC…                  0.238                0.0238               -0.214              -3.68
# ℹ 1 more variable: `Fisher p-value` <dbl>
library(gt)
Warning: package 'gt' was built under R version 4.4.3

Adjuntando el paquete: 'gt'
The following object is masked from 'package:ape':

    where
table_data %>%
  gt() %>%
  fmt_number(
    columns = c(`Proportion in animal`, `Proportion in human`, `Δ (human - animal)`, `Log2(Odds Ratio)` ),
    decimals = 2
  ) %>%
  fmt_scientific(
    columns = `Fisher p-value`,
    decimals = 2
  ) %>%
  tab_header(
    title = "KEGG orthologs differing between animal and human MAGs"
  ) %>%
  cols_align(
    align = "center",
    everything()
  )%>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels(everything())
  )
KEGG orthologs differing between animal and human MAGs
KEGG ortholog Proportion in animal Proportion in human Δ (human - animal) Log2(Odds Ratio) Fisher p-value
Ca-activated chloride channel homolog 0.46 0.10 −0.37 −3.04 3.56 × 10−3
MarR family transcriptional regulator, transcriptional regulator for hemolysin 0.46 0.12 −0.35 −2.68 3.56 × 10−3
NTE family protein 0.45 0.12 −0.33 −2.61 3.56 × 10−3
uncharacterized protein 0.48 0.14 −0.33 −2.45 3.56 × 10−3
probable pyridine nucleotide-disulfide oxidoreductase 0.48 0.14 −0.33 −2.45 3.56 × 10−3
capsule synthesis positive regulator AcpB 0.46 0.14 −0.32 −2.38 3.56 × 10−3
ATP-binding cassette, subfamily B, bacterial NisT/SpaT 0.39 0.07 −0.32 −3.07 3.56 × 10−3
geranylgeranyl diphosphate synthase, type II [EC:2.5.1.1 2.5.1.10 2.5.1.29] 0.40 0.10 −0.31 −2.69 3.56 × 10−3
UDP-galactopyranose mutase [EC:5.4.99.9] 0.43 0.12 −0.31 −2.47 3.56 × 10−3
uncharacterized protein 0.43 0.12 −0.31 −2.47 3.56 × 10−3
O-methyltransferase 0.43 0.12 −0.31 −2.47 3.56 × 10−3
internalin A 0.39 0.10 −0.30 −2.62 3.56 × 10−3
cellobiose transport system permease protein 0.36 0.07 −0.29 −2.85 3.56 × 10−3
alpha-glucuronidase [EC:3.2.1.139] 0.38 0.10 −0.29 −2.55 3.56 × 10−3
MFS transporter, DHA3 family, macrolide efflux protein 0.38 0.10 −0.29 −2.55 3.56 × 10−3
oligosaccharide reducing-end xylanase [EC:3.2.1.156] 0.38 0.10 −0.29 −2.55 3.56 × 10−3
HTH-type transcriptional regulator, SHP3-responsive repressor 0.40 0.12 −0.29 −2.33 3.56 × 10−3
accessory secretory protein Asp1 0.23 0.00 −0.23 −4.66 3.56 × 10−3
accessory secretory protein Asp3 0.23 0.00 −0.23 −4.66 3.56 × 10−3
diguanylate cyclase [EC:2.7.7.65] 0.24 0.02 −0.21 −3.68 3.56 × 10−3

7.2.2 Functional Ordination

PCA of KEGG annotations:

kegg_counts <- genome_annotations %>%
  filter(!is.na(kegg)) %>%          
  dplyr::count(genome, kegg) %>%           
  pivot_wider(
    names_from = kegg,
    values_from = n,
    values_fill = 0
  )

# Normalization
kegg_rel <- kegg_counts %>%
  column_to_rownames("genome")
kegg_rel <- kegg_rel / rowSums(kegg_rel)  # Each row sums to 1

# Remove zero variance
kegg_rel_nz <- kegg_rel[, apply(kegg_rel, 2, sd) > 0]

# PCA with scaling
pca <- prcomp(kegg_rel_nz, scale. = TRUE)

# Check variance explained
summary(pca)$importance[,1:5]
                            PC1      PC2      PC3      PC4      PC5
Standard deviation     19.10047 11.37402 8.655536 7.950973 7.575608
Proportion of Variance  0.16977  0.06020 0.034860 0.029420 0.026710
Cumulative Proportion   0.16977  0.22997 0.264830 0.294250 0.320950
country_palette <- c(
  # Southern Europe
  "Spain" = "#1B9E77",
  "Italy" = "#33A02C",
  "Greece" = "#66C2A5",
  "Portugal" = "#2CA25F",
  "Malta" = "#99D8C9",

  # Northern/Central Europe
  "Germany" = "#1F78B4",
  "United Kingdom" = "#4A90E2",
  "Ireland" = "#6BAED6",

  # East Asia 
  "Japan" = "#E31A1C",
  "South Korea" = "#FB6A4A",
  "China" = "#CB181D",

  # North America 
  "USA" = "#756BB1",
  "Canada" = "#9E9AC8",

  # Distinct
  "Australia" = "#FFD92F",   
  "Greenland" = "#A6CEE3",

  "none" = "grey70"
)
scores <- as.data.frame(pca$x)
scores$ID <- rownames(scores)

scores <- scores %>%
  left_join(genome_metadata, by = "ID")

ggplot(scores, aes(PC1, PC2, color = host_type, label = ID)) +
  geom_point(size = 2) +
   scale_color_manual(values = host_type_colors)+
  theme_minimal() +
  labs(
    title = "PCA of KEGG annotations across MAGs",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  ) + geom_text_repel()
Warning: ggrepel: 139 unlabeled data points (too many overlaps). Consider increasing max.overlaps

##PCA again without outlier
kegg_counts <- genome_annotations %>%
  filter(genome != "EHM071389") %>%
  filter(!is.na(kegg)) %>%          
  dplyr::count(genome, kegg) %>%           
  pivot_wider(
    names_from = kegg,
    values_from = n,
    values_fill = 0
  )

# Normalization
kegg_rel <- kegg_counts %>%
  column_to_rownames("genome")
kegg_rel <- kegg_rel / rowSums(kegg_rel)  # Each row sums to 1

# Remove zero variance
kegg_rel_nz <- kegg_rel[, apply(kegg_rel, 2, sd) > 0]

# PCA with scaling
pca <- prcomp(kegg_rel_nz, scale. = TRUE)

# Check variance explained
summary(pca)$importance[,1:5]
                            PC1      PC2      PC3      PC4      PC5
Standard deviation     19.54513 8.762091 8.060842 7.660639 7.307655
Proportion of Variance  0.18384 0.036950 0.031270 0.028240 0.025700
Cumulative Proportion   0.18384 0.220780 0.252050 0.280290 0.305990
scores_filtered <- scores %>% filter(ID != "EHM071389")

ggplot(scores_filtered, aes(PC1, PC2, color = host_type)) +
  geom_point(size = 2) +
   scale_color_manual(values = host_type_colors)+
  theme_minimal() +
  labs(
    title = "PCA of KEGG annotations across MAGs",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  ) 

ggplot(scores_filtered, aes(PC2, PC3, color = host_type)) +
   scale_color_manual(values = host_type_colors)+
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "P2 vs PC3 of KEGG annotations across MAGs",
    x = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)"),
    y = paste0("PC3 (", round(summary(pca)$importance[2,3] * 100, 1), "%)")
  )

ggplot(scores_filtered, aes(PC3, PC4, color = host_type)) +
   scale_color_manual(values = host_type_colors)+
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "P3 vs PC4 of KEGG annotations across MAGs",
    x = paste0("PC3 (", round(summary(pca)$importance[2,3] * 100, 1), "%)"),
    y = paste0("PC4 (", round(summary(pca)$importance[2,4] * 100, 1), "%)")
  )

ggplot(scores_filtered, aes(PC1, PC2, color = genome_size)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "KEGG PCA colored by genome size",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  )

ggplot(scores_filtered, aes(PC1, PC2, color = completeness)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "KEGG PCA colored by completeness",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  )

ggplot(scores_filtered, aes(PC1, PC2, color = host_order)) +
  scale_color_manual(values = host_order_colors)+
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "KEGG PCA colored by host order ",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  )

ggplot(scores_filtered, aes(PC1, PC2, color = country_normalized)) +
  geom_point(size = 2) +
  theme_minimal() +
  scale_color_manual(values = country_palette, na.value = "grey70")+
  labs(
    title = "KEGG PCA colored by country ",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  )

ggplot(scores_filtered, aes(PC1, PC2, color = continent)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "KEGG PCA colored by continent",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  )

loadings <- pca$rotation

abs_loadings <- apply(abs(loadings[, 1:2]), 1, sum)
top_combined <- sort(abs_loadings, decreasing = TRUE)[1:10]
top_combined
    K01811     K21712     K20485     K25670     K00390     K02049     K15269     K06123     K21713     K25089 
0.09747308 0.09689785 0.09352428 0.09181942 0.09112710 0.09036799 0.09021472 0.09020442 0.08818284 0.08645722 
library(KEGGREST)

top_loadings <- loadings %>%
  as.data.frame() %>%
  rownames_to_column("KEGG") %>%
  arrange(desc(abs(PC1))) 

head(top_loadings)
    KEGG         PC1          PC2        PC3         PC4          PC5          PC6          PC7          PC8
1 K00079 -0.05079409 -0.009946912 0.00248532 0.003276063 -0.001132451 -0.002808841 -0.003852092 -0.002341786
2 K00036 -0.05079409 -0.009946912 0.00248532 0.003276063 -0.001132451 -0.002808841 -0.003852092 -0.002341786
3 K00133 -0.05079409 -0.009946912 0.00248532 0.003276063 -0.001132451 -0.002808841 -0.003852092 -0.002341786
4 K00215 -0.05079409 -0.009946912 0.00248532 0.003276063 -0.001132451 -0.002808841 -0.003852092 -0.002341786
5 K00243 -0.05079409 -0.009946912 0.00248532 0.003276063 -0.001132451 -0.002808841 -0.003852092 -0.002341786
6 K00567 -0.05079409 -0.009946912 0.00248532 0.003276063 -0.001132451 -0.002808841 -0.003852092 -0.002341786
           PC9         PC10         PC11         PC12         PC13          PC14         PC15        PC16
1 0.0004521665 -0.002392353 -0.002480668 0.0009913822 -0.003778335 -0.0006078069 -0.002528725 0.001592632
2 0.0004521665 -0.002392353 -0.002480668 0.0009913822 -0.003778335 -0.0006078069 -0.002528725 0.001592632
3 0.0004521665 -0.002392353 -0.002480668 0.0009913822 -0.003778335 -0.0006078069 -0.002528725 0.001592632
4 0.0004521665 -0.002392353 -0.002480668 0.0009913822 -0.003778335 -0.0006078069 -0.002528725 0.001592632
5 0.0004521665 -0.002392353 -0.002480668 0.0009913822 -0.003778335 -0.0006078069 -0.002528725 0.001592632
6 0.0004521665 -0.002392353 -0.002480668 0.0009913822 -0.003778335 -0.0006078069 -0.002528725 0.001592632
          PC17        PC18          PC19         PC20         PC21         PC22        PC23         PC24
1 -0.003360407 0.001340981 -0.0002272723 -0.003768608 -0.002291973 9.500263e-05 0.002216337 -0.004224191
2 -0.003360407 0.001340981 -0.0002272723 -0.003768608 -0.002291973 9.500263e-05 0.002216337 -0.004224191
3 -0.003360407 0.001340981 -0.0002272723 -0.003768608 -0.002291973 9.500263e-05 0.002216337 -0.004224191
4 -0.003360407 0.001340981 -0.0002272723 -0.003768608 -0.002291973 9.500263e-05 0.002216337 -0.004224191
5 -0.003360407 0.001340981 -0.0002272723 -0.003768608 -0.002291973 9.500263e-05 0.002216337 -0.004224191
6 -0.003360407 0.001340981 -0.0002272723 -0.003768608 -0.002291973 9.500263e-05 0.002216337 -0.004224191
           PC25         PC26        PC27         PC28          PC29         PC30        PC31         PC32
1 -0.0001631202 0.0002876036 0.003178559 -0.003037596 -0.0001121851 0.0009708875 -0.00193514 0.0004799773
2 -0.0001631202 0.0002876036 0.003178559 -0.003037596 -0.0001121851 0.0009708875 -0.00193514 0.0004799773
3 -0.0001631202 0.0002876036 0.003178559 -0.003037596 -0.0001121851 0.0009708875 -0.00193514 0.0004799773
4 -0.0001631202 0.0002876036 0.003178559 -0.003037596 -0.0001121851 0.0009708875 -0.00193514 0.0004799773
5 -0.0001631202 0.0002876036 0.003178559 -0.003037596 -0.0001121851 0.0009708875 -0.00193514 0.0004799773
6 -0.0001631202 0.0002876036 0.003178559 -0.003037596 -0.0001121851 0.0009708875 -0.00193514 0.0004799773
        PC33          PC34        PC35        PC36         PC37          PC38        PC39         PC40
1 0.00219996 -0.0003941481 0.000202696 0.001647772 0.0001366408 -0.0007984333 0.001261048 0.0009959284
2 0.00219996 -0.0003941481 0.000202696 0.001647772 0.0001366408 -0.0007984333 0.001261048 0.0009959284
3 0.00219996 -0.0003941481 0.000202696 0.001647772 0.0001366408 -0.0007984333 0.001261048 0.0009959284
4 0.00219996 -0.0003941481 0.000202696 0.001647772 0.0001366408 -0.0007984333 0.001261048 0.0009959284
5 0.00219996 -0.0003941481 0.000202696 0.001647772 0.0001366408 -0.0007984333 0.001261048 0.0009959284
6 0.00219996 -0.0003941481 0.000202696 0.001647772 0.0001366408 -0.0007984333 0.001261048 0.0009959284
        PC41         PC42        PC43         PC44         PC45        PC46         PC47         PC48
1 0.00029952 -0.002102056 0.000196593 -0.001264468 0.0006562265 0.001189624 -0.001023567 0.0006217118
2 0.00029952 -0.002102056 0.000196593 -0.001264468 0.0006562265 0.001189624 -0.001023567 0.0006217118
3 0.00029952 -0.002102056 0.000196593 -0.001264468 0.0006562265 0.001189624 -0.001023567 0.0006217118
4 0.00029952 -0.002102056 0.000196593 -0.001264468 0.0006562265 0.001189624 -0.001023567 0.0006217118
5 0.00029952 -0.002102056 0.000196593 -0.001264468 0.0006562265 0.001189624 -0.001023567 0.0006217118
6 0.00029952 -0.002102056 0.000196593 -0.001264468 0.0006562265 0.001189624 -0.001023567 0.0006217118
         PC49          PC50         PC51         PC52        PC53         PC54        PC55          PC56
1 -0.00137164 -0.0007412628 0.0001437467 0.0009806099 0.001945774 0.0001546252 0.001761751 -0.0005399553
2 -0.00137164 -0.0007412628 0.0001437467 0.0009806099 0.001945774 0.0001546252 0.001761751 -0.0005399553
3 -0.00137164 -0.0007412628 0.0001437467 0.0009806099 0.001945774 0.0001546252 0.001761751 -0.0005399553
4 -0.00137164 -0.0007412628 0.0001437467 0.0009806099 0.001945774 0.0001546252 0.001761751 -0.0005399553
5 -0.00137164 -0.0007412628 0.0001437467 0.0009806099 0.001945774 0.0001546252 0.001761751 -0.0005399553
6 -0.00137164 -0.0007412628 0.0001437467 0.0009806099 0.001945774 0.0001546252 0.001761751 -0.0005399553
           PC57          PC58         PC59         PC60         PC61        PC62        PC63         PC64
1 -0.0006939236 -0.0008296102 0.0007104475 0.0005156834 5.873391e-05 0.002228717 0.001405275 -0.003018754
2 -0.0006939236 -0.0008296102 0.0007104475 0.0005156834 5.873391e-05 0.002228717 0.001405275 -0.003018754
3 -0.0006939236 -0.0008296102 0.0007104475 0.0005156834 5.873391e-05 0.002228717 0.001405275 -0.003018754
4 -0.0006939236 -0.0008296102 0.0007104475 0.0005156834 5.873391e-05 0.002228717 0.001405275 -0.003018754
5 -0.0006939236 -0.0008296102 0.0007104475 0.0005156834 5.873391e-05 0.002228717 0.001405275 -0.003018754
6 -0.0006939236 -0.0008296102 0.0007104475 0.0005156834 5.873391e-05 0.002228717 0.001405275 -0.003018754
          PC65        PC66         PC67         PC68          PC69         PC70          PC71         PC72
1 -0.001450016 0.001010715 0.0007081953 9.925565e-05 -0.0003884143 -0.001329291 -0.0005705215 0.0001924911
2 -0.001450016 0.001010715 0.0007081953 9.925565e-05 -0.0003884143 -0.001329291 -0.0005705215 0.0001924911
3 -0.001450016 0.001010715 0.0007081953 9.925565e-05 -0.0003884143 -0.001329291 -0.0005705215 0.0001924911
4 -0.001450016 0.001010715 0.0007081953 9.925565e-05 -0.0003884143 -0.001329291 -0.0005705215 0.0001924911
5 -0.001450016 0.001010715 0.0007081953 9.925565e-05 -0.0003884143 -0.001329291 -0.0005705215 0.0001924911
6 -0.001450016 0.001010715 0.0007081953 9.925565e-05 -0.0003884143 -0.001329291 -0.0005705215 0.0001924911
          PC73          PC74        PC75         PC76          PC77         PC78        PC79         PC80
1 0.0009102062 -0.0008839761 0.003878626 0.0005542666 -0.0004380038 0.0005507793 0.001078171 0.0005159429
2 0.0009102062 -0.0008839761 0.003878626 0.0005542666 -0.0004380038 0.0005507793 0.001078171 0.0005159429
3 0.0009102062 -0.0008839761 0.003878626 0.0005542666 -0.0004380038 0.0005507793 0.001078171 0.0005159429
4 0.0009102062 -0.0008839761 0.003878626 0.0005542666 -0.0004380038 0.0005507793 0.001078171 0.0005159429
5 0.0009102062 -0.0008839761 0.003878626 0.0005542666 -0.0004380038 0.0005507793 0.001078171 0.0005159429
6 0.0009102062 -0.0008839761 0.003878626 0.0005542666 -0.0004380038 0.0005507793 0.001078171 0.0005159429
           PC81       PC82         PC83         PC84        PC85         PC86          PC87         PC88
1 -0.0004333185 0.00128074 0.0006903886 -0.001080251 0.001396304 -0.001135049 -0.0001929594 5.368472e-05
2 -0.0004333185 0.00128074 0.0006903886 -0.001080251 0.001396304 -0.001135049 -0.0001929594 5.368472e-05
3 -0.0004333185 0.00128074 0.0006903886 -0.001080251 0.001396304 -0.001135049 -0.0001929594 5.368472e-05
4 -0.0004333185 0.00128074 0.0006903886 -0.001080251 0.001396304 -0.001135049 -0.0001929594 5.368472e-05
5 -0.0004333185 0.00128074 0.0006903886 -0.001080251 0.001396304 -0.001135049 -0.0001929594 5.368472e-05
6 -0.0004333185 0.00128074 0.0006903886 -0.001080251 0.001396304 -0.001135049 -0.0001929594 5.368472e-05
           PC89        PC90         PC91         PC92         PC93         PC94        PC95         PC96
1 -0.0008440112 0.001382843 0.0001964582 0.0004420025 3.101101e-05 0.0008610302 0.000197127 -0.000367249
2 -0.0008440112 0.001382843 0.0001964582 0.0004420025 3.101101e-05 0.0008610302 0.000197127 -0.000367249
3 -0.0008440112 0.001382843 0.0001964582 0.0004420025 3.101101e-05 0.0008610302 0.000197127 -0.000367249
4 -0.0008440112 0.001382843 0.0001964582 0.0004420025 3.101101e-05 0.0008610302 0.000197127 -0.000367249
5 -0.0008440112 0.001382843 0.0001964582 0.0004420025 3.101101e-05 0.0008610302 0.000197127 -0.000367249
6 -0.0008440112 0.001382843 0.0001964582 0.0004420025 3.101101e-05 0.0008610302 0.000197127 -0.000367249
        PC97          PC98        PC99         PC100         PC101        PC102        PC103        PC104
1 0.00019808 -0.0008727388 0.001128832 -0.0003967602 -0.0007866373 0.0006909247 0.0003198017 0.0002666803
2 0.00019808 -0.0008727388 0.001128832 -0.0003967602 -0.0007866373 0.0006909247 0.0003198017 0.0002666803
3 0.00019808 -0.0008727388 0.001128832 -0.0003967602 -0.0007866373 0.0006909247 0.0003198017 0.0002666803
4 0.00019808 -0.0008727388 0.001128832 -0.0003967602 -0.0007866373 0.0006909247 0.0003198017 0.0002666803
5 0.00019808 -0.0008727388 0.001128832 -0.0003967602 -0.0007866373 0.0006909247 0.0003198017 0.0002666803
6 0.00019808 -0.0008727388 0.001128832 -0.0003967602 -0.0007866373 0.0006909247 0.0003198017 0.0002666803
          PC105        PC106        PC107        PC108         PC109         PC110        PC111         PC112
1 -0.0005473746 0.0002778072 0.0009608736 0.0003987572 -0.0001615991 -0.0006887918 0.0001626152 -0.0004177167
2 -0.0005473746 0.0002778072 0.0009608736 0.0003987572 -0.0001615991 -0.0006887918 0.0001626152 -0.0004177167
3 -0.0005473746 0.0002778072 0.0009608736 0.0003987572 -0.0001615991 -0.0006887918 0.0001626152 -0.0004177167
4 -0.0005473746 0.0002778072 0.0009608736 0.0003987572 -0.0001615991 -0.0006887918 0.0001626152 -0.0004177167
5 -0.0005473746 0.0002778072 0.0009608736 0.0003987572 -0.0001615991 -0.0006887918 0.0001626152 -0.0004177167
6 -0.0005473746 0.0002778072 0.0009608736 0.0003987572 -0.0001615991 -0.0006887918 0.0001626152 -0.0004177167
         PC113         PC114       PC115        PC116        PC117         PC118         PC119        PC120
1 1.899141e-05 -0.0008511757  0.06010329 -0.002962319  0.016043795 -8.409942e-03  0.0037907507  0.000645923
2 1.899141e-05 -0.0008511757 -0.20640265  0.020078124 -0.015956337 -1.380522e-03  0.0065767892 -0.005709369
3 1.899141e-05 -0.0008511757 -0.07341287  0.015173034 -0.005053505 -3.825393e-04  0.0034825175  0.007271212
4 1.899141e-05 -0.0008511757 -0.20753487  0.022067260 -0.006439269 -2.867193e-03  0.0051834304  0.008300492
5 1.899141e-05 -0.0008511757  0.02580746 -0.009882745  0.006425397 -7.474454e-03  0.0025719783 -0.002264049
6 1.899141e-05 -0.0008511757  0.02770028  0.001502385  0.003978823  1.216774e-05 -0.0002001145 -0.001348026
          PC121         PC122        PC123         PC124        PC125         PC126        PC127        PC128
1  0.0090693245  0.0028720788 -0.009482343  0.0020125342  0.003163279  7.754879e-03 -0.001428513 -0.013186625
2 -0.0471299972 -0.0008244673 -0.002573394  0.0042153069  0.019226739  9.021912e-05  0.016527766  0.038475110
3 -0.0174921198  0.0027971131 -0.006183392  0.0056262818  0.002260204 -7.543715e-04  0.009455457  0.018688058
4 -0.0383829844 -0.0059104017 -0.004231240  0.0070146526  0.022775278  7.154643e-03  0.020359758  0.039121884
5 -0.0009289484 -0.0047524705 -0.003141514 -0.0020352140  0.006428257  6.725003e-03 -0.003371201 -0.003652185
6  0.0049073681  0.0005776272 -0.006805929  0.0009515351 -0.005996603  5.281924e-03 -0.002810729 -0.004653305
          PC129         PC130         PC131         PC132        PC133        PC134        PC135       PC136
1  0.0006466979 -7.079282e-05 -4.975959e-03  0.0001613765 -0.001505083 -0.020088846 -0.003996740  0.01430148
2 -0.0185976919  4.344553e-03  2.780033e-02  0.0101618662  0.017873468  0.008430215 -0.017623681 -0.01308025
3  0.0139939392  6.666773e-03 -6.742321e-05 -0.0034306545  0.022551929 -0.020373142  0.009075277  0.02991347
4  0.0071775061 -9.405859e-04  2.100842e-02  0.0449713216  0.047790991 -0.038150897  0.003659293  0.03408021
5 -0.0128980913 -3.651087e-02  1.142903e-02  0.0383076302 -0.056153130  0.070196385 -0.038312636 -0.06343444
6  0.0163624973 -9.080050e-03  3.849744e-02  0.0041563346  0.047051444 -0.071112250  0.791279027 -0.58917909
         PC137         PC138        PC139
1  0.005103517  0.0107920497 -0.005259204
2  0.013994051 -0.0083018677 -0.003754833
3 -0.014973659  0.0002958576 -0.005487010
4 -0.066239694 -0.0059026199 -0.012544125
5  0.064759595 -0.0860226940  0.002026684
6 -0.089437226 -0.0002017451  0.036715505
top_kos <- head(top_loadings)%>%pull(KEGG)

ko_pathways <- lapply(top_kos, function(k) {
  info <- keggGet(paste0("ko:", k))[[1]]
  pathways <- info$PATHWAY
  if(is.null(pathways)) pathways <- NA
  return(pathways)
})

names(ko_pathways) <- top_kos
ko_pathways
$K00079
                                           map00590                                            map00790 
                      "Arachidonic acid metabolism"                               "Folate biosynthesis" 
                                           map00980                                            map01100 
     "Metabolism of xenobiotics by cytochrome P450"                                "Metabolic pathways" 
                                           map05204                                            map05208 
            "Chemical carcinogenesis - DNA adducts" "Chemical carcinogenesis - reactive oxygen species" 

$K00036
                                      map00030                                       map00480 
                   "Pentose phosphate pathway"                       "Glutathione metabolism" 
                                      map01100                                       map01110 
                          "Metabolic pathways"        "Biosynthesis of secondary metabolites" 
                                      map01120                                       map01200 
"Microbial metabolism in diverse environments"                            "Carbon metabolism" 
                                      map05230                                       map05415 
         "Central carbon metabolism in cancer"                      "Diabetic cardiomyopathy" 

$K00133
                                      map00260                                       map00261 
    "Glycine, serine and threonine metabolism"                      "Monobactam biosynthesis" 
                                      map00270                                       map00300 
          "Cysteine and methionine metabolism"                          "Lysine biosynthesis" 
                                      map01100                                       map01110 
                          "Metabolic pathways"        "Biosynthesis of secondary metabolites" 
                                      map01120                                       map01210 
"Microbial metabolism in diverse environments"              "2-Oxocarboxylic acid metabolism" 
                                      map01230 
                 "Biosynthesis of amino acids" 

$K00215
                                      map00261                                       map00300 
                     "Monobactam biosynthesis"                          "Lysine biosynthesis" 
                                      map01100                                       map01110 
                          "Metabolic pathways"        "Biosynthesis of secondary metabolites" 
                                      map01120                                       map01230 
"Microbial metabolism in diverse environments"                  "Biosynthesis of amino acids" 

$K00243
[1] NA

$K00567
[1] NA

7.2.3 Testing the differences in KEGG presence/absence with PERMANOVA

kegg_counts_mat <- kegg_counts %>%
  column_to_rownames("genome")

kegg_pa <- (kegg_counts_mat > 0) * 1


# remove zero-variance KOs
kegg_pa_nz <- kegg_pa[, colSums(kegg_pa) > 0 & colSums(kegg_pa) < nrow(kegg_pa)]

# Determine the common genomes
common_ids <- base::intersect(rownames(kegg_pa_nz), genome_metadata$ID)

# Report what will be kept/dropped
message("# common: ", length(common_ids))
# common: 49
message("# in KEGG only: ", length(setdiff(rownames(kegg_pa_nz), genome_metadata$ID)))
# in KEGG only: 90
message("# in metadata only: ", length(setdiff(genome_metadata$ID, rownames(kegg_pa_nz))))
# in metadata only: 78
# Subset to the intersection (and keep order identical)
kegg_pa_nz <- kegg_pa_nz[common_ids, , drop = FALSE]

meta <- genome_metadata %>%
  dplyr::filter(ID %in% common_ids) %>%
  dplyr::distinct(ID, .keep_all = TRUE) %>%
  tibble::column_to_rownames("ID") %>%
  .[common_ids, , drop = FALSE]

stopifnot(identical(rownames(meta), rownames(kegg_pa_nz)))

# Prepare variables for PERMANOVA
required_vars <- c("genome_size", "completeness", "host_type")

# Coerce types as needed
meta <- meta %>%
  dplyr::mutate(
    genome_size  = as.numeric(genome_size),
    completeness = as.numeric(completeness),
    host_type    = as.factor(host_type)
  )

# Align on complete cases (adonis2 drops NAs otherwise)
ok <- stats::complete.cases(meta[, required_vars, drop = FALSE])
kegg_pa_nz <- kegg_pa_nz[ok, , drop = FALSE]
meta       <- meta[ok, , drop = FALSE]
stopifnot(identical(rownames(meta), rownames(kegg_pa_nz)))

# Distance, dispersion, PERMANOVA
kegg_dist_pa <- vegan::vegdist(kegg_pa_nz, method = "jaccard", binary = TRUE)

disp_pa <- vegan::betadisper(kegg_dist_pa, meta$host_type)
anova(disp_pa)
Analysis of Variance Table

Response: Distances
          Df   Sum Sq    Mean Sq F value Pr(>F)
Groups     1 0.000448 0.00044791  0.3263 0.5706
Residuals 46 0.063141 0.00137264               
vegan::adonis2(
  kegg_dist_pa ~ genome_size + completeness + host_type,
  data = meta,
  permutations = 999,
  by = "margin"
)
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999

vegan::adonis2(formula = kegg_dist_pa ~ genome_size + completeness + host_type, data = meta, permutations = 999, by = "margin")
             Df SumOfSqs      R2      F Pr(>F)    
genome_size   1  0.06534 0.05533 2.8009  0.001 ***
completeness  1  0.02762 0.02339 1.1841  0.238    
host_type     1  0.05889 0.04988 2.5247  0.005 ** 
Residual     44  1.02639 0.86923                  
Total        47  1.18081 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pcoa_pa <- cmdscale(kegg_dist_pa, eig = TRUE, k = 2)

variance_explained <- pcoa_pa$eig / sum(pcoa_pa$eig)

pcoa_df <- data.frame(
  ID = rownames(kegg_pa_nz),
  PC1 = pcoa_pa$points[,1],
  PC2 = pcoa_pa$points[,2],
  host_type = meta$host_type
) %>% left_join(genome_metadata, by = "ID")

pcoa_kegg_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = host_type.x)) +
  geom_point(size = 2) +
  scale_color_manual(values = host_type_colors, name = "Host type")+
  theme_minimal() +
  labs(
    title = "PCoA of KEGG presence/absence matrix across MAGs (colored by host type)",
    x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
  )

pcoa_kegg_pa

pcoa_kegg_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = host_order)) +
  geom_point(size = 2) +
  scale_color_manual(values = host_order_colors, name = "Host order")+
  theme_minimal() +
  labs(
    title = "PCoA of KEGG presence/absence matrix across MAGs (colored by host order)",
    x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
  )

pcoa_kegg_pa

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of KEGG presence/absence matrix across MAGs (colored by completeness)",
    x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of KEGG presence/absence matrix across MAGs (colored by continent)",
    x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
  )

library(patchwork)
p_main <- ggplot(pcoa_df, aes(PC1, PC2, color = host_type.x)) +
  scale_color_manual(values = host_type_colors) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of KEGG annotations across MAGs",
    x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
  ) +
  theme(
    legend.position = "right",
    axis.title.x = element_blank(),  
    axis.text.x = element_blank(),   
    axis.ticks.x = element_blank(),  
    plot.margin = margin(5, 5, 0, 5)
  )

cor_value <- cor(pcoa_df$PC1, pcoa_df$completeness, method = "spearman")
cor_pvalue <- cor.test(pcoa_df$PC1, pcoa_df$completeness, 
                       method = "spearman")$p.value
Warning in cor.test.default(pcoa_df$PC1, pcoa_df$completeness, method = "spearman"): Cannot compute exact
p-value with ties
cor_text <- paste0("Spearman ρ = ", round(cor_value, 3), 
                  "\np < ", format.pval(cor_pvalue, digits = 2))

p_completeness_annotated <- ggplot(pcoa_df, aes(x = PC1, y = completeness, 
                                                 color = host_type.x)) +
  scale_color_manual(values = host_type_colors) +
  geom_point(alpha = 0.6, size = 2) +
  geom_smooth(method = "lm", se = TRUE, color = "black", linewidth = 1) +
  annotate("text", 
           x = min(pcoa_df$PC1) + 0.1 * diff(range(pcoa_df$PC1)), 
           y = max(pcoa_df$completeness) - 5,
           label = cor_text, 
           hjust = 0, 
           size = 3,  # Changed from 4 to 3 (smaller)
           fontface = "plain") +  # Changed from "bold" to "plain"
  theme_minimal() +
  labs(
    y = "Completeness (%)",
    x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)")
  ) +
  theme(
    legend.position = "none",
    plot.margin = margin(0, 5, 5, 5),
    panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)  # Add border
  )


p_main <- p_main + 
  theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5))

combined_annotated <- p_main / p_completeness_annotated + 
  plot_layout(heights = c(3, 1), guides = "collect")

print(combined_annotated)
`geom_smooth()` using formula = 'y ~ x'

7.3 GIFTs

7.3.1 PCoA with Euclidean distances:

# Convert the GIFTs into a matrix of functional elements per genome (row)
gift_pcoa <- genome_gifts %>%
    to.elements(., GIFT_db) %>%
    as.data.frame() %>%
    vegdist(method="euclidean") %>%
    pcoa() #principal component analysis

#Extract eigenvalues (variance explained by first 10 axes)
gift_pcoa_rel_eigen <- gift_pcoa$values$Relative_eig[1:10]


# Get genome positions
gift_pcoa_vectors <- gift_pcoa$vectors %>% #extract vectors
  as.data.frame() %>% 
 dplyr::select(Axis.1,Axis.2) # keep the first 2 axes


gift_pcoa_eigenvalues <- gift_pcoa$values$Eigenvalues[c(1,2)]


#For the black arrows: Functional group loadings
# covariance between each functional trait and pcoa axis scores
#scale with the eigenvectors
gift_pcoa_gifts <- cov(genome_gifts, scale(gift_pcoa_vectors)) %*%
  diag((gift_pcoa_eigenvalues/(nrow(genome_gifts)-1))^(-0.5)) %>%
  as.data.frame() %>% 
  dplyr::rename(Axis.1=1,Axis.2=2) %>% 
  rownames_to_column(var="label") %>% 
  #get function summary vectors
  mutate(func=substr(label,1,3)) %>% 
  group_by(func) %>% #grouping by function
  summarise(Axis.1=mean(Axis.1),
            Axis.2=mean(Axis.2)) %>% 
  dplyr::rename(label=func) %>% 
  filter(!label %in% c("S01","S02","S03"))
set.seed(101)
scale <- 20 # scale for vector loadings (to make arrows visible)
gift_pcoa_vectors %>% 
  rownames_to_column(var="ID") %>% 
  left_join(genome_metadata, by="ID") %>%
  ggplot() +
      #genome positions
      scale_color_manual(values=host_type_colors)+
      geom_point(aes(x=Axis.1,y=Axis.2, color = host_type), 
                 alpha=0.9, shape=16) +
      scale_size_continuous(range = c(0.1,5)) +
      #loading positions
      geom_segment(data=gift_pcoa_gifts, 
                   aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
                    arrow = arrow(length = unit(0.3, "cm"), 
                    type = "open", 
                    angle = 25),
                    linewidth = 0.5, 
                    color = "black") +
     #Primary and secondary scale adjustments
     scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
            ) +
     scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
            ) +
    geom_label_repel(data = gift_pcoa_gifts,
                     aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
                     segment.color = 'transparent') +
    xlim(-0.8,1) + 
    ylim(-1,1.5) +
    theme_minimal() + 
  
  labs(
    x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
    y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
  )
Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider increasing max.overlaps

gift_pcoa_vectors %>% 
  rownames_to_column(var="ID") %>% 
  left_join(genome_metadata, by="ID") %>%
  ggplot() +
      #genome positions
      geom_point(aes(x=Axis.1,y=Axis.2, color = completeness), 
                 alpha=0.9, shape=16) +
      scale_size_continuous(range = c(0.1,5)) +
      #loading positions
      geom_segment(data=gift_pcoa_gifts, 
                   aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
                    arrow = arrow(length = unit(0.3, "cm"), 
                    type = "open", 
                    angle = 25),
                    linewidth = 0.5, 
                    color = "black") +
     #Primary and secondary scale adjustments
     scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
            ) +
     scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
            ) +
    geom_label_repel(data = gift_pcoa_gifts,
                     aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
                     segment.color = 'transparent') +
    xlim(-0.8,1) + 
    ylim(-1,1.5) +
    theme_minimal() + 
  
  labs(
    x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
    y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
  )
Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider increasing max.overlaps

gift_pcoa_vectors %>%
  rownames_to_column(var="ID") %>%
  left_join(genome_metadata, by="ID") %>%
  ggplot() +
      #genome positions
      geom_point(aes(x=Axis.1,y=Axis.2, color = continent),
                 alpha=0.9, shape=16) +
      scale_size_continuous(range = c(0.1,5)) +
      #loading positions
      geom_segment(data=gift_pcoa_gifts,
                   aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
                    arrow = arrow(length = unit(0.3, "cm"),
                    type = "open",
                    angle = 25),
                    linewidth = 0.5,
                    color = "black") +
     #Primary and secondary scale adjustments
     scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
            ) +
     scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
            ) +
    geom_label_repel(data = gift_pcoa_gifts,
                     aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
                     segment.color = 'transparent') +
    xlim(-0.8,1) +
    ylim(-1,1.5) +
    theme_minimal() +

  labs(
    x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
    y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
  )
Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider increasing max.overlaps

gift_pcoa_vectors %>% 
  rownames_to_column(var="ID") %>% 
  left_join(genome_metadata, by="ID") %>%
  ggplot() +
      #genome positions
      geom_point(aes(x=Axis.1,y=Axis.2, color = host_class), 
                 alpha=0.9, shape=16) +
      scale_size_continuous(range = c(0.1,5)) +
      #loading positions
      geom_segment(data=gift_pcoa_gifts, 
                   aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
                    arrow = arrow(length = unit(0.3, "cm"), 
                    type = "open", 
                    angle = 25),
                    linewidth = 0.5, 
                    color = "black") +
     #Primary and secondary scale adjustments
     scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
            ) +
     scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
            ) +
    geom_label_repel(data = gift_pcoa_gifts,
                     aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
                     segment.color = 'transparent') +
    xlim(-0.8,1) + 
    ylim(-1,1.5) +
    theme_minimal() + 
  
  labs(
    x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
    y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
  )
Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider increasing max.overlaps

gift_pcoa_vectors %>% 
  rownames_to_column(var="ID") %>% 
  left_join(genome_metadata, by="ID") %>%
  ggplot() +
      #genome positions
      geom_point(aes(x=Axis.1,y=Axis.2, color = host_order), 
                 alpha=0.9, shape=16) +
      scale_color_manual(values = host_order_colors)+
      scale_size_continuous(range = c(0.1,5)) +
      #loading positions
      geom_segment(data=gift_pcoa_gifts, 
                   aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
                    arrow = arrow(length = unit(0.3, "cm"), 
                    type = "open", 
                    angle = 25),
                    linewidth = 0.5, 
                    color = "black") +
     #Primary and secondary scale adjustments
     scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
            ) +
     scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
            ) +
    geom_label_repel(data = gift_pcoa_gifts,
                     aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
                     segment.color = 'transparent') +
    xlim(-0.8,1) + 
    ylim(-1,1.5) +
    theme_minimal() + 
  
  labs(
    x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
    y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
  )
Warning: Removed 3 rows containing missing values or values outside the scale range (`geom_point()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider increasing max.overlaps

centroids <- gift_pcoa_vectors %>%
  rownames_to_column(var = "ID") %>%
  left_join(genome_metadata, by = "ID") %>%
  group_by(host_type) %>%
  summarise(
    Axis.1 = mean(Axis.1, na.rm = TRUE),
    Axis.2 = mean(Axis.2, na.rm = TRUE),
    .groups = "drop"
  )

gift_pcoa_vectors %>% 
  rownames_to_column(var = "ID") %>% 
  left_join(genome_metadata, by = "ID") %>%
  ggplot(aes(x = Axis.1, y = Axis.2, color = host_type)) +
  geom_point(size = 3, alpha = 0.9) +
  scale_color_manual(values = host_type_colors) +

  labs(
    x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1] * 100, 2), " %)"),
    y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2] * 100, 2), " %)")
  ) +

  coord_cartesian(xlim = c(-0.6, 1), ylim = c(-1, 1)) +

  theme_minimal(base_size = 16) +
  theme(
    axis.title = element_text(size = 14, face = "bold"),
    axis.text  = element_text(size = 14),
    legend.title = element_text(size = 14),
    legend.text  = element_text(size = 14),
    panel.grid.minor = element_blank()
  )

Using k-means to cluster the groups and check which MAGs cluster together:

coords <- gift_pcoa_vectors %>%
  rownames_to_column("MAG") %>%
  as_tibble()

set.seed(123)
km <- kmeans(coords[, c("Axis.1", "Axis.2")], centers = 2, nstart = 25, iter.max = 100)

coords <- coords %>% mutate(cluster = factor(km$cluster))
# centroids as a tibble for plotting
centroids <- as_tibble(km$centers) %>% mutate(cluster = factor(1:nrow(km$centers)))

ggplot(coords, aes(x = Axis.1, y = Axis.2, color = cluster)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_point(data = centroids, aes(x = Axis.1, y = Axis.2, color = cluster),
             shape = 4, size = 5, stroke = 1.25) +   # X marks centroids
  theme_minimal() +
  labs(title = paste0("PCoA colored by kmeans (k=2)"),
       color = "cluster")

mags_by_cluster <- split(coords$MAG, km$cluster)
mags_by_cluster
$`1`
 [1] "EHM066683"       "EHM045744"       "GCF_027676165.1" "GCA_024461935.1" "GCA_948850205.1" "GCA_015667075.1"
 [7] "EHM057831"       "GCA_015551225.1" "GCF_027676135.1" "GCA_039060945.1" "GCF_027672785.1" "GCF_050408585.1"
[13] "GCF_039061015.1" "GCF_027673305.1" "GCF_015667075.1" "GCF_027673155.1" "GCA_949462505.1" "GCF_027673085.1"
[19] "GCA_949470665.1" "GCF_027672695.1" "EHM066565"       "GCA_948496745.1" "EHM079037"       "GCF_027679985.1"
[25] "EHM019886"       "GCA_949840595.1" "GCF_015551225.1" "GCF_039060945.1" "GCA_948619675.1" "GCA_050408585.1"
[31] "GCA_948514765.1" "GCF_027672665.1" "GCF_027672965.1" "GCF_024461915.1" "GCF_902364565.1" "GCF_003472365.1"
[37] "GCA_039061015.1" "GCA_024461915.1" "GCA_003472365.1" "GCA_902364565.1" "GCF_027672865.1" "EHM071389"      
[43] "GCF_027673485.1" "GCF_027673225.1" "GCF_024461935.1" "EHM028422"      

$`2`
 [1] "EHM042508"       "GCF_030545485.1" "EHM013277"       "GCA_948644855.1" "GCF_014830215.1" "GCA_949460315.1"
 [7] "GCA_948622745.1" "EHM045766"       "GCA_958421465.1" "GCA_947347755.1" "GCF_027674325.1" "GCA_015667015.1"
[13] "EHM027656"       "EHM041731"       "GCA_018916825.1" "GCA_025128595.1" "GCA_948896515.1" "EHM056720"      
[19] "GCF_025128595.1" "GCF_027674685.1" "EHM043079"       "GCF_027672725.1" "EHM031001"       "GCA_015666905.1"
[25] "GCF_027676035.1" "EHM046444"       "EHM045845"       "EHM056528"       "EHM031663"       "GCA_937924215.1"
[31] "EHM028346"       "GCA_948672875.1" "EHM025679"       "GCF_014334715.1" "GCF_009497135.1" "GCA_001672265.1"
[37] "GCF_021961665.1" "EHM041306"       "GCA_020782925.1" "GCF_020782925.1" "GCA_018381915.1" "GCF_027674405.1"
[43] "GCA_020553675.1" "GCF_030480465.1" "GCA_030545485.1" "GCA_949028585.1" "GCF_001672265.1" "GCA_040516025.1"
[49] "GCF_015666905.1" "EHM031160"       "GCF_027673065.1" "GCA_014334715.1" "EHM031128"       "GCA_014830215.1"
[55] "GCA_948654415.1" "EHM034137"       "GCA_030480465.1" "EHM040253"       "GCF_015552555.1" "GCA_948544615.1"
[61] "GCA_948700895.1" "GCA_949428005.1" "EHM046343"       "EHM057270"       "GCA_949439745.1" "GCF_018916825.1"
[67] "GCA_948674145.1" "GCA_949006045.1" "GCF_027672645.1" "GCF_932750895.1" "GCA_009497135.1" "GCA_949444485.1"
[73] "GCF_040516025.1" "GCA_905210045.1" "EHM044953"       "GCF_020553675.1" "GCF_027688475.1" "GCF_027661125.1"
[79] "GCF_015667015.1" "EHM056391"       "GCA_948748415.1" "GCA_015552555.1" "EHM025059"       "GCA_022794425.1"
[85] "GCA_948995465.1" "EHM016594"       "EHM020035"       "GCA_948741555.1" "GCF_027674205.1" "EHM015868"      
[91] "EHM017294"       "GCA_948777995.1" "EHM028354"       "EHM028200"      

7.3.2 Checking each cluster at once

cluster1_MAGs <- mags_by_cluster[[1]]
genome_metadata %>% filter(ID %in% cluster1_MAGs)
# A tibble: 13 × 45
   ID        source species gtdb_taxonomy host_species host_order host_class isolation_source host  host_summary
   <chr>     <chr>  <chr>   <chr>         <chr>        <chr>      <chr>      <chr>            <chr> <lgl>       
 1 EHM019886 EHI    s__Lac… <NA>          Plecotus Au… Chiroptera Mammalia   host-associated  <NA>  NA          
 2 EHM028422 EHI    s__Lac… <NA>          Plecotus Au… Chiroptera Mammalia   host-associated  <NA>  NA          
 3 EHM045744 EHI    s__Lac… <NA>          Plecotus Au… Chiroptera Mammalia   host-associated  <NA>  NA          
 4 EHM057831 EHI    s__Lac… <NA>          Myotis Bech… Chiroptera Mammalia   host-associated  <NA>  NA          
 5 EHM066565 EHI    s__Lac… <NA>          Plecotus Au… Chiroptera Mammalia   host-associated  <NA>  NA          
 6 EHM066683 EHI    s__Lac… <NA>          Plecotus Au… Chiroptera Mammalia   host-associated  <NA>  NA          
 7 EHM071389 EHI    s__Lac… <NA>          Podarcis Fi… Squamata   Reptilia   host-associated  <NA>  NA          
 8 EHM079037 EHI    s__Lac… <NA>          Myotis Bech… Chiroptera Mammalia   host-associated  <NA>  NA          
 9 GCF_0155… NCBI   <NA>    <NA>          Homo sapiens Primates   Mammalia   stool            Homo… NA          
10 GCF_0156… NCBI   <NA>    <NA>          Homo sapiens Primates   Mammalia   stool            Homo… NA          
11 GCF_0390… NCBI   <NA>    <NA>          Homo sapiens Primates   Mammalia   feces            Homo… NA          
12 GCF_0390… NCBI   <NA>    <NA>          Homo sapiens Primates   Mammalia   feces            Homo… NA          
13 GCF_0504… NCBI   <NA>    <NA>          Actinoptery… <NA>       Actinopte… intestine        fish  NA          
# ℹ 35 more variables: country <chr>, locality <chr>, completeness <dbl>, contamination <dbl>,
#   genome_size <dbl>, GC <dbl>, N50 <dbl>, contigs <dbl>, collection_date <chr>, ncbi_biosample <chr>,
#   mag_name <chr>, eha_number <chr>, gtdb_representative <lgl>, mimag_high_quality <lgl>,
#   mimag_medium_quality <lgl>, common_name <chr>, sample_name <chr>, external_id <chr>, submitter_id <chr>,
#   env_broad <chr>, env_medium <chr>, env_local <chr>, env_local_context <chr>, host_status <lgl>,
#   disease <lgl>, diagnosis <lgl>, latitude <dbl>, longitude <dbl>, accession <chr>, host_disease <lgl>,
#   host_age <lgl>, host_type <fct>, country_simple <chr>, country_normalized <chr>, continent <chr>
cluster2_MAGs <- mags_by_cluster[[2]]
genome_metadata %>% filter(ID %in% cluster2_MAGs)
# A tibble: 37 × 45
   ID        source species gtdb_taxonomy host_species host_order host_class isolation_source host  host_summary
   <chr>     <chr>  <chr>   <chr>         <chr>        <chr>      <chr>      <chr>            <chr> <lgl>       
 1 EHM013277 EHI    s__Lac… <NA>          Podarcis Mu… Squamata   Reptilia   host-associated  <NA>  NA          
 2 EHM015868 EHI    s__Lac… <NA>          Podarcis Mu… Squamata   Reptilia   host-associated  <NA>  NA          
 3 EHM016594 EHI    s__Lac… <NA>          Hipposidero… Chiroptera Mammalia   host-associated  <NA>  NA          
 4 EHM017294 EHI    s__Lac… <NA>          Rhinolophus… Chiroptera Mammalia   host-associated  <NA>  NA          
 5 EHM020035 EHI    s__Lac… <NA>          Plecotus Au… Chiroptera Mammalia   host-associated  <NA>  NA          
 6 EHM025059 EHI    s__Lac… <NA>          Podarcis Mu… Squamata   Reptilia   host-associated  <NA>  NA          
 7 EHM025679 EHI    s__Lac… <NA>          Podarcis Mu… Squamata   Reptilia   host-associated  <NA>  NA          
 8 EHM027656 EHI    s__Lac… <NA>          Cathartes A… Accipitri… Aves       host-associated  <NA>  NA          
 9 EHM028200 EHI    s__Lac… <NA>          Cathartes A… Accipitri… Aves       host-associated  <NA>  NA          
10 EHM028346 EHI    s__Lac… <NA>          Plecotus Au… Chiroptera Mammalia   host-associated  <NA>  NA          
# ℹ 27 more rows
# ℹ 35 more variables: country <chr>, locality <chr>, completeness <dbl>, contamination <dbl>,
#   genome_size <dbl>, GC <dbl>, N50 <dbl>, contigs <dbl>, collection_date <chr>, ncbi_biosample <chr>,
#   mag_name <chr>, eha_number <chr>, gtdb_representative <lgl>, mimag_high_quality <lgl>,
#   mimag_medium_quality <lgl>, common_name <chr>, sample_name <chr>, external_id <chr>, submitter_id <chr>,
#   env_broad <chr>, env_medium <chr>, env_local <chr>, env_local_context <chr>, host_status <lgl>,
#   disease <lgl>, diagnosis <lgl>, latitude <dbl>, longitude <dbl>, accession <chr>, host_disease <lgl>, …
genome_metadata <- genome_metadata %>%
  dplyr::mutate(
    ID_clean = ID %>%
      basename() %>%
      # remove file extensions
      sub("\\.(fna|fa|fasta)(\\.gz)?$", "", ., ignore.case = TRUE) %>%
      # remove GB_ or RS_ at the beginning
      sub("^(GB_|RS_)", "", ., ignore.case = TRUE) %>%
      # extract the real GCA/GCF accession (robust)
      sub(".*\\b(GC[AF]_[0-9]+\\.[0-9]+).*", "\\1", ., perl = TRUE)
  )
metadata_with_cluster <- genome_metadata %>%
  left_join(coords %>% dplyr::select(MAG, cluster), by = c("ID_clean" = "MAG"))

metadata_with_cluster %>%
  group_by(cluster, host_type) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(cluster, desc(n))
# A tibble: 7 × 3
  cluster host_type     n
  <fct>   <fct>     <int>
1 1       human        21
2 1       animal       16
3 2       animal       58
4 2       human        20
5 2       <NA>          1
6 <NA>    animal       10
7 <NA>    human         1
# Group summaries
metadata_with_cluster %>%
  group_by(cluster) %>%
  summarise(mean_genome_size = mean(genome_size, na.rm = TRUE),
            median_contamination = median(contamination, na.rm = TRUE),
            .groups = "drop")
# A tibble: 3 × 3
  cluster mean_genome_size median_contamination
  <fct>              <dbl>                <dbl>
1 1               2392085.                 0.52
2 2               2415231.                 0.35
3 <NA>            2367492.                 0.52
# save(ehi_mags,
#      phylum_colors,
#      genome_annotations,
#      genome_gifts,
#      contig_to_genome,
#      gtdb_metadata,
#      ehi_metadata,
#      master_index,
#      genome_metadata,
#      host_type_colors,
#      getphylo_tree,
#      metadata_with_cluster,
#      file = "data/data.Rdata")

Is the genome size different between clusters?

kruskal.test(genome_size ~ cluster, data = metadata_with_cluster)

    Kruskal-Wallis rank sum test

data:  genome_size by cluster
Kruskal-Wallis chi-squared = 0.079169, df = 1, p-value = 0.7784
pairwise.wilcox.test(
  x = metadata_with_cluster$genome_size,
  g = metadata_with_cluster$cluster,
  p.adjust.method = "BH"   # FDR correction
)

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  metadata_with_cluster$genome_size and metadata_with_cluster$cluster 

  1   
2 0.78

P value adjustment method: BH 

Plots

ggplot(metadata_with_cluster, aes(x = genome_size, y = GC, col = cluster))+
  geom_point()+
  theme_bw()

ggplot(metadata_with_cluster, aes(x = cluster, y = genome_size, col = cluster))+
  geom_point()+
  theme_bw()

count_table <- metadata_with_cluster %>%
  filter(!is.na(cluster))%>%
  dplyr::count(cluster, host_type) %>%
  pivot_wider(
    names_from = host_type,
    values_from = n,
    values_fill = 0
  )

count_table
# A tibble: 2 × 4
  cluster animal human  `NA`
  <fct>    <int> <int> <int>
1 1           16    21     0
2 2           58    20     1
mat <- metadata_with_cluster %>%
  filter(!is.na(cluster))%>%
  dplyr::count(cluster, host_type) %>%
  pivot_wider(names_from = host_type, values_from = n, values_fill = 0) %>%
  column_to_rownames("cluster") %>%
  as.matrix()

chisq.test(mat)$expected
Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be incorrect
    animal    human        NA
1 23.60345 13.07759 0.3189655
2 50.39655 27.92241 0.6810345
fisher.test(mat)

    Fisher's Exact Test for Count Data

data:  mat
p-value = 0.002253
alternative hypothesis: two.sided
#Aggregate bundle-level GIFTs into the compound level
GIFTs_elements <- to.elements(genome_gifts,GIFT_db)

#Aggregate element-level GIFTs into the function level
GIFTs_functions <- to.functions(GIFTs_elements,GIFT_db)

#Aggregate function-level GIFTs into overall Biosynthesis, Degradation and Structural GIFTs
GIFTs_domains <- to.domains(GIFTs_functions,GIFT_db)

7.3.3 GIFT community plots

GIFTs_elements %>%
    as.data.frame() %>%
    rownames_to_column(var="MAG")%>%
    pivot_longer(!MAG,names_to="trait",values_to="gift") %>%
    left_join(metadata_with_cluster, by = join_by(MAG == ID)) %>%
    mutate(functionid = substr(trait, 1, 3)) %>%
    mutate(trait = case_when(
      trait %in% GIFT_db$Code_element ~ GIFT_db$Element[match(trait, GIFT_db$Code_element)],
      TRUE ~ trait
    )) %>%
    mutate(functionid = case_when(
      functionid %in% GIFT_db$Code_function ~ GIFT_db$Function[match(functionid, GIFT_db$Code_function)],
      TRUE ~ functionid
    )) %>%
    mutate(trait=factor(trait,levels=unique(GIFT_db$Element))) %>%
    mutate(functionid=factor(functionid,levels=unique(GIFT_db$Function))) %>%
    ggplot(aes(x=MAG,y=trait,fill=gift)) +
        geom_tile(colour="white", linewidth=0.2)+
        scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
        facet_grid(functionid ~ cluster, scales="free",space="free") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
              axis.text.y = element_text(size=6),
              strip.text.y = element_text(angle = 0)
              ) +
        labs(y="Traits",x="Samples",fill="GIFT")

p <- GIFTs_elements %>%
    as.data.frame() %>%
    rownames_to_column(var="MAG")%>%
    pivot_longer(!MAG,names_to="trait",values_to="gift") %>%
    left_join(metadata_with_cluster, by = join_by(MAG == ID)) %>%
    mutate(functionid = substr(trait, 1, 3)) %>%
    mutate(trait = case_when(
      trait %in% GIFT_db$Code_element ~ GIFT_db$Element[match(trait, GIFT_db$Code_element)],
      TRUE ~ trait
    )) %>%
    mutate(functionid = case_when(
      functionid %in% GIFT_db$Code_function ~ GIFT_db$Function[match(functionid, GIFT_db$Code_function)],
      TRUE ~ functionid
    )) %>%
    mutate(trait=factor(trait,levels=unique(GIFT_db$Element))) %>%
    mutate(functionid=factor(functionid,levels=unique(GIFT_db$Function))) %>%
    ggplot(aes(x=MAG,y=trait,fill=gift)) +
        geom_tile(colour="white", linewidth=0.2)+
        scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
        facet_grid(functionid ~ host_type, scales="free",space="free") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
              axis.text.y = element_text(size=6),
              strip.text.y = element_text(angle = 0)
              ) +
        labs(y="Traits",x="Samples",fill="GIFT")

p

ggplot2::ggsave(
  filename = "plots/GIFT_heatmap_final.png",
  plot = p,
  width = 14,
  height = 16,
  units = "in",
  dpi = 600,
  bg = "white"
)
library(RColorBrewer)
library(reshape2)
Warning: package 'reshape2' was built under R version 4.4.3

Adjuntando el paquete: 'reshape2'
The following object is masked from 'package:tidyr':

    smiths
GIFTs_elements %>%
  melt() %>%
  dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
  inner_join(GIFT_db, by = "Code_element") %>%
  filter(str_starts(Code_element, "D09")) %>%
  left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
  mutate(cluster = factor(cluster)) %>%
  droplevels() %>%
  arrange(cluster, Genome) %>%
  mutate(trait = Element) %>%
  # Ensure trait is a factor to maintain order
  mutate(trait = factor(trait, levels = rev(unique(GIFT_db$Element)))) %>%
  ggplot( aes(x = Genome, y = trait)) +
  # Heatmap
  geom_tile(aes(fill = GIFT), color = "white") +
  scale_fill_gradientn(colours = brewer.pal(7, "YlGnBu"), name = "GIFT Score") +
  
  # Start a new fill scale for the host_type bar
  new_scale_fill() +
  
  # Add the host_type bar at the very top or bottom
  geom_tile(aes(y = -0.5, fill = host_type), height = 0.5) +
  scale_fill_manual(values = host_type_colors) +
  
  facet_grid(~ cluster, scales = "free_x", space = "free_x") +
  theme_minimal(base_size = 8) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1121 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.

7.3.4 Checking difference in GIFTs

7.3.4.1 Cluster-wise

Element GIFTs different between clusters of pcoa

# Get only the GIFT columns
gift_cols <- colnames(GIFTs_elements)[!(colnames(GIFTs_elements) %in% c("genome","cluster","host_type"))]

gift_dataframe <- GIFTs_elements %>%
  as.data.frame() %>%            
  rownames_to_column(var = "ID") 

gift_df_meta <- gift_dataframe %>%
  left_join(metadata_with_cluster, by = "ID")


# Kruskal-Wallis for 4 groups
kruskal_results <- sapply(gift_cols, function(g) {
  kruskal.test(as.formula(paste(g, "~ cluster")), data = gift_df_meta)$p.value
})

# Adjust for multiple testing
kruskal_results_adj <- p.adjust(kruskal_results, method = "BH")

# Combine into a table
kruskal_table <- data.frame(
  GIFT = gift_cols,
  p_value = kruskal_results,
  p_adj = kruskal_results_adj
)

kruskal_table %>% filter(p_adj<0.05)
       GIFT      p_value        p_adj
B0601 B0601 5.011737e-04 9.021127e-03
D0908 D0908 2.559625e-12 9.214650e-11
pairwise_results <- lapply(gift_cols, function(g) {
  pairwise.wilcox.test(
    x = gift_df_meta[[g]],
    g = gift_df_meta$cluster,
    p.adjust.method = "BH"
  )
})

names(pairwise_results) <- gift_cols

pairwise_sig_table <- lapply(names(pairwise_results), function(g) {
  
  # Extract p-value matrix
  pmat <- pairwise_results[[g]]$p.value
  
  # Convert to long format
  pmat_long <- as.data.frame(as.table(pmat)) %>%
    filter(!is.na(Freq)) %>%       # remove NA (diagonal / upper triangle)
    dplyr::rename(group1 = Var1, group2 = Var2, p_adj = Freq) %>%
    filter(p_adj < 0.05) %>%      
    mutate(GIFT = g)
  
  return(pmat_long)
})



# Combine all GIFTs into one table
pairwise_sig_table <- bind_rows(pairwise_sig_table) %>%
  dplyr::select(GIFT, group1, group2, p_adj)

pairwise_sig_table 
   GIFT group1 group2        p_adj
1 B0601      2      1 5.470079e-04
2 B0704      2      1 2.246680e-02
3 D0507      2      1 1.442942e-02
4 D0908      2      1 2.839505e-12
GIFTs_elements %>%
  melt() %>%
  dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
  inner_join(GIFT_db, by = "Code_element") %>%
  filter(Code_element %in% pairwise_sig_table$GIFT) %>%
  left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
  distinct(Genome, Code_element, .keep_all = TRUE) %>%
  mutate(cluster = factor(cluster)) %>%
  droplevels() %>%
  arrange(cluster, Genome) %>%
  mutate(trait = Element) %>%
  ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
    geom_tile(colour = "white", linewidth = 0.2) +
    scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
  new_scale_fill() +
  
  # Add the host_type bar at the very top or bottom
  geom_tile(aes(y = -0.5, fill = host_type), height = 0.3) +
  scale_fill_manual(values = host_type_colors, name = "host_type") +
    facet_grid(~ cluster, scales = "free_x", space = "free_x") + 
    theme(
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 4),
      strip.text.x = element_text(face = "bold")
    ) +
    labs(y = "Trait", x = "Genome", fill = "GIFT")
Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1121 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.

GIFT_db%>%
  filter(Code_element %in%  pairwise_sig_table$GIFT)
   Code_bundle Code_element Code_function       Domain                   Function           Element
1      B060101        B0601           B06 Biosynthesis Organic anion biosynthesis         Succinate
2      B060102        B0601           B06 Biosynthesis Organic anion biosynthesis         Succinate
3      B060103        B0601           B06 Biosynthesis Organic anion biosynthesis         Succinate
4      B060104        B0601           B06 Biosynthesis Organic anion biosynthesis         Succinate
5      B060105        B0601           B06 Biosynthesis Organic anion biosynthesis         Succinate
6      B060106        B0601           B06 Biosynthesis Organic anion biosynthesis         Succinate
7      B070401        B0704           B07 Biosynthesis       Vitamin biosynthesis Pantothenate (B5)
8      B070402        B0704           B07 Biosynthesis       Vitamin biosynthesis Pantothenate (B5)
9      D050701        D0507           D05  Degradation     Amino acid degradation           Leucine
10     D090801        D0908           D09  Degradation     Antibiotic degradation         Macrolide
                                                                                                                                                                                                                                                                                          Definition
1  (K01647,K05942) (K01681,K01682) (K00031,K00030) ((((K00164+K00658),K01616)+K00382),(K00174+K00175)) ((K01902+K01903),(K01899+K01900),K18118) ((K00234+K00235+K00236+(K00237,K25801)),(K00239+K00240+K00241),(K00244+K00245+K00246)) (K01676,K01679,(K01677+K01678)) (K00026,K00025,K00024,K00116)
2                                                           ((((K00164+K00658),K01616)+K00382),K00174) ((K01902+K01903),(K01899+K01900),K18118) ((K00234+K00235+K00236+(K00237,K25801)),(K00239+K00240+K00241),(K00244+K00245+K00246)) (K01676,K01679,(K01677+K01678)) (K00026,K00025,K00024,K00116)
3                                                                                                                                                                                                                                        K01580 (K13524,K07250,K00823,K16871) (K00135,K00139,K17761)
4                                                                                                                                     (K00169+K00170+K00171+K00172) K01007 K01595 K00024 (K01677+K01678) (K00239+K00240) (K01902+K01903) (K15038,K15017) K14465 (K14467,K18861) K14534 K15016 K00626
5                                                                                                                                       (K02160+K01961+K01962+K01963) K14468 K14469 K15052 K05606 (K01847,(K01848+K01849)) (K14471+K14472) (K00239+K00240+K00241) K01679 K08691 K14449 K14470 K09709
6                                                                                                                                                                 (K00169+K00170+K00171+K00172) (K01959+K01960) K00024 (K01677+K01678) (K18209+K18210) (K01902+K01903) (K00174+K00175+K00176+K00177)
7                                                                                                                                                                                                                                                    ((K00826 K00606 K00077),K01579) (K01918,K13799)
8                                                                                                                                                                                                                                                           ((K00606 K00077),(K13367 K00128)) K01918
9                                                                                                                                                                                             K00826 (((K00166+K00167),K11381)+K09699+K00382) (K00253,K00249) (K01968+K01969) (K05607,K13766) K01640
10                                                                                                                                                                                                                                                                K06979,K08217,K18230,K18231,K21251
library(rstatix)
library(dplyr)
library(purrr)

pairwise_sig_table <- map_df(gift_cols, function(g) {
  
  # Check if there is variation in the GIFT values
  # If all values are the same, skip this GIFT
  if(length(unique(gift_df_meta[[g]])) < 2) return(NULL)
  
  # Run the test safely
  tryCatch({
    results <- gift_df_meta %>%
      wilcox_test(as.formula(paste0("`", g, "` ~ cluster")), p.adjust.method = "BH")
    
    eff <- gift_df_meta %>%
      wilcox_effsize(as.formula(paste0("`", g, "` ~ cluster")))
    
    results %>%
      left_join(eff, by = c("group1", "group2", ".y." = ".y.", "n1", "n2")) %>%
      filter(p.adj < 0.05) %>%
      mutate(GIFT = g)
  }, error = function(e) return(NULL)) # If it still fails, just skip it
})

top_gifts <- pairwise_sig_table %>%
  arrange(desc(effsize))


top_gifts
# A tibble: 3 × 12
  .y.   group1 group2    n1    n2 statistic        p    p.adj p.adj.signif effsize magnitude GIFT 
  <chr> <chr>  <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>          <dbl> <ord>     <chr>
1 B0601 1      2         13    37      166. 0.000547 0.000547 ***            0.492 moderate  B0601
2 D0507 1      2         13    37      166  0.014    0.014    *              0.348 moderate  D0507
3 B0704 1      2         13    37      192. 0.022    0.022    *              0.326 moderate  B0704
GIFTs_elements %>%
  melt() %>%
  dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
  inner_join(GIFT_db, by = "Code_element") %>%
  filter(Code_element %in% pairwise_sig_table$GIFT) %>%
  left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
  distinct(Genome, Code_element, .keep_all = TRUE) %>%
  mutate(cluster = factor(cluster)) %>%
  droplevels() %>%
  arrange(cluster, Genome) %>%
  mutate(trait = Element) %>%
  ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
    geom_tile(colour = "white", linewidth = 0.2) +
    scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
    facet_grid(~ cluster, scales = "free_x", space = "free_x") + 
    theme_grey(base_size = 8) +
    theme(
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
      strip.text.x = element_text(face = "bold")
    ) +
    labs(y = "Trait", x = "Genome", fill = "GIFT")
Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1121 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.

7.3.4.2 host_type-wise

GIFTs that are different between EHI and GTDB

wilcox_results <- sapply(gift_cols, function(g) {
  wilcox.test(as.formula(paste(g, "~ host_type")), data = gift_df_meta)$p.value
})

wilcox_results_adj <- p.adjust(wilcox_results, method = "BH")

data.frame(
  GIFT = gift_cols,
  p_value = wilcox_results,
  p_adj = wilcox_results_adj
)
       GIFT     p_value     p_adj
B0101 B0101 0.553951606 0.7865086
B0102 B0102 0.454067596 0.7865086
B0103 B0103         NaN       NaN
B0104 B0104 0.699118745 0.7865086
B0105 B0105 0.819762622 0.8679840
B0106 B0106 0.699118745 0.7865086
B0204 B0204         NaN       NaN
B0205 B0205 0.699118745 0.7865086
B0206 B0206         NaN       NaN
B0207 B0207 0.699118745 0.7865086
B0208 B0208 0.699118745 0.7865086
B0209 B0209 0.699118745 0.7865086
B0210 B0210 0.699118745 0.7865086
B0211 B0211 0.669187269 0.7865086
B0212 B0212         NaN       NaN
B0213 B0213         NaN       NaN
B0214 B0214         NaN       NaN
B0215 B0215 0.315200758 0.7865086
B0216 B0216         NaN       NaN
B0217 B0217         NaN       NaN
B0218 B0218         NaN       NaN
B0219 B0219 0.699118745 0.7865086
B0220 B0220         NaN       NaN
B0221 B0221         NaN       NaN
B0302 B0302         NaN       NaN
B0303 B0303         NaN       NaN
B0307 B0307 0.699118745 0.7865086
B0309 B0309         NaN       NaN
B0310 B0310         NaN       NaN
B0401 B0401         NaN       NaN
B0402 B0402         NaN       NaN
B0403 B0403         NaN       NaN
B0601 B0601 0.377067975 0.7865086
B0602 B0602         NaN       NaN
B0603 B0603 0.699118745 0.7865086
B0604 B0604         NaN       NaN
B0605 B0605         NaN       NaN
B0701 B0701 0.553951606 0.7865086
B0702 B0702         NaN       NaN
B0703 B0703         NaN       NaN
B0704 B0704 0.376743600 0.7865086
B0705 B0705         NaN       NaN
B0706 B0706         NaN       NaN
B0707 B0707 0.699118745 0.7865086
B0708 B0708         NaN       NaN
B0709 B0709         NaN       NaN
B0710 B0710         NaN       NaN
B0711 B0711         NaN       NaN
B0712 B0712         NaN       NaN
B0801 B0801         NaN       NaN
B0802 B0802         NaN       NaN
B0803 B0803         NaN       NaN
B0804 B0804         NaN       NaN
B0805 B0805         NaN       NaN
B0901 B0901         NaN       NaN
B0902 B0902         NaN       NaN
B0903 B0903         NaN       NaN
B1004 B1004         NaN       NaN
B1006 B1006         NaN       NaN
B1008 B1008         NaN       NaN
B1011 B1011         NaN       NaN
B1012 B1012         NaN       NaN
B1014 B1014         NaN       NaN
B1021 B1021         NaN       NaN
B1022 B1022         NaN       NaN
B1024 B1024         NaN       NaN
B1026 B1026         NaN       NaN
B1028 B1028 0.466154044 0.7865086
B1029 B1029         NaN       NaN
B1041 B1041 0.376743600 0.7865086
B1042 B1042         NaN       NaN
D0101 D0101 0.699118745 0.7865086
D0102 D0102 0.699118745 0.7865086
D0103 D0103         NaN       NaN
D0104 D0104         NaN       NaN
D0201 D0201         NaN       NaN
D0202 D0202         NaN       NaN
D0203 D0203         NaN       NaN
D0204 D0204         NaN       NaN
D0205 D0205         NaN       NaN
D0206 D0206         NaN       NaN
D0207 D0207         NaN       NaN
D0208 D0208         NaN       NaN
D0209 D0209         NaN       NaN
D0210 D0210         NaN       NaN
D0211 D0211         NaN       NaN
D0212 D0212         NaN       NaN
D0213 D0213         NaN       NaN
D0301 D0301         NaN       NaN
D0302 D0302         NaN       NaN
D0303 D0303         NaN       NaN
D0304 D0304         NaN       NaN
D0305 D0305         NaN       NaN
D0306 D0306         NaN       NaN
D0307 D0307         NaN       NaN
D0308 D0308         NaN       NaN
D0309 D0309 1.000000000 1.0000000
D0310 D0310         NaN       NaN
D0401 D0401         NaN       NaN
D0402 D0402         NaN       NaN
D0403 D0403         NaN       NaN
D0404 D0404         NaN       NaN
D0405 D0405         NaN       NaN
D0406 D0406         NaN       NaN
D0407 D0407         NaN       NaN
D0408 D0408         NaN       NaN
D0501 D0501         NaN       NaN
D0502 D0502         NaN       NaN
D0503 D0503         NaN       NaN
D0504 D0504         NaN       NaN
D0505 D0505         NaN       NaN
D0506 D0506         NaN       NaN
D0507 D0507 0.733176720 0.7998291
D0508 D0508 0.006259432 0.2253395
D0509 D0509 0.699118745 0.7865086
D0510 D0510         NaN       NaN
D0511 D0511         NaN       NaN
D0512 D0512         NaN       NaN
D0513 D0513         NaN       NaN
D0516 D0516         NaN       NaN
D0517 D0517         NaN       NaN
D0518 D0518         NaN       NaN
D0601 D0601         NaN       NaN
D0602 D0602         NaN       NaN
D0603 D0603         NaN       NaN
D0604 D0604         NaN       NaN
D0606 D0606         NaN       NaN
D0607 D0607         NaN       NaN
D0608 D0608         NaN       NaN
D0609 D0609         NaN       NaN
D0610 D0610         NaN       NaN
D0611 D0611         NaN       NaN
D0612 D0612         NaN       NaN
D0613 D0613         NaN       NaN
D0701 D0701         NaN       NaN
D0702 D0702         NaN       NaN
D0704 D0704         NaN       NaN
D0705 D0705         NaN       NaN
D0706 D0706         NaN       NaN
D0708 D0708         NaN       NaN
D0709 D0709         NaN       NaN
D0801 D0801         NaN       NaN
D0802 D0802         NaN       NaN
D0804 D0804 0.027200351 0.4896063
D0805 D0805         NaN       NaN
D0806 D0806         NaN       NaN
D0807 D0807 0.393627743 0.7865086
D0808 D0808         NaN       NaN
D0809 D0809         NaN       NaN
D0810 D0810         NaN       NaN
D0811 D0811         NaN       NaN
D0812 D0812         NaN       NaN
D0813 D0813         NaN       NaN
D0814 D0814         NaN       NaN
D0815 D0815         NaN       NaN
D0816 D0816 0.061141985 0.5769099
D0817 D0817         NaN       NaN
D0818 D0818         NaN       NaN
D0819 D0819         NaN       NaN
D0901 D0901         NaN       NaN
D0902 D0902         NaN       NaN
D0903 D0903         NaN       NaN
D0904 D0904         NaN       NaN
D0905 D0905         NaN       NaN
D0906 D0906 0.553951606 0.7865086
D0907 D0907         NaN       NaN
D0908 D0908 0.107578984 0.7745687
D0910 D0910         NaN       NaN
D0911 D0911         NaN       NaN
D0912 D0912         NaN       NaN
S0101 S0101         NaN       NaN
S0103 S0103         NaN       NaN
S0104 S0104         NaN       NaN
S0105 S0105 0.064101099 0.5769099
S0201 S0201 0.699118745 0.7865086
S0202 S0202 1.000000000 1.0000000
S0301 S0301 0.699118745 0.7865086
effect_size <- sapply(gift_cols, function(g) {
  median(gift_df_meta[[g]][gift_df_meta$host_type=="human"]) -
    median(gift_df_meta[[g]][gift_df_meta$host_type=="animal"])
})

wilcox_res_host_type <- data.frame(
  GIFT = gift_cols,
  p_value = wilcox_results,
  p_adj = wilcox_results_adj,
  effect = effect_size
)


wilcox_res_host_type %>% dplyr::select(GIFT, p_adj, effect) %>% filter(p_adj<0.05)
[1] GIFT   p_adj  effect
<0 rows> (o 0- extensión row.names)
GIFT_db%>%
  filter(Code_element %in% c("B0215", "B0701", "D0508") )
   Code_bundle Code_element Code_function       Domain                Function       Element
1      B021501        B0215           B02 Biosynthesis Amino acid biosynthesis     Histidine
2      B070101        B0701           B07 Biosynthesis    Vitamin biosynthesis Thiamine (B1)
3      B070102        B0701           B07 Biosynthesis    Vitamin biosynthesis Thiamine (B1)
4      B070103        B0701           B07 Biosynthesis    Vitamin biosynthesis Thiamine (B1)
5      B070104        B0701           B07 Biosynthesis    Vitamin biosynthesis Thiamine (B1)
6      D050801        D0508           D05  Degradation  Amino acid degradation        Lysine
7      D050802        D0508           D05  Degradation  Amino acid degradation        Lysine
8      D050803        D0508           D05  Degradation  Amino acid degradation        Lysine
9      D050804        D0508           D05  Degradation  Amino acid degradation        Lysine
10     D050805        D0508           D05  Degradation  Amino acid degradation        Lysine
11     D050806        D0508           D05  Degradation  Amino acid degradation        Lysine
                                                                                                                                                 Definition
1  K00765 ((K01523 K01496),K11755,K14152) (K01814,K24017) ((K02501+K02500),K01663) ((K01693 K00817 (K04486,K05602,K18649)),(K01089 K00817)) (K00013,K14152)
2                                                                  (((K03148+K03154) K03151),(K03150 K03149)) K03147 ((K00941 K00788),K14153,K21219) K00946
3                                                                             (((K03148+K03154) K03151),(K03153 K03149 K10810)) K03147 K00941 K00788 K00946
4                                                                                                  (K22699,K03147) ((K00941 (K00788,K21220)),K21219) K00946
5                                                                                                                             (K00941 K00788),K14153,K21219
6                                                                                          4.1.1.18 2.6.1.82 1.2.1.19 2.6.1.48 1.2.1.20 1.14.11.64 1.1.5.13
7                                                                                                             1.13.12.2 3.5.1.30 1.6.1.48 1.2.1.20 2.8.3.13
8                                                                                                                                         2.6.1.36 1.2.1.31
9                                                                                                     4.1.1.18 2.6.1.82 1.2.1.19 2.6.1.48 1.2.1.20 2.8.3.13
10                                                                                                         K01582 K09251 K00137 K07250 K00135 K15737 K15736
11                                     K00468 K01506 (K14268,K07250) K00135 ((K15737 K15736),(K01041 K00252 (K01692,K01825,K01782) (K01825,K01782) K00626))
wilcox_res_host_type <- map_df(gift_cols, function(g) {
  
 
  if(length(unique(gift_df_meta[[g]])) < 2) return(NULL)
  
  tryCatch({
    # Calculate p-value
    res <- gift_df_meta %>%
      wilcox_test(as.formula(paste0("`", g, "` ~ host_type")))
    
    # Calculate effect size (Rank-Biserial Correlation)
    eff <- gift_df_meta %>%
      wilcox_effsize(as.formula(paste0("`", g, "` ~ host_type")))
    
    # Combine results
    res %>%
      left_join(eff, by = c("group1", "group2", ".y." = ".y.", "n1", "n2")) %>%
      mutate(GIFT = g)
  }, error = function(e) return(NULL))
})

#  Adjust P-values and filter
wilcox_res_host_type <- wilcox_res_host_type %>%
  mutate(p_adj = p.adjust(p, method = "BH")) %>%
  dplyr::select(GIFT, group1, group2, p_adj, effsize, magnitude) %>%
  filter(p_adj < 0.05) %>%
  arrange(desc(abs(effsize))) # Sort by strongest effect

# View the top results
print(wilcox_res_host_type)
# A tibble: 0 × 6
# ℹ 6 variables: GIFT <chr>, group1 <chr>, group2 <chr>, p_adj <dbl>, effsize <dbl>, magnitude <ord>
host_class_colors <- c(
  "Mammalia"  = "#8B1E3F",  
  "Reptilia"  = "#3A7D44", 
  "Aves"      = "#E1B12C"   
)

GIFTs_elements %>%
  melt() %>%
  dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
  inner_join(GIFT_db, by = "Code_element") %>%
  filter(Code_element %in% wilcox_res_host_type$GIFT) %>%
  left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
  distinct(Genome, Code_element, .keep_all = TRUE) %>%
  mutate(cluster = factor(cluster)) %>%
  droplevels() %>%
  arrange(cluster, Genome) %>%
  mutate(trait = Element) %>%
  ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
    geom_tile(colour = "white", linewidth = 0.2) +
    scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
  new_scale_fill() +
  
  # Add the host_type bar at the very top or bottom
  geom_tile(aes(y = -0.5, fill = host_class), height = 0.5) +
    facet_grid(~ host_type, scales = "free_x", space = "free_x") + 
  scale_fill_manual(values = host_class_colors, name = "Host class")
    theme(
      axis.text.x = element_blank()
    ) +
    labs(y = "Trait", x = "Genome", fill = "GIFT")