library(dplyr)

# Read the ORA results.
# NOTE(review): GeneRatio and BgRatio are assumed to hold "k/n" strings
# (clusterProfiler-style output) -- confirm against whatever writes this CSV.
ora_results <- read.csv("/workdir/execution_outputs/kegg_ora_results.csv", stringsAsFactors = FALSE)

# Convert a vector of "numerator/denominator" strings to numeric quotients.
# as.character() matters: read.csv() types the columns of a header-only file
# as logical, and strsplit() rejects non-character input. vapply() always
# yields a numeric vector (numeric(0) for empty input), whereas sapply() would
# hand back list() there. Malformed or missing entries become NA.
ratio_to_numeric <- function(ratio) {
    parts <- strsplit(as.character(ratio), "/", fixed = TRUE)
    vapply(parts, function(p) as.numeric(p[1]) / as.numeric(p[2]), numeric(1))
}

# Calculate fold enrichment as GeneRatio divided by BgRatio
ora_results$fold_enrichment <- ratio_to_numeric(ora_results$GeneRatio) /
    ratio_to_numeric(ora_results$BgRatio)

# Identify cancer-related pathways
# Keywords are literal text (not regular expressions); several are stems
# (e.g. "oncog") that are meant to match the beginning of a longer word.
cancer_keywords <- c("cancer", "carcinoma", "tumor", "oncog", "p53", "apoptosis", "cell cycle", 
                    "DNA repair", "metastasis", "angiogenesis", "PI3K", "mTOR", "Wnt", "Hippo",
                    "TGF", "MAPK", "ErbB", "VEGF", "Ras", "Jak-STAT")

# Report whether a pathway description mentions any cancer keyword.
#
# Matching is case-insensitive and literal, and a keyword must begin at a word
# boundary: "Ras" matches "Ras signaling pathway" but no longer matches the
# "ras" buried inside "Phosphotransferase". Stems still match longer words
# ("oncog" -> "oncogenic").
#
# Args:
#   description: pathway description text. If a vector is supplied, the result
#                is TRUE when any element matches any keyword.
#   keywords:    character vector of literal keywords; defaults to the
#                script-level `cancer_keywords`. A keyword must not contain
#                the two-character sequence "\E" (it would end the literal
#                quoting used below).
# Returns:
#   A single TRUE/FALSE. NA descriptions and an empty keyword set give FALSE.
is_cancer_related <- function(description, keywords = cancer_keywords) {
    description_lower <- tolower(description)
    hits <- vapply(keywords, function(kw) {
        # \b anchors at a word start; \Q...\E makes the keyword literal (PCRE)
        pattern <- paste0("\\b\\Q", tolower(kw), "\\E")
        any(grepl(pattern, description_lower, perl = TRUE))
    }, logical(1))
    any(hits)
}

# Flag each pathway. vapply() guarantees a plain, unnamed logical vector even
# when the results table has no rows; sapply() would return list() in that
# case, which cannot be used as a logical row index further down.
ora_results$cancer_related <- vapply(as.character(ora_results$Description), is_cancer_related,
                                     logical(1), USE.NAMES = FALSE)

# Assemble the summary report: an overview table plus three views of the
# results that all share the same set of columns.
report_columns <- c("ID", "Description", "fold_enrichment", "pvalue", "p.adjust", "Count")

summary_report <- list(
    # Headline metrics; c() coerces the computed counts to character so they
    # sit alongside the fixed labels in one column.
    analysis_summary = data.frame(
        Metric = c("Input genes", "Genes converted to Entrez ID", "Conversion rate", 
                   "Total enriched pathways", "Significant pathways (p.adj < 0.05)",
                   "Cancer-related pathways", "Analysis method"),
        Value = c("70", "66", "94.3%", 
                  nrow(ora_results), sum(ora_results$p.adjust < 0.05),
                  sum(ora_results$cancer_related), "Over-representation Analysis (ORA)")
    ),

    # All pathways, highest fold enrichment first
    top_by_enrichment = ora_results[order(ora_results$fold_enrichment, decreasing = TRUE),
                                    report_columns],

    # All pathways, smallest adjusted p-value first
    top_by_significance = ora_results[order(ora_results$p.adjust), report_columns],

    # Only the pathways flagged as cancer-related, in file order
    cancer_pathways = ora_results[ora_results$cancer_related, report_columns]
)

# Print the summary report
cat("=== KEGG PATHWAY ENRICHMENT ANALYSIS SUMMARY ===\n\n")

cat("ANALYSIS OVERVIEW:\n")
print(summary_report$analysis_summary, row.names = FALSE)
cat("\n")

cat("TOP 10 PATHWAYS BY FOLD ENRICHMENT:\n")
print(head(summary_report$top_by_enrichment, 10), row.names = FALSE)
cat("\n")

cat("TOP 10 PATHWAYS BY SIGNIFICANCE (Adjusted P-value):\n")
print(head(summary_report$top_by_significance, 10), row.names = FALSE)
cat("\n")

if (nrow(summary_report$cancer_pathways) > 0) {
    cat("CANCER-RELATED PATHWAYS IDENTIFIED:\n")
    print(summary_report$cancer_pathways, row.names = FALSE)
} else {
    cat("CANCER-RELATED PATHWAYS:\n")
    cat("No explicitly cancer-related pathways found using current keywords.\n")

    # Notes on pathways that are not matched by the keywords but may still be
    # cancer-relevant, keyed by the pathway name to look for. A note is printed
    # only when its pathway is actually among the enrichment results, so the
    # report never mentions a pathway that was not found.
    indirect_notes <- c(
        "Longevity regulating pathway" =
            "- Longevity regulating pathway may relate to cellular aging and cancer\n",
        "Insulin signaling pathway" =
            "- Insulin signaling pathway is involved in growth regulation\n",
        "Signaling pathways regulating pluripotency" =
            "- Signaling pathways regulating pluripotency may relate to stem cell biology\n"
    )
    enriched_descriptions <- tolower(as.character(ora_results$Description))
    note_applies <- vapply(names(indirect_notes), function(pathway) {
        any(grepl(tolower(pathway), enriched_descriptions, fixed = TRUE))
    }, logical(1))

    if (any(note_applies)) {
        cat("However, some pathways may be indirectly cancer-relevant:\n")
        cat(indirect_notes[note_applies], sep = "")
    }
}
cat("\n")

# Export the annotated table, ordered by adjusted p-value (smallest first)
export_columns <- c("ID", "Description", "pvalue", "p.adjust", "qvalue", "Count",
                    "GeneRatio", "BgRatio", "fold_enrichment", "cancer_related", "geneID")
significance_order <- order(ora_results$p.adjust)
final_results <- ora_results[significance_order, export_columns]

write.csv(final_results, "/workdir/execution_outputs/kegg_enrichment_complete_results.csv", row.names = FALSE)

# List the output files of the analysis.
# NOTE(review): only file 2 is written by the code above; file 1 is the input
# read earlier, and files 3-4 are presumably produced by other steps -- confirm.
generated_files <- c(
    "1. /workdir/execution_outputs/kegg_ora_results.csv - Raw ORA results",
    "2. /workdir/execution_outputs/kegg_enrichment_complete_results.csv - Complete annotated results",
    "3. /workdir/execution_outputs/kegg_enrichment_barplot.png - Static bar plot",
    "4. /workdir/execution_outputs/kegg_enrichment_interactive.html - Interactive bar plot"
)
cat("=== FILES GENERATED ===\n")
cat(paste0(generated_files, "\n"), sep = "")

cat("\n=== KEY FINDINGS ===\n")
cat("• Successfully performed KEGG pathway enrichment analysis using 70 differentially expressed genes\n")

# Report the number of pathways that pass the adjusted p-value threshold, not
# the total row count, so this line agrees with the "Significant pathways
# (p.adj < 0.05)" metric in the overview table.
n_significant <- sum(ora_results$p.adjust < 0.05, na.rm = TRUE)
cat("• Found", n_significant, "significantly enriched pathways using over-representation analysis\n")

if (nrow(ora_results) > 0) {
    top_enriched <- summary_report$top_by_enrichment[1, ]
    top_significant <- summary_report$top_by_significance[1, ]
    # sprintf() keeps the values flush against the parentheses; cat()'s default
    # separator would print "( 3.2 -fold enriched)".
    cat(sprintf("• Top pathway by fold enrichment: %s (%s-fold enriched)\n",
                top_enriched$Description, round(top_enriched$fold_enrichment, 2)))
    cat(sprintf("• Most significant pathway: %s (p.adj = %s)\n",
                top_significant$Description,
                format(top_significant$p.adjust, scientific = TRUE)))
} else {
    # With an empty table the first-row lookups would print "NA"
    cat("• No enriched pathways were present in the results, so no top pathway is reported\n")
}
cat("• Generated visualization showing top 10 pathways by fold enrichment\n")