library(dplyr)
library(readr)
library(plotly)
library(htmlwidgets)

# Load data
clusters <- read_csv("/workdir/patient_cluster_assignment.csv")
mrna <- read_csv("/workdir/KIRC_mRNA_top_column_cleaned.csv")

print("Setting up data for differential expression analysis...")

# Convert mRNA data to matrix format (genes x patients): the first column is
# dropped from the matrix and the `unnamed_0` column supplies the row names.
gene_names <- mrna$unnamed_0
mrna_matrix <- as.matrix(mrna[, -1])
rownames(mrna_matrix) <- gene_names

# Align cluster assignments with the matrix columns.
# Sorting `clusters` alone is not enough: a patient missing from either table
# would shift the group labels relative to the expression columns. Keep only
# IDs present in both tables, in matrix column order, so that element i of
# cluster_df$cluster_assignment always describes column i of mrna_matrix.
sample_ids <- colnames(mrna_matrix)
cluster_ids <- as.character(clusters$patient_id)
matched_ids <- sample_ids[sample_ids %in% cluster_ids]

if (length(matched_ids) == 0) {
  stop(
    "No patient IDs are shared between the cluster assignments and the ",
    "expression matrix columns",
    call. = FALSE
  )
}

n_unlabelled <- length(sample_ids) - length(matched_ids)
n_unmeasured <- sum(!cluster_ids %in% sample_ids)
if (n_unlabelled > 0 || n_unmeasured > 0) {
  warning(
    n_unlabelled, " expression column(s) without a cluster assignment and ",
    n_unmeasured, " cluster row(s) without expression data were dropped",
    call. = FALSE
  )
}

mrna_matrix <- mrna_matrix[, matched_ids, drop = FALSE]
# match() takes the first row per ID, so duplicated patient rows cannot
# lengthen the group vector.
cluster_df <- clusters[match(matched_ids, cluster_ids), ]
stopifnot(
  identical(as.character(cluster_df$patient_id), colnames(mrna_matrix))
)

print("Performing differential expression analysis...")

#' Two-group differential expression test for a single gene
#'
#' Splits `gene_expr` into the samples labelled 1 and 2 in `groups`, drops
#' non-finite values (NA, NaN, +/-Inf), and compares the two groups with
#' `t.test()` (Welch two-sample test by default).
#'
#' @param gene_expr Numeric vector of expression values, one per sample.
#' @param groups Vector of group labels (1 or 2), parallel to `gene_expr`.
#' @return A list with elements `pvalue`, `effect_size` (mean of group 2
#'   minus mean of group 1), `mean1` and `mean2`. If either group has fewer
#'   than 3 usable values the result is `pvalue = 1`, `effect_size = 0` and
#'   NA means. If `t.test()` itself fails (for example on constant data),
#'   `pvalue` is 1 and the means / effect size are still reported.
perform_de_analysis <- function(gene_expr, groups) {
  group1_expr <- gene_expr[groups == 1]
  group2_expr <- gene_expr[groups == 2]

  # is.finite() is FALSE for NA and NaN as well as +/-Inf, so one check
  # removes every unusable value.
  group1_clean <- group1_expr[is.finite(group1_expr)]
  group2_clean <- group2_expr[is.finite(group2_expr)]

  if (length(group1_clean) < 3 || length(group2_clean) < 3) {
    return(list(
      pvalue = 1, effect_size = 0, mean1 = NA_real_, mean2 = NA_real_
    ))
  }

  mean1 <- mean(group1_clean)
  mean2 <- mean(group2_clean)

  # Effect size is a simple difference of means (for normalized data)
  effect_size <- mean2 - mean1

  # Take the p-value from the value of tryCatch(): an assignment made inside
  # the error handler would be local to the handler and leave `pvalue`
  # undefined here whenever t.test() fails.
  pvalue <- tryCatch(
    t.test(group2_clean, group1_clean)$p.value,
    error = function(e) 1  # Conservative p-value for failed tests
  )

  list(
    pvalue = pvalue, effect_size = effect_size, mean1 = mean1, mean2 = mean2
  )
}

# Apply analysis to all genes: one result list per row of the matrix
results_list <- apply(mrna_matrix, 1, function(gene_expr) {
  perform_de_analysis(gene_expr, cluster_df$cluster_assignment)
})

# Pull one statistic out of every per-gene result as a numeric vector.
# vapply() enforces exactly one value per gene (unlike sapply(), whose return
# type depends on the input); as.numeric() promotes a plain logical NA.
extract_stat <- function(field) {
  vapply(results_list, function(res) as.numeric(res[[field]]), numeric(1))
}

# Compile results into data frame
de_results <- data.frame(
  gene = gene_names,
  mean_cluster1 = extract_stat("mean1"),
  mean_cluster2 = extract_stat("mean2"),
  log2_fold_change = extract_stat("effect_size"),
  pvalue = extract_stat("pvalue"),
  stringsAsFactors = FALSE
)

# Apply multiple testing correction (Benjamini-Hochberg)
de_results$padj <- p.adjust(de_results$pvalue, method = "BH")

# Add regulation direction. A difference of exactly zero is neither up nor
# down, which matches the strict > 0 / < 0 counts reported in the summary.
de_results$regulation <- ifelse(
  de_results$log2_fold_change > 0,
  "Up in Cluster 2",
  ifelse(de_results$log2_fold_change < 0, "Down in Cluster 2", "No change")
)

# Define significance thresholds
pvalue_threshold <- 0.005
n_threshold <- 50

# Rank genes by adjusted p-value, breaking ties by absolute effect size.
# BH-adjusted p-values are heavily tied, so rank() with its default average
# ties could flag more or fewer than n_threshold genes and would disagree
# with the `rank` column used downstream. order() gives each gene a unique
# position; rows with missing values sort last.
rank_order <- order(de_results$padj, -abs(de_results$log2_fold_change))
gene_rank <- integer(nrow(de_results))
gene_rank[rank_order] <- seq_along(rank_order)

# Apply significance criteria: p < 0.005 OR top 50 genes
de_results$significant <- (de_results$pvalue < pvalue_threshold) |
  (gene_rank <= n_threshold)

# Ranked copy of the results; `rank` is the same ranking used for the
# significance flag above.
de_results_ranked <- de_results %>%
  mutate(rank = gene_rank) %>%
  arrange(rank)

print("Top 10 most significant genes:")
# head() returns fewer rows instead of NA-filled rows when fewer than 10
# genes are available.
top10 <- head(
  de_results_ranked[
    , c("gene", "log2_fold_change", "pvalue", "padj", "regulation")
  ],
  10
)
print(top10)

# Build and save the interactive volcano plot
print("Creating volcano plot...")

# Plot inputs: -log10 adjusted p-value (offset avoids -Inf), a flag for the
# ten best-ranked genes, and the colour category of every point.
top10_genes <- de_results_ranked$gene[1:10]
volcano_data <- mutate(
  de_results,
  neg_log10_padj = -log10(padj + 1e-300)
)
volcano_data <- mutate(volcano_data, is_top10 = gene %in% top10_genes)
volcano_data <- mutate(
  volcano_data,
  color_group = case_when(
    is_top10 ~ "Top 10",
    significant ~ "Significant",
    TRUE ~ "Not Significant"
  )
)

# Static styling pieces, named so the plotting calls below stay short
group_palette <- c(
  "Top 10" = "red",
  "Significant" = "orange",
  "Not Significant" = "gray"
)
point_style <- list(size = 4, opacity = 0.7)
plot_title <- list(
  text = "Volcano Plot: Differential Gene Expression Between Clusters<br><sub>Cluster 2 vs Cluster 1</sub>",
  font = list(size = 16)
)
x_axis <- list(title = "Effect Size (Mean Difference)", zeroline = TRUE)
y_axis <- list(title = "-Log10(Adjusted P-value)", zeroline = FALSE)

# Footnote placed in paper coordinates below the plotting area
footnote <- list(
  x = 0.5, y = -0.1, xref = 'paper', yref = 'paper',
  text = paste("Significance criteria: p-value < 0.005 OR top 50 genes<br>",
              "Sample sizes: Cluster 1 (n=", sum(cluster_df$cluster_assignment == 1),
              "), Cluster 2 (n=", sum(cluster_df$cluster_assignment == 2), ")"),
  showarrow = FALSE, font = list(size = 10)
)

# Scatter layer: one marker per gene with a hover tooltip
volcano_plot <- plot_ly(
  data = volcano_data,
  x = ~log2_fold_change,
  y = ~neg_log10_padj,
  color = ~color_group,
  colors = group_palette,
  text = ~paste("Gene:", gene, "<br>",
                "Effect Size:", round(log2_fold_change, 3), "<br>",
                "Adj. P-value:", formatC(padj, format = "e", digits = 2), "<br>",
                "Regulation:", regulation),
  hoverinfo = "text",
  type = "scatter",
  mode = "markers",
  marker = point_style
)

# Arrow labels for the flagged top-10 genes
top10_points <- volcano_data[volcano_data$is_top10, ]
volcano_plot <- add_annotations(
  volcano_plot,
  data = top10_points,
  x = ~log2_fold_change,
  y = ~neg_log10_padj,
  text = ~gene,
  showarrow = TRUE,
  arrowcolor = "black",
  arrowsize = 0.5,
  arrowwidth = 1,
  ax = 20,
  ay = -20,
  font = list(size = 10, color = "black")
)

# Titles, axes, legend position and the footnote
volcano_plot <- layout(
  volcano_plot,
  title = plot_title,
  xaxis = x_axis,
  yaxis = y_axis,
  hovermode = "closest",
  showlegend = TRUE,
  legend = list(x = 0.02, y = 0.98),
  annotations = footnote
)

# Write the widget to disk as HTML
saveWidget(volcano_plot, "/workdir/execution_outputs/volcano_plot_interactive.html")
print("Interactive volcano plot saved to volcano_plot_interactive.html")

# Save final ranked results: every gene flagged significant, in rank order
significant_genes <- de_results_ranked %>%
  filter(significant) %>%
  select(rank, gene, mean_cluster1, mean_cluster2, log2_fold_change, pvalue, padj, regulation)

write.csv(significant_genes, "/workdir/execution_outputs/significant_genes_ranked.csv", row.names = FALSE)

# Create a top candidates table (top 50 or p < 0.005), capped at n_threshold
# rows so the cap follows the threshold instead of a second hard-coded 50.
top_candidates <- de_results_ranked %>%
  filter(pvalue < pvalue_threshold | rank <= n_threshold) %>%
  select(rank, gene, mean_cluster1, mean_cluster2, log2_fold_change, pvalue, padj, regulation) %>%
  head(n_threshold)

write.csv(top_candidates, "/workdir/execution_outputs/top_candidate_genes.csv", row.names = FALSE)

# Summary statistics
# print() shows "\n" as a literal backslash-n, so blank separator lines are
# emitted with cat() instead.
cat("\n")
print("=== DIFFERENTIAL EXPRESSION ANALYSIS RESULTS ===")
print(paste("Sample sizes: Cluster 1 =", sum(cluster_df$cluster_assignment == 1),
            ", Cluster 2 =", sum(cluster_df$cluster_assignment == 2)))
print(paste("Total genes analyzed:", nrow(de_results)))
print(paste("Genes with adj. p-value < 0.05:", sum(de_results$padj < 0.05, na.rm = TRUE)))
print(paste("Genes with p-value < 0.005:", sum(de_results$pvalue < pvalue_threshold, na.rm = TRUE)))
print(paste("Total significant genes (p < 0.005 OR top 50):", nrow(significant_genes)))
print(paste("Genes upregulated in Cluster 2:", sum(significant_genes$log2_fold_change > 0, na.rm = TRUE)))
print(paste("Genes downregulated in Cluster 2:", sum(significant_genes$log2_fold_change < 0, na.rm = TRUE)))

cat("\n")
print("Top 10 candidate genes for biological validation:")
# head() avoids NA-filled rows when fewer than 10 candidates exist
print(head(top_candidates[, c("gene", "log2_fold_change", "pvalue", "padj", "regulation")], 10))

cat("\n")
print("Output files created:")
print("- volcano_plot_interactive.html: Interactive volcano plot")
print("- significant_genes_ranked.csv: All significant genes ranked by adjusted p-value")
print("- top_candidate_genes.csv: Top 50 candidate genes for biological validation")