########################################################################
# 0.  Raw numbers  ----------------------------------------------------
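#     Row order (one row of six values per set):
#       Depth-M1 → Depth-M2 → Depth-M3 → Breadth-M1 → Breadth-M2 → Breadth-M3
#     Within each row, values follow the model order used by to_matrix():
#       gpt4o, Tulu8B, Llama8B, Llama70B, Mistral7B, Qwen32B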
########################################################################

# ── semantic-similarity (↑ higher is better) ──────────────────────────
# Semantic-similarity scores, flattened row by row (6 sets x 6 models).
# Each line is one set; the six values on a line are one score per model.
sem_vec <- c(
  0.49, 0.45, 0.46, 0.46, 0.48, 0.46,  # Depth-M1
  0.42, 0.41, 0.41, 0.40, 0.41, 0.41,  # Depth-M2
  0.41, 0.40, 0.40, 0.38, 0.40, 0.40,  # Depth-M3
  0.49, 0.42, 0.43, 0.43, 0.48, 0.45,  # Breadth-M1
  0.41, 0.42, 0.39, 0.38, 0.41, 0.40,  # Breadth-M2
  0.40, 0.41, 0.38, 0.36, 0.39, 0.38   # Breadth-M3
)

# ── ROUGE-L (↑ higher is better) ─────────────────────────────────────
# ROUGE-L scores, flattened row by row (6 sets x 6 models).
# Each line is one set; the six values on a line are one score per model.
rouge_vec <- c(
  0.11, 0.06, 0.08, 0.08, 0.08, 0.08,  # Depth-M1
  0.09, 0.05, 0.06, 0.06, 0.06, 0.07,  # Depth-M2
  0.08, 0.05, 0.06, 0.06, 0.06, 0.07,  # Depth-M3
  0.10, 0.05, 0.06, 0.07, 0.07, 0.07,  # Breadth-M1
  0.08, 0.05, 0.05, 0.05, 0.06, 0.06,  # Breadth-M2
  0.08, 0.05, 0.05, 0.05, 0.06, 0.06   # Breadth-M3
)

# ── Likert-distance (↓ lower is **better**) ───────────────────────────
# Likert-distance scores, flattened row by row (6 sets x 6 models).
# Each line is one set; the six values on a line are one score per model.
likert_vec <- c(
  1.13, 1.20, 1.22, 1.15, 1.17, 1.16,  # Depth-M1
  1.25, 1.33, 1.28, 1.18, 1.21, 1.25,  # Depth-M2
  1.30, 1.47, 1.33, 1.28, 1.26, 1.30,  # Depth-M3
  1.01, 1.30, 1.29, 1.16, 1.11, 1.05,  # Breadth-M1
  1.17, 1.25, 1.35, 1.22, 1.18, 1.16,  # Breadth-M2
  1.22, 1.29, 1.39, 1.23, 1.22, 1.22   # Breadth-M3
)

########################################################################
# 1.  Helper: reshape vector → 6×6 matrix  ----------------------------
########################################################################
# Reshape a flat score vector into a labelled sets-by-models matrix.
#
# Args:
#   v       Vector of scores laid out row by row: all models for set 1,
#           then all models for set 2, and so on.
#   n_sets  Number of rows (sets) expected. Defaults to 6.
#   models  Column labels, in the order the scores appear within each set.
#
# Returns:
#   An n_sets x length(models) matrix with row names "Set1", "Set2", ...
#   and `models` as column names.
#
# Stops with an error when `v` does not contain exactly
# n_sets * length(models) values. Without this check matrix() would recycle
# a short vector with only a warning and the scores would shift silently.
to_matrix <- function(v,
                      n_sets = 6L,
                      models = c("gpt4o", "Tulu8B", "Llama8B",
                                 "Llama70B", "Mistral7B", "Qwen32B")) {
  n_models <- length(models)
  expected <- n_sets * n_models
  if (length(v) != expected) {
    stop(
      "`v` must contain ", expected, " values (", n_sets, " sets x ",
      n_models, " models), but has ", length(v), ".",
      call. = FALSE
    )
  }

  matrix(
    v,
    nrow = n_sets,
    ncol = n_models,
    byrow = TRUE,
    dimnames = list(paste0("Set", seq_len(n_sets)), models)
  )
}

########################################################################
# 2.  Define your metrics here  ---------------------------------------
########################################################################
# Registry of metrics to analyse. Each entry is a list with
#   mat    : the score matrix produced by to_matrix()
#   higher : TRUE when larger values are better, FALSE when smaller are better
# Entry names come from the first list below; the logical vector is matched
# to it by position.
metrics <- Map(
  function(scores, higher_is_better) {
    list(mat = to_matrix(scores), higher = higher_is_better)
  },
  list(
    semantic_similarity = sem_vec,
    rouge_L             = rouge_vec,
    likert_diff         = likert_vec
  ),
  c(TRUE, TRUE, FALSE)
)

########################################################################
# 3.  Core statistical routine  ---------------------------------------
########################################################################
# Run and print the omnibus and pairwise tests for a single metric.
#
# Prints a banner, the Friedman rank-sum test across all columns of `mat`,
# and a table comparing the reference column against every other column with
# one-sided paired Wilcoxon signed-rank tests (Holm-adjusted).
#
# Args:
#   mat        Numeric matrix with column names; rows are the paired blocks
#              (sets) and columns are the models.
#   higher     TRUE if larger values are better, FALSE if smaller are better.
#   name       Label shown in the banner.
#   reference  Name of the column every other column is compared against.
#              Defaults to "gpt4o".
#
# Returns (invisibly):
#   A data frame with one row per non-reference column and the columns
#   model, mean_diff (positive = reference better), raw_p and holm_p.
analyze_metric <- function(mat, higher, name, reference = "gpt4o") {
  stopifnot(
    is.matrix(mat), is.numeric(mat), !is.null(colnames(mat)),
    is.logical(higher), length(higher) == 1L, !is.na(higher),
    is.character(reference), length(reference) == 1L
  )
  if (!reference %in% colnames(mat)) {
    stop("Reference column '", reference, "' not found in `mat`.",
         call. = FALSE)
  }

  direction <- if (higher) "(higher ↑ better)" else "(lower ↓ better)"
  cat("\n==============================================\n")
  cat("Results for", name, direction, "\n")
  cat("==============================================\n")

  ## ❶ Friedman omnibus test across all models
  print(friedman.test(mat))

  ## ❷ Pairwise Wilcoxon signed-rank tests: reference vs. each other model
  baseline_cols <- setdiff(colnames(mat), reference)
  ref_scores <- mat[, reference]

  ## One-sided alternative in the direction "reference is better".
  alt <- if (higher) "greater" else "less"

  ## exact = FALSE: use the normal approximation, which also copes with
  ## ties and zero differences.
  raw_p <- vapply(
    baseline_cols,
    function(m) {
      wilcox.test(ref_scores, mat[, m], paired = TRUE,
                  alternative = alt, exact = FALSE)$p.value
    },
    numeric(1)
  )

  holm_p <- p.adjust(raw_p, method = "holm")

  ## Effect-size sign: positive values mean "reference better".
  ## drop = FALSE keeps a matrix even for a single row or single baseline,
  ## so colMeans() always receives a two-dimensional object.
  others <- mat[, baseline_cols, drop = FALSE]
  diffs <- if (higher) ref_scores - others else others - ref_scores

  out <- data.frame(model      = baseline_cols,
                    mean_diff  = round(colMeans(diffs), 3),
                    raw_p      = signif(raw_p, 3),
                    holm_p     = signif(holm_p, 3))
  print(out, row.names = FALSE)
  invisible(out)
}

########################################################################
# 4.  Run everything  --------------------------------------------------
########################################################################
# Analyse every registered metric. Map() names the result list after the
# entries of `metrics`, so each table can be retrieved by metric name
# (e.g. results[["rouge_L"]]) as well as by position.
results <- Map(
  function(spec, nm) analyze_metric(spec$mat, spec$higher, nm),
  metrics,
  names(metrics)
)



# ==============================================
#   Results for semantic_similarity (higher ↑ better) 
# ==============================================
#   
#   Friedman rank sum test
# 
# data:  mat
# Friedman chi-squared = 17.869, df = 5, p-value = 0.003115
# 
# model mean_diff  raw_p holm_p
# Tulu8B     0.018 0.1460  0.146
# Llama8B     0.025 0.0180  0.085
# Llama70B     0.035 0.0170  0.085
# Mistral7B     0.008 0.0239  0.085
# Qwen32B     0.020 0.0178  0.085
# 
# ==============================================
#   Results for rouge_L (higher ↑ better) 
# ==============================================
#   
#   Friedman rank sum test
# 
# data:  mat
# Friedman chi-squared = 26.348, df = 5, p-value = 7.637e-05
# 
# model mean_diff  raw_p holm_p
# Tulu8B     0.038 0.0168 0.0657
# Llama8B     0.030 0.0155 0.0657
# Llama70B     0.028 0.0131 0.0657
# Mistral7B     0.025 0.0160 0.0657
# Qwen32B     0.022 0.0175 0.0657
# 
# ==============================================
#   Results for likert_diff (lower ↓ better) 
# ==============================================
#   
#   Friedman rank sum test
# 
# data:  mat
# Friedman chi-squared = 21.569, df = 5, p-value = 0.0006323
# 
# model mean_diff  raw_p holm_p
# Tulu8B     0.127 0.0175 0.0875
# Llama8B     0.130 0.0178 0.0875
# Llama70B     0.023 0.3000 0.6340
# Mistral7B     0.012 0.3920 0.6340
# Qwen32B     0.010 0.2110 0.6340