#!/usr/bin/env bash
# Runs our test-case metric evaluation for every configured mode and model.
##################################################################################################
### **Default parameters**
# Evaluation modes; the main loop runs the evaluator once per entry, in order.
MODES=("functionality_specification" "assert_specification")

# Forwarded unchanged to the evaluator's --verbose flag.
VERBOSE=True

# Root of the result tree. The path is relative, so it resolves against the
# directory the script is started from. Created up front if it is missing.
OUTPUT_DIR=../../code/evaluation_test_case_our_metric
mkdir -p "$OUTPUT_DIR"

# Metric weights, forwarded unchanged to the evaluator's --*-weight flags.
LINE_WEIGHT=0.5   # --line-weight
BRANCH_WEIGHT=0.5 # --branch-weight
CORR_WEIGHT=0.7   # --corr-weight
AVC_WEIGHT=0.40   # --avc-weight
TS_WEIGHT=0.45    # --ts-weight
CE_WEIGHT=0.15    # --ce-weight
##################################################################################################
### **Custom parameters**
# Dataset to evaluate on.
# Options: "humaneval", "mbpp"
DATASET='humaneval'

# Where the test cases under evaluation come from.
# Options: "base", "ours", "gpt", "original_dataset"
OUTPUT_JSONLS_MODE='ours'

##################################################################################################
# Select the canonical JSONL that is passed to the evaluator as --canon_jsonl.
# Only the two documented datasets are accepted. Any other value (e.g. a typo)
# used to fall through to the MBPP file silently; it is now a fatal error so
# the misconfiguration is visible immediately.
case "$DATASET" in
  humaneval)
    CANON_JSONL="../../data/evalplus-0.1.1/HumanEvalPlus.jsonl"
    ;;
  mbpp)
    CANON_JSONL="../../data/mbppplus-0.2.0/MbppPlus.jsonl"
    ;;
  *)
    printf 'error: unsupported DATASET "%s" (expected "humaneval" or "mbpp")\n' \
      "$DATASET" >&2
    exit 1
    ;;
esac

# Choose the list of models whose outputs are evaluated for the selected
# OUTPUT_JSONLS_MODE. An unrecognised mode is a fatal error: previously MODELS
# stayed unset in that case and the evaluation loop silently did nothing.
case "$OUTPUT_JSONLS_MODE" in
  base)
    MODELS=(
      "DeepSeek"
      #"Mistral"
    )
    ;;
  ours)
    MODELS=(
      "SFT-DeepSeek"
      "RL-DeepSeek"
    )
    ;;
  gpt)
    MODELS=(
      "chatgpt"
    )
    ;;
  original_dataset)
    MODELS=(
      "original_dataset"
    )
    ;;
  *)
    printf 'error: unsupported OUTPUT_JSONLS_MODE "%s" (expected "base", "ours", "gpt" or "original_dataset")\n' \
      "$OUTPUT_JSONLS_MODE" >&2
    exit 1
    ;;
esac



# Run the evaluator for every (mode, model) combination.
#
# For each mode the candidate input files are collected first; every file that
# exists is then handed to evaluation_test_case_our_metric.py. Missing inputs
# are skipped with a warning. A failing evaluator run does not stop the
# remaining runs, but it is counted and the script exits non-zero at the end,
# so failures are no longer masked by a later successful run.
FAILED_RUNS=0

for MODE in "${MODES[@]}"; do
  # Map the mode to its results sub-directory and the --load-variant value.
  # Any mode other than "functionality_specification" uses the contracts setup.
  if [[ "$MODE" == "functionality_specification" ]]; then
    SUBDIR="functionality"
    SUBSUBDIR="functionality_specification"
    LOAD_VARIANT="no_contracts"
  else
    SUBDIR="contracts"
    SUBSUBDIR="assert_specification"
    LOAD_VARIANT="in_contracts"
  fi

  # Collect one input path per model for this mode.
  OUTPUT_JSONLS=()
  for MODEL in "${MODELS[@]}"; do
    if [[ "$OUTPUT_JSONLS_MODE" == "original_dataset" ]]; then
      # The dataset file itself serves as the input to evaluate.
      if [[ "$DATASET" == "humaneval" ]]; then
        OUTPUT_JSONLS+=("../../data/evalplus-0.1.1/HumanEvalPlus.jsonl")
      elif [[ "$DATASET" == "mbpp" ]]; then
        OUTPUT_JSONLS+=("../../data/mbppplus-0.2.0/MbppPlus.jsonl")
      fi
    else
      # # 2 - filtering # only contracts or functionality
      # OUTPUT_JSONLS+=("../../code/evaluation_test_case_pass_k/${DATASET}/pre_filtering/${SUBSUBDIR}/${MODEL}/${MODEL}_pre_filtering_${SUBDIR}_results_filtered.json")
      # 1 - filtering # contracts + functionality : we use this one
      OUTPUT_JSONLS+=("../../code/evaluation_test_case_pass_k/${DATASET}/pre_filtering/${SUBSUBDIR}/${MODEL}/${MODEL}_pre_filtering_results_filtered.json")
    fi
  done

  for OUTPUT_JSONL in "${OUTPUT_JSONLS[@]}"; do
    if [[ ! -f "$OUTPUT_JSONL" ]]; then
      echo "⚠️  (missing, skipped) $OUTPUT_JSONL"
      continue
    fi

    # File name without directory and without its .jsonl / .json extension.
    if [[ "$OUTPUT_JSONL" == *".jsonl" ]]; then
      MODEL_TAG=$(basename "$OUTPUT_JSONL" .jsonl)
    else
      MODEL_TAG=$(basename "$OUTPUT_JSONL" .json)
    fi

    # Everything before the first "_pre_filtering"; equals MODEL_TAG when the
    # name does not contain that marker.
    MODEL_TAG2="${MODEL_TAG%%_pre_filtering*}"

    printf '\n▶ Evaluating: %s (Mode: %s)\n' "$MODEL_TAG" "$MODE"

    if ! python ../../code/utils/evaluation_test_case_our_metric.py \
      --data "$DATASET" \
      --canon_jsonl "$CANON_JSONL" \
      --output_jsonl "$OUTPUT_JSONL" \
      --mode "$MODE" \
      --line-weight "$LINE_WEIGHT" \
      --branch-weight "$BRANCH_WEIGHT" \
      --corr-weight "$CORR_WEIGHT" \
      --avc-weight "$AVC_WEIGHT" \
      --ts-weight "$TS_WEIGHT" \
      --ce-weight "$CE_WEIGHT" \
      --load-variant "$LOAD_VARIANT" \
      --verbose "$VERBOSE" \
      --output_dir "$OUTPUT_DIR/$DATASET/$MODE/$MODEL_TAG2/$MODEL_TAG"; then
      # Keep going so one bad input does not block the other evaluations.
      printf 'error: evaluation failed for %s (Mode: %s)\n' "$MODEL_TAG" "$MODE" >&2
      FAILED_RUNS=$((FAILED_RUNS + 1))
    fi
  done
done

if ((FAILED_RUNS > 0)); then
  printf 'error: %d evaluation run(s) failed\n' "$FAILED_RUNS" >&2
  exit 1
fi