import os
import json
import pandas as pd
import re


def process_single_json(file_path):
    """Parse one model-output JSON file and score each of its segments.

    The decoded JSON is iterated item by item (presumably a list of dicts --
    confirm against whatever produces these files). Items without an
    ``"output"`` key are skipped. For the remaining items, everything up to
    and including the last ``</think>`` marker is discarded and the last
    number found in the rest of the text is used as the segment score.

    Decimal literals (e.g. ``-5.0``) are preferred. Only when the text holds
    no decimal literal is the last plain integer (e.g. ``-5``) used, so that
    integer scores are no longer silently recorded as 0. When the text holds
    no number at all the segment score is 0.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        A dict with the keys ``"score"`` (mean of the segment scores, 0 when
        there are none), ``"segment_count"``, ``"processed_items"`` (a list
        of ``{"output": ..., "score": ...}`` dicts) and ``"segment_scores"``;
        or ``None`` when reading or parsing raised any exception (the error
        is printed, not re-raised).
    """
    print(file_path)

    think_delimiter = "</think>"
    # Compiled once per file instead of being looked up for every item.
    decimal_pattern = re.compile(r"-?\d+\.\d+")
    integer_pattern = re.compile(r"-?\d+")

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        segment_scores = []
        processed_items = []

        for item in data:
            if "output" not in item:
                continue

            # rpartition yields the whole string as the tail when the marker
            # is absent, and the text after the LAST marker otherwise.
            parsed_output = item["output"].rpartition(think_delimiter)[2]

            # Decimals win so previously parsed outputs keep their score; the
            # integer fallback only applies where the old code returned 0.
            candidates = decimal_pattern.findall(
                parsed_output
            ) or integer_pattern.findall(parsed_output)
            score = float(candidates[-1]) if candidates else 0

            segment_scores.append(score)
            processed_items.append({"output": parsed_output, "score": score})

        doc_score = sum(segment_scores) / len(segment_scores) if segment_scores else 0

        return {
            "score": doc_score,
            "segment_count": len(segment_scores),
            "processed_items": processed_items,
            "segment_scores": segment_scores,
        }

    except Exception as e:
        # Deliberate best-effort: a broken file is reported and skipped so a
        # directory-wide run can continue.
        print(f"Error in {file_path} as: {str(e)}")
        return None


def process_directory(
    directory_path,
    table_directory,
    clean_dictionary,
    output_directory=None,
    mo=None,
    lg=None,
):
    """Score every ``.json`` file under a directory tree and export results.

    Each file is handed to ``process_single_json``. For every file that
    yields a result, its cleaned items are written as JSON into
    ``clean_dictionary``, and its scores are collected. Directories and files
    are visited in sorted order so the score files and the returned table
    have a reproducible line/row order regardless of filesystem listing order.

    Args:
        directory_path: Root directory that is walked recursively.
        table_directory: Not used by this function; kept so existing callers
            continue to work.
        clean_dictionary: Directory that receives the cleaned JSON files.
            Files are written flat under their own file name, so same-named
            files from different subdirectories overwrite one another.
        output_directory: Directory for the score files. Defaults to
            ``<directory_path>/processed``. It is created if missing.
        mo: First component of the score file names.
        lg: Second component of the score file names.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully processed file
        and the columns ``file_name``, ``path``, ``score`` and
        ``segment_count`` (an empty frame when nothing was processed).

    Side effects:
        When both ``mo`` and ``lg`` are truthy, writes
        ``<mo>_<lg>.sys.score`` (one ``name<TAB>score`` line per file) and
        ``<mo>_<lg>.seg.score`` (one ``name<TAB>score`` line per segment)
        into ``output_directory``.
    """
    if output_directory is None:
        output_directory = os.path.join(directory_path, "processed")
    os.makedirs(output_directory, exist_ok=True)

    results = []
    sys_entries = []
    seg_entries = []

    for root, dirs, files in os.walk(directory_path):
        # Sorting in place makes the top-down walk descend in a stable order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".json"):
                continue

            file_path = os.path.join(root, file)
            result = process_single_json(file_path)
            if not result:
                # The file could not be parsed; the error was already printed.
                continue

            new_file_path = os.path.join(clean_dictionary, file)
            os.makedirs(os.path.dirname(new_file_path), exist_ok=True)
            with open(new_file_path, "w", encoding="utf-8") as f:
                json.dump(result["processed_items"], f, ensure_ascii=False, indent=4)

            results.append(
                {
                    "file_name": file,
                    "path": file_path,
                    "score": result["score"],
                    "segment_count": result["segment_count"],
                }
            )

            base_name = os.path.splitext(file)[0]
            sys_entries.append((base_name, result["score"]))
            seg_entries.extend(
                (base_name, score) for score in result["segment_scores"]
            )

    df = pd.DataFrame(results)

    if mo and lg:
        sys_filename = os.path.join(output_directory, f"{mo}_{lg}.sys.score")
        seg_filename = os.path.join(output_directory, f"{mo}_{lg}.seg.score")

        with open(sys_filename, "w", encoding="utf-8") as f_sys:
            f_sys.writelines(f"{base}\t{score}\n" for base, score in sys_entries)

        with open(seg_filename, "w", encoding="utf-8") as f_seg:
            # str(None) is already "None", so no special case is needed.
            f_seg.writelines(f"{base}\t{score}\n" for base, score in seg_entries)

    return df


# Language pairs and model names whose raw results are post-processed.
lang_pairs = ["en-de", "ja-zh", "en-es"]
models = ["thin_mqm_32b"]

# Visit every (language pair, model) combination, language pair outermost.
for lg, mo in ((pair, model) for pair in lang_pairs for model in models):
    # Raw model outputs are read from here.
    input_dir = f"lrm_results/{mo}/{lg}"
    # Cleaned per-file JSON is written here.
    clean_dir = f"lrm_results/clean/{mo}/{lg}"
    # Score files for all language pairs of one model share this directory.
    output_dir = f"lrm_results/scores/{mo}"
    # Passed through as table_directory.
    table_dir = f"lrm_results/clean/{mo}_{lg}_quality_report.xlsx"

    results_df = process_directory(
        directory_path=input_dir,
        table_directory=table_dir,
        clean_dictionary=clean_dir,
        output_directory=output_dir,
        mo=mo,
        lg=lg,
    )
