import os
import json
import argparse
from tqdm import tqdm
from openai import OpenAI


def generate_embedding(client, question, answer, model_name):
    """Return the embedding vector for a question/answer pair.

    The pair is rendered as two lines, ``Question:<question>`` followed by
    ``Answer:<answer>``, and sent as a single input to the embeddings
    endpoint of *client*.

    Args:
        client: Object exposing ``embeddings.create(model=..., input=...)``.
        question: Value interpolated after the ``Question:`` prefix.
        answer: Value interpolated after the ``Answer:`` prefix.
        model_name: Model identifier forwarded as ``model``.

    Returns:
        The ``embedding`` attribute of the first entry in the response's
        ``data`` sequence.
    """
    prompt = "\n".join((f"Question:{question}", f"Answer:{answer}"))
    response = client.embeddings.create(model=model_name, input=prompt)
    first_result = response.data[0]
    return first_result.embedding


def load_existing_ids(output_file):
    """Return the set of ``id`` values already stored in *output_file*.

    The file is expected to hold a JSON array of objects. Only array
    elements that are dicts containing an ``"id"`` key contribute to the
    result; everything else is ignored.

    Args:
        output_file: Path of the JSON results file.

    Returns:
        A set of ids. The set is empty when the file does not exist, is not
        valid JSON, or its top-level value is not a JSON array.
    """
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return set()
    except json.JSONDecodeError:
        # Unparseable content (including an empty file) means no usable ids.
        return set()

    # Valid JSON that is not an array (e.g. ``null`` or a number) would not
    # be iterable as records; treat it the same as "nothing finished yet".
    if not isinstance(data, list):
        return set()

    return {item["id"] for item in data if isinstance(item, dict) and "id" in item}


def append_to_json(output_file, record):
    """Append *record* to the JSON array stored in *output_file*.

    The existing array is read, the record is added, and the complete array
    is written back. A missing file, or one whose content is not valid JSON,
    is treated as an empty array.

    The new content is first written to a sibling ``<output_file>.tmp`` file
    and then moved over the target with ``os.replace``. If serialisation or
    writing fails, the previously saved results are left untouched instead
    of being truncated.

    Args:
        output_file: Path of the JSON results file.
        record: JSON-serialisable object to append.

    Raises:
        ValueError: If the file holds valid JSON that is not an array.
        TypeError: If *record* cannot be serialised by ``json.dump``.
    """
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        data = []
    except json.JSONDecodeError:
        # Unparseable content is discarded and the array starts afresh.
        data = []

    if not isinstance(data, list):
        raise ValueError(
            f"{output_file!r} does not contain a JSON array; refusing to overwrite it"
        )
    data.append(record)

    tmp_path = f"{os.fspath(output_file)}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Swap the fully written file into place in a single step.
        os.replace(tmp_path, output_file)
    except BaseException:
        # Drop the partial temp file; the original output is still intact.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise


def run(args):
    """Embed every pending record from the input file and persist it.

    Client settings come from the environment: ``OPENAI_API_KEY`` and
    ``OPENAI_BASE_URL`` are passed to ``OpenAI`` only when set to a
    non-empty value, and ``OPENAI_EMBED_MODEL`` selects the model
    (default ``text-embedding-3-large``).

    Records whose ``id`` is already present in the output file are skipped.
    Each remaining record gets an ``embedding`` key and is appended to the
    output file immediately after it is processed.

    Args:
        args: Namespace providing ``data_path`` (input JSON file) and
            ``output_file`` (JSON results file).
    """
    env = os.environ
    model_name = env.get("OPENAI_EMBED_MODEL", "text-embedding-3-large")

    # Forward only the settings that are actually configured.
    optional_settings = {
        "api_key": env.get("OPENAI_API_KEY"),
        "base_url": env.get("OPENAI_BASE_URL"),
    }
    client = OpenAI(**{key: value for key, value in optional_settings.items() if value})

    with open(args.data_path, "r", encoding="utf-8") as source:
        records = json.load(source)

    already_done = load_existing_ids(args.output_file)

    for entry in tqdm(records):
        if entry.get("id") not in already_done:
            try:
                vector = generate_embedding(
                    client,
                    entry.get("query", ""),
                    entry.get("answer", ""),
                    model_name,
                )
            except Exception as err:
                # NOTE(review): a failure is stored as a string in place of
                # the vector, and the record is still written with its id.
                vector = f"[ERROR] {err}"
            entry["embedding"] = vector
            append_to_json(args.output_file, entry)


if __name__ == "__main__":
    # Command-line entry point: both options are mandatory and are handed
    # to run() as the attributes ``data_path`` and ``output_file``.
    parser = argparse.ArgumentParser()
    for flag in ("--data-path", "--output-file"):
        parser.add_argument(flag, required=True)
    run(parser.parse_args())
