import pandas as pd
from pathlib import Path

def _cell_to_text(value) -> str:
    """Return ``str(value)``, or an empty string when pandas treats it as missing."""
    return str(value) if pd.notnull(value) else ""


def build_train_csv(
    src_csv: str = "train_split.csv",
    prompt_file: str = "prompt.txt",
    dst_csv: str = "train.csv",
) -> None:
    """Render every row of a source CSV through a prompt template and save the result.

    The template is read from ``prompt_file`` (UTF-8, surrounding whitespace
    stripped) and filled with ``str.format`` using four named fields taken from
    the row: ``Aryl_halide_SMILES``, ``Additive_SMILES``, ``Base_SMILES`` and
    ``Ligand_SMILES``. A field whose column is absent or whose cell is NaN is
    rendered as an empty string.

    The destination CSV always has the columns ``instruction``, ``input``,
    ``output`` and ``history`` -- including when the source has no data rows,
    so downstream readers still find a header.

    Args:
        src_csv: Path of the CSV to read. Rows are expected to carry a
            ``yield`` column, which becomes ``output``.
        prompt_file: Path of the text file holding the ``str.format`` template.
        dst_csv: Path of the CSV to write (UTF-8, no index column).

    Raises:
        KeyError: If the source has data rows but no ``yield`` column, or the
            template names a field other than the four listed above.
    """
    template_keys = (
        "Aryl_halide_SMILES",
        "Additive_SMILES",
        "Base_SMILES",
        "Ligand_SMILES",
    )
    output_columns = ["instruction", "input", "output", "history"]

    # 1. Read the source table.
    df = pd.read_csv(src_csv)

    # 2. Read the prompt template.
    prompt_template = Path(prompt_file).read_text(encoding="utf-8").strip()

    # 3. Assemble one record per row.
    records = []
    for _, row in df.iterrows():
        # Only the fields the template uses are converted; ``row.get`` yields
        # None for an absent column, which is rendered as "" like a NaN cell.
        fields = {key: _cell_to_text(row.get(key)) for key in template_keys}
        records.append(
            {
                "instruction": prompt_template.format(**fields),
                "input": "",        # required by some frameworks; may stay empty
                "output": _cell_to_text(row["yield"]),
                "history": "[]",    # empty dialogue history, stored as a string
            }
        )

    # 4. Write the result. Passing ``columns`` keeps the header stable even
    #    when ``records`` is empty.
    new_df = pd.DataFrame(records, columns=output_columns)
    new_df.to_csv(dst_csv, index=False, encoding="utf-8")
    print(f"Saved {len(new_df)} rows to {dst_csv}")

if __name__ == "__main__":
    # Script entry point: convert the test split. ``prompt_file`` is not
    # passed, so the function's default template path is used.
    build_train_csv(src_csv="test_split.csv", dst_csv="test.csv")