import pandas as pd
import json
from datasets import Dataset
# Convert a parquet dataset to JSONL twice: once as a raw row-per-line dump
# and once in chat ("messages") format.
input_path = "medical-train-00000-of-00001.parquet"
# The raw dump gets its own file. It previously shared the chat file's path,
# so the chat conversion silently overwrote it.
output_path = "medical_dataset_raw.jsonl"
# Unchanged path: this is the file that survived a run of the original script.
chat_output_path = "medical_dataset_aligned.jsonl"


def _is_missing(value):
    """Return True if *value* is a null marker (None, pd.NA or a float NaN)."""
    if value is None or value is pd.NA:
        return True
    # Only probe floats so list/array-valued cells are never passed to
    # pd.isna, which would return an array instead of a bool.
    return isinstance(value, float) and bool(pd.isna(value))


def to_chat_rows(rows):
    """Convert record dicts into chat-format entries.

    Each record with usable "prompt" and "aligned" values becomes
    ``{"messages": [user turn, assistant turn]}``. Records that lack either
    key, or whose value is missing (None / NaN / pd.NA), are skipped: a NaN
    would otherwise be serialized as the non-standard JSON token ``NaN``.

    Args:
        rows: iterable of dict records, e.g. ``df.to_dict(orient="records")``.

    Returns:
        A list of chat entries in the same order as the kept input records.
    """
    chat_rows = []
    for r in rows:
        if "prompt" not in r or "aligned" not in r:
            continue
        prompt, aligned = r["prompt"], r["aligned"]
        if _is_missing(prompt) or _is_missing(aligned):
            continue
        chat_rows.append(
            {
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": aligned},
                ]
            }
        )
    return chat_rows


if __name__ == "__main__":
    df = pd.read_parquet(input_path)

    # Fail early with a clear message instead of silently writing an empty
    # chat file when the expected columns are absent.
    missing_columns = [c for c in ("prompt", "aligned") if c not in df.columns]
    if missing_columns:
        raise KeyError(
            f"{input_path} is missing required column(s): {missing_columns}"
        )

    df.to_json(output_path, orient="records", lines=True, force_ascii=False)
    print(f"Saved as JSONL to {output_path}")

    rows = df.to_dict(orient="records")
    chat_rows = to_chat_rows(rows)

    with open(chat_output_path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in chat_rows
        )

    skipped = len(rows) - len(chat_rows)
    if skipped:
        print(f"Skipped {skipped} row(s) with missing prompt/aligned values")
    print(f"Converted format saved: {chat_output_path}")
