import json
import random
import base64

def read_prompt_from_file(file_path):
    """Return the contents of a UTF-8 text file with surrounding whitespace removed.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's text, stripped of leading and trailing whitespace.
    """
    with open(file_path, mode='r', encoding='utf-8') as prompt_file:
        raw_text = prompt_file.read()
    return raw_text.strip()

# Load the answer prompt from disk once, at import time. The path is relative
# to the current working directory, so importing this module raises
# FileNotFoundError when the file is not present there.
# Example of the kind of text the file holds:
# "Please provide a direct answer to the question without any additional explanations or comments. Only output the final answer, keeping the format concise and clear."
# NOTE(review): ANSWER_PROMPT_cot is not referenced anywhere else in the
# visible part of this file - confirm it is still needed.
ANSWER_PROMPT_cot = read_prompt_from_file("./chart_cot/prompt/combined_cot.txt")

# Chain-of-thought prompt template used for the user turn of every sample.
# It is filled in with str.format, so "{original_question}" is the only
# placeholder and any literal brace added to the text must be doubled
# ("{{" / "}}").
# NOTE(review): the wording contains typos ("a answer", "STEPBYSTEP",
# "question.Your"). They are deliberately left untouched here because this
# text is written verbatim into the generated samples; fix them only together
# with every other consumer of the same prompt.
prompt_cot='''
========================================
ROLE
========================================
You are an expert vision-language analyst.  
Your job is to look at the image, read the question, and provide a answer.

========================================
CRITICAL RULES (must follow all)
========================================
1.  **STEPBYSTEP THINKING:** You need to think step-by-step first before answering the question.Your thought process (which you may output in the <think> tag) should explicitly focus on:
    *   **Axes:** What do the horizontal (X-axis) and vertical (Y-axis) represent? Note their labels, units, and scale.
    *   **Data Points:** Locate the specific bars, points, lines, or other points relevant to the question.
    *   **Context:** Read the chart's title, legend, and any other text to fully understand the context.
2.  **FINAL ANSWER** Your output MUST contain the answer tag: `<answer>your answer</answer>`.
3.  **STRICT FORMAT:** The answer inside the `<answer>` tag must be the final, concise result (e.g., a single number). Do not include explanations or units unless required by the chart's notation.

========================================
INPUT FIELDS
========================================
Question      : {original_question}  
'''

def load_existing_records(filepath):
    """Load every JSON object stored in a JSON Lines file.

    Lines that are empty after stripping whitespace are skipped.

    Args:
        filepath: Path of the JSONL file to read.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty list is returned when the file does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as handle:
            stripped_rows = map(str.strip, handle)
            return [json.loads(row) for row in stripped_rows if row]
    except FileNotFoundError:
        # A missing file simply means there is nothing to load yet.
        return []

def get_thought_string(thought):
    """Flatten a thought value into a single string.

    Args:
        thought: Either a string, or a dict that may hold the keys
            "step1-thought" through "step5-thought".

    Returns:
        The string itself when ``thought`` is a string; the values of the
        step keys that are present, concatenated in step order with no
        separator, when it is a dict; "" for any other type.
    """
    if isinstance(thought, str):
        return thought
    if not isinstance(thought, dict):
        return ""

    # Only steps 1..5 are considered; other keys in the dict are ignored.
    step_keys = (f"step{number}-thought" for number in range(1, 6))
    return "".join(thought[key] for key in step_keys if key in thought)

def convert_record(record):
    """Convert one source record into chat-style training samples.

    One sample is produced per entry of ``record["answers"]``. Each sample
    has a user message ("<image>" followed by ``prompt_cot`` with the
    question filled in) and an assistant message holding the thought inside
    ``<think>`` tags and the answer inside ``<answer>`` tags, on separate
    lines. When ``record["image"]`` is truthy it is attached to every sample
    as a one-element "images" list.

    Entries whose thought or answer text contains "given answer" or
    "original answer" (case-insensitive) are skipped.

    Args:
        record: Dict with an optional "answers" list (dicts with "question",
            "answer" and "thought") and an optional "image" value.

    Returns:
        A list of sample dicts; empty when the record has no answers or all
        of them were filtered out.
    """
    qa_items = record.get("answers")
    if not qa_items:
        return []

    image = record.get("image")
    forbidden = ("given answer", "original answer")
    new_records = []

    for data in qa_items:
        question = data.get("question")
        if question is None:
            question = ""

        # Keep falsy-but-meaningful answers such as 0, and accept non-string
        # answers instead of failing on string concatenation.
        raw_answer = data.get("answer")
        answer = "" if raw_answer is None else str(raw_answer)

        # Thoughts may be a plain string or a dict of "stepN-thought"
        # entries; get_thought_string flattens both (and maps None to "").
        thought_text = get_thought_string(data.get("thought"))

        # Drop samples whose text refers back to a provided answer.
        combined = f"{thought_text} {answer}".lower()
        if any(phrase in combined for phrase in forbidden):
            continue

        rec = {
            "messages": [
                {
                    "role": "user",
                    "content": "<image>"
                    + prompt_cot.format(original_question=question),
                },
                {
                    "role": "assistant",
                    "content": (
                        f"<think>{thought_text}</think>\n"
                        f"<answer>{answer}</answer>"
                    ),
                },
            ]
        }
        if image:
            # A fresh list per sample so samples never share mutable state.
            rec["images"] = [image]

        new_records.append(rec)

    return new_records


def main(input_file='', output_file='', max_records=180000):
    """Convert a JSONL file of source records and merge the result into the output file.

    Records already present in ``output_file`` are loaded first, newly
    converted samples are added to them, the combined list is shuffled, and
    ``output_file`` is rewritten with one JSON object per line.

    Args:
        input_file: Path of the JSONL file holding the source records. The
            default is an empty placeholder; a real path must be supplied.
        output_file: Path of the JSONL file to merge into and overwrite. The
            default is an empty placeholder; a real path must be supplied.
        max_records: Upper bound on the number of newly converted samples.

    Raises:
        FileNotFoundError: If ``input_file`` cannot be opened.
    """
    # Read existing records
    existing_records = load_existing_records(output_file)

    new_records = []
    with open(input_file, "r", encoding="utf-8") as fin:
        for line in fin:
            remaining = max_records - len(new_records)
            if remaining <= 0:
                break
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Parsing error: {e}")
                continue
            # One record can yield several samples; truncate so the cap is
            # never exceeded by the last record processed.
            new_records.extend(convert_record(record)[:remaining])

    print(f"Converted {len(new_records)} new records.")
    all_records = existing_records + new_records

    random.shuffle(all_records)

    with open(output_file, "w", encoding="utf-8") as fout:
        fout.writelines(
            json.dumps(rec, ensure_ascii=False) + "\n" for rec in all_records
        )

# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
