import argparse
import csv
import json
import logging
import os

# Route INFO-and-above records through the root logger, prefixing each
# message with its timestamp and severity.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


# Top-level record fields copied verbatim into the extracted output.
_RECORD_FIELDS = (
    "q_id",
    "country",
    "category",
    "subcategory",
    "topic",
    "query",
    "topic_rank",
    "ranked_index",
    "image_id",
    "image_url",
    "image_info",
    "image_path",
    "description",
    "extracted_text",
    "image_category",
    "status",
    "reason",
)

# Plain question/answer categories, in the order their cells are emitted.
_QA_CATEGORIES = ("open-ended", "knowledge-based", "commonsense-based")
# Multiple-choice entries carry an extra "options" cell and are emitted last.
_MC_CATEGORY = "multiple-choice"
# Language suffixes looked up on every entry, in emission order.
_QA_LANGUAGES = ("en", "ar")


def _build_qa_row(parsed_content, image_url):
    """Flatten parsed response content into one row that starts with the image URL.

    Plain question/answer pairs (open-ended, knowledge-based,
    commonsense-based) come first, followed by the multiple-choice triples
    (question, comma-joined options, correct answer). For each entry the
    English cells precede the Arabic ones, and a language is skipped when any
    of its keys is missing from the entry.
    """
    row = [image_url]

    for category in _QA_CATEGORIES:
        for qa in parsed_content.get(category) or []:
            for lang in _QA_LANGUAGES:
                question_key = f"question_{lang}"
                answer_key = f"answer_{lang}"
                if question_key in qa and answer_key in qa:
                    row.extend([qa[question_key], qa[answer_key]])

    # Multiple-choice goes last so the cells follow the "Q/A pairs first,
    # MC_* columns last" layout of the TSV header.
    for qa in parsed_content.get(_MC_CATEGORY) or []:
        for lang in _QA_LANGUAGES:
            question_key = f"question_{lang}"
            options_key = f"options_{lang}"
            answer_key = f"correct_answer_{lang}"
            if question_key in qa and options_key in qa and answer_key in qa:
                row.extend(
                    [
                        qa[question_key],
                        ", ".join(qa[options_key]),
                        qa[answer_key],
                    ]
                )

    return row


def extract_relevant_data(file_path):
    """Extract the relevant fields and the question/answer row from one JSON file.

    The file is expected to hold a JSON object. The known metadata fields
    that are present are copied as-is, and the JSON string stored under
    ``response.choices[0].message.content`` is decoded and stored under the
    ``"response"`` key (``{}`` when there are no choices or the content
    cannot be decoded).

    Args:
        file_path: Path of the JSON file to read (UTF-8).

    Returns:
        A ``(extracted_data, questions_answers)`` tuple. ``extracted_data`` is
        the dict described above. ``questions_answers`` is a list holding at
        most one row, ``[image_url, <Q/A cells...>, <multiple-choice cells...>]``;
        it is empty when the response contains no usable question. On any
        unexpected error (unreadable file, invalid JSON file, unexpected
        structure) the error is logged and ``(None, [])`` is returned.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        extracted_data = {key: data[key] for key in _RECORD_FIELDS if key in data}
        # Default used on every path that yields no decoded content; set now
        # so "response" is always the last key of the record.
        extracted_data["response"] = {}
        questions_answers = []

        # ``or`` fallbacks: a JSON ``null`` is treated like a missing key
        # rather than failing the whole record.
        response = data.get("response") or {}
        choices = response.get("choices") or []
        if not choices:
            return extracted_data, questions_answers

        message = choices[0].get("message") or {}
        try:
            parsed_content = json.loads(message.get("content", "{}"))
        except (json.JSONDecodeError, TypeError):
            # TypeError covers non-string content such as ``null``.
            logging.warning(
                f"Invalid JSON format in response content in file: {file_path}"
            )
            return extracted_data, questions_answers

        extracted_data["response"] = parsed_content

        qa_row = _build_qa_row(parsed_content, data.get("image_url", ""))
        # The row always starts with the image URL, so it only carries
        # questions when it has more than that single cell.
        if len(qa_row) > 1:
            questions_answers.append(qa_row)

        return extracted_data, questions_answers
    except Exception as e:
        # Per-file boundary: one malformed file must not abort a whole run.
        logging.error(f"Error processing file {file_path}: {e}")
        return None, []


def process_directory(input_dir, output_jsonl, output_tsv):
    """Process every ``.json`` file under a directory into JSONL and TSV outputs.

    The directory is walked recursively in sorted order so the output line
    order is reproducible across runs and filesystems. Each file is passed to
    ``extract_relevant_data``; extracted records become one JSON line each and
    question/answer rows become TSV rows under a fixed header.

    Args:
        input_dir: Directory to scan recursively for ``.json`` files.
        output_jsonl: Path of the JSONL file to write. It is only created when
            at least one record was extracted.
        output_tsv: Path of the tab-separated file to write. It is only
            created when at least one question/answer row was produced.

    Returns:
        None. Problems are reported through logging rather than raised.
    """
    # isdir (not exists) so a regular file is rejected up front; the explicit
    # falsy check covers a missing/None argument, which os.path would reject
    # with a TypeError.
    if not input_dir or not os.path.isdir(input_dir):
        logging.error(
            f"Input directory '{input_dir}' does not exist or is not a directory."
        )
        return

    output_data = []
    tsv_data = []

    for root, dirs, files in os.walk(input_dir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".json"):
                continue
            file_path = os.path.join(root, file)
            extracted_data, questions_answers = extract_relevant_data(file_path)
            if extracted_data:
                output_data.append(extracted_data)
            if questions_answers:
                tsv_data.extend(questions_answers)

    # Write to JSONL file
    if output_data:
        with open(output_jsonl, "w", encoding="utf-8") as out_file:
            out_file.writelines(
                json.dumps(entry, ensure_ascii=False) + "\n" for entry in output_data
            )
        logging.info(f"Processed data written to {output_jsonl}")
    else:
        logging.warning("No valid JSON files processed.")

    # Write to TSV file
    if tsv_data:
        # Fixed layout: image URL, eight question/answer pairs, then two
        # multiple-choice triples. NOTE(review): rows are written as produced,
        # so a row with a different number of cells will not line up with
        # these columns.
        header = ["Image_URL"]
        for index in range(1, 9):
            header.extend([f"Q{index}", f"A{index}"])
        for index in range(1, 3):
            header.extend(
                [f"MC_Q{index}", f"MC_Options{index}", f"MC_Answer{index}"]
            )

        # newline="" lets the csv module control line endings itself.
        with open(output_tsv, "w", encoding="utf-8", newline="") as tsv_file:
            writer = csv.writer(tsv_file, delimiter="\t")
            writer.writerow(header)
            writer.writerows(tsv_data)
        logging.info(
            f"Questions, answers, multiple-choice options, and image URLs written to {output_tsv}"
        )
    else:
        logging.warning("No questions and answers found.")


if __name__ == "__main__":
    # Command-line entry point. All three paths are mandatory: leaving one out
    # would pass None downstream and fail with a TypeError instead of a usage
    # message, so argparse is asked to enforce them.
    parser = argparse.ArgumentParser(
        description="Parse JSON files and extract relevant data."
    )
    parser.add_argument(
        "-i",
        "--input_dir",
        required=True,
        help="Directory containing JSON files.",
    )
    parser.add_argument("-j", "--output", required=True, help="Output JSONL file.")
    parser.add_argument(
        "-o",
        "--output_tsv",
        required=True,
        help="Output TSV file for questions and answers.",
    )
    args = parser.parse_args()

    process_directory(args.input_dir, args.output, args.output_tsv)
