# Import the SDK and the client module
# from label_studio_sdk.client import LabelStudio
from collections import deque
import json
import copy
import sys, os
import shutil
from loguru import logger
import random
from pathlib import Path
from transformers import AutoTokenizer
from tqdm import tqdm
from urllib.parse import urlparse, unquote

from common_prompts import translate as translate
from common_prompts import LABEL_STUDIO_URL as LABEL_STUDIO_URL
from common_prompts import API_KEY as API_KEY

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from gpt_api.unigpt import GPT
from gpt_tools.deepseek_tools import DeepseekChat

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from markdown_it.common_tools import if_chart as if_chart


def parse_image_name_from_url(image_root_path: str, url: str) -> tuple[str, str]:
    """Derive the local image file name and path from an image URL.

    The last component of the URL's path is taken (query string and fragment
    are not part of it) and percent-decoded to obtain the file name.

    Args:
        image_root_path: Directory the image file is expected to live in.
        url: URL of the image.

    Returns:
        A ``(file_name, file_path)`` pair, where ``file_path`` is
        ``image_root_path`` joined with ``file_name``.
    """
    # Percent-decode only after taking the basename of the URL path.
    file_name = unquote(os.path.basename(urlparse(url).path))
    file_path = os.path.join(image_root_path, file_name)
    return file_name, file_path


# Template for the JSON document written for each input file. It is
# deep-copied before being filled in, so this module-level object is never
# mutated.
#
# Each entry appended to "image_captions" is a dict with the keys
#   "caption", "image", "image_name", "caption_translation"
# (all strings).
struct_data = dict(
    data=dict(
        file_name="",
        image_captions=[],
    )
)

def append_caption_text(captions: list[str], index: int, content: str) -> None:
    """Add ``content`` to the caption stored at ``captions[index]``, in place.

    The list is padded with empty strings until ``index`` is valid, so a slot
    may be skipped (e.g. two chart images with no text between them) without
    raising ``IndexError``. Text added to an empty slot is stored as-is; text
    added to a non-empty slot is joined with a single space.

    Args:
        captions: Caption texts, one slot per chart image; mutated in place.
        index: Slot the text belongs to.
        content: Text to add.
    """
    while len(captions) <= index:
        captions.append("")
    if captions[index]:
        captions[index] = captions[index] + " " + content
    else:
        captions[index] = content


if __name__ == "__main__":
    # Vision-language model for chart detection, text model for translation.
    vl_qwen_chat = GPT(model="qwen2.5-vl-7b", vendor="", stream=False, temperature=0.2)
    text_qwen_chat = GPT(model="qwen3_32b", vendor="", stream=False, temperature=0.2)
    # Currently unused: only referenced by the disabled tokenizer line below.
    local_tokenizer_path = Path("models/Qwen2.5-VL-7B-Instruct")
    # tokenizer = AutoTokenizer.from_pretrained(local_tokenizer_path, trust_remote_code=True)

    root_file_path = "data/chartdata/english/pewresearch"
    content_path = os.path.join(root_file_path, "contents")
    images_path = os.path.join(root_file_path, "images")
    output_path = "data/chartQA/label_studio_formal/data_to_annotate/pewresearch"
    output_images_path = os.path.join(output_path, "images")
    output_json_dir = os.path.join(output_path, "jsons")
    label_studio_image_prefix = "/data/local-files/?d=pewresearch/images/"

    os.makedirs(output_path, exist_ok=True)
    os.makedirs(output_images_path, exist_ok=True)
    os.makedirs(output_json_dir, exist_ok=True)

    failed_filenames = []
    with os.scandir(content_path) as entries:
        for entry in entries:
            if not entry.name.endswith(".json"):
                continue

            output_json_path = os.path.join(output_json_dir, entry.name)
            if os.path.exists(output_json_path):  # INFO skip files already converted
                logger.info(f"file exist: {output_json_path}")
                continue

            ls_json = copy.deepcopy(struct_data)
            ls_json["data"]["file_name"] = entry.name
            try:
                entry_path = os.path.join(content_path, entry.name)
                # Read as UTF-8 explicitly (the output is written as UTF-8 too)
                # and close the file before the slow model calls start.
                with open(entry_path, "r", encoding="utf-8") as f:
                    entry_json = json.load(f)

                # captions[i] gathers the text items that precede the i-th
                # chart image (and follow the previous chart image, if any).
                captions = []
                image_caption_index = 0  # slot that incoming text is appended to
                for text in entry_json:
                    if text["type"] != "image":
                        append_caption_text(captions, image_caption_index, text["content"])
                        logger.info(f"caption add: {text['content']}")
                        continue

                    image_name, image_path = parse_image_name_from_url(images_path, text["src"])
                    if not if_chart(vl_qwen_chat, image_path=image_path):  # INFO keep chart images only
                        # Non-chart images are dropped and do not start a new
                        # caption slot.
                        continue

                    ls_json["data"]["image_captions"].append(
                        {
                            "caption": "",
                            "image": label_studio_image_prefix + image_name,
                            "image_name": image_name,
                            "caption_translation": "",
                        }
                    )
                    image_caption_index += 1
                    shutil.copy(image_path, os.path.join(output_images_path, image_name))
                    logger.info(f"image add: {image_name}")

                # Pair every chart image with its caption slot. zip stops at
                # the shorter list, so trailing text after the last chart is
                # ignored and charts without a slot keep empty captions.
                for image_caption, caption in zip(ls_json["data"]["image_captions"], captions):
                    if not caption:
                        # Nothing to translate for a chart with no text.
                        continue
                    image_caption["caption"] = caption
                    image_caption["caption_translation"] = translate(text_qwen_chat, caption)

                with open(output_json_path, "w", encoding="utf-8") as f:
                    json.dump(ls_json, f, ensure_ascii=False, indent=4)
                    logger.info(f"saved file: {output_json_path}")
            except Exception as e:
                # Best effort: record the failure and continue with the next
                # file. Log the file name and traceback so it can be diagnosed.
                failed_filenames.append(entry.name)
                logger.exception(f"failed to process {entry.name}: {e}")
            # exit()  # WARN exit()
    print(failed_filenames)

# NOTE: files that failed to process (as printed from failed_filenames):
# ['about-1-in-4-us-teachers-say-their-school-went-into-a-gun-related-lockdown-in-the-last-school-year.json', 'income-inequality-is-greater-among-chinese-americans-than-any-other-asian-origin-group-in-the-us.json', 'key-facts-about-americans-and-guns.json', 'republicans-think-economy-will-improve-over-the-next-year-democrats-expect-it-to-get-worse.json', 'striking-findings-from-2023.json', 'striking-findings-from-2024.json']
