import os
import json
from markdown_it import MarkdownIt
from loguru import logger
from typing import Union, Dict, List, Optional
import copy
from pathlib import Path


class MD:
    """Convert a Markdown document into a nested, JSON-serialisable chapter tree.

    The document is tokenised with ``MarkdownIt`` and the flat token stream is
    folded into nested dicts shaped like ``self.chapter_struct``: every heading
    opens a chapter, and deeper headings become its sub-chapters.
    """

    def __init__(self) -> None:
        self.md = MarkdownIt()
        # Level 0 is a synthetic root that sits above every real heading, so
        # all headings in the document end up as its descendants.
        self.init_chapter_level = 0
        self.init_token_index = 0
        # Template that is deep-copied for every chapter. "content" holds text
        # and image items as well as nested chapter dicts, in document order;
        # "sub_chapter" holds only the nested chapter dicts.
        self.chapter_struct = {
            "chapter_level": "",
            "title": None,
            "content": [],
            "sub_chapter": [],
        }

    def get_init_chartlevel(self, token_tag: str) -> Optional[int]:
        """Return the numeric level encoded in a heading tag.

        Args:
            token_tag: Tag name of a token, e.g. ``"h2"`` or ``"p"``.

        Returns:
            The integer after the ``h`` (``"h2"`` -> ``2``), or ``None`` when
            the tag does not start with ``h`` or the remainder is not an
            integer (e.g. ``"hr"``).
        """
        if not token_tag.startswith("h"):
            return None
        try:
            return int(token_tag.split("h")[-1])
        except ValueError:
            return None

    @staticmethod
    def _inline_to_content(inline_token) -> List[dict]:
        """Turn the children of a paragraph's inline token into content items.

        Consecutive ``text`` children are concatenated into one
        ``{"type": "text", ...}`` item; each ``image`` child flushes the pending
        text and adds an ``{"type": "image", ...}`` item. Whitespace-only text
        children and every other child type are ignored.
        """
        if inline_token.type != "inline" or not inline_token.children:
            return []

        items: List[dict] = []
        pending: List[str] = []

        def flush() -> None:
            text = "".join(pending).strip()
            if text:
                items.append({"type": "text", "content": text})
            pending.clear()

        for child in inline_token.children:
            if child.type == "text" and child.content.strip():
                pending.append(child.content)
            elif child.type == "image":
                # Emit the text seen so far first to preserve document order.
                flush()
                items.append({"type": "image", "alt": child.content, "src": child.attrs.get("src", "")})
        flush()
        return items

    def token2struct(self, tokens, index: int, chapter_level: int) -> tuple[int, dict]:
        """Build the chapter that starts at ``tokens[index]``.

        Args:
            tokens: Flat token sequence as produced by ``self.md.parse``.
            index: Position of the first token belonging to this chapter. For a
                real chapter this is its own ``heading_open`` token.
            chapter_level: Heading level of this chapter (0 for the root).

        Returns:
            ``(next_index, chapter)``. ``next_index`` is the position of the
            first token that does not belong to this chapter: either a heading
            of the same or a higher level, or ``len(tokens)``.
        """
        chapter_struct = copy.deepcopy(self.chapter_struct)
        chapter_struct["chapter_level"] = f"h{chapter_level}"
        own_tag = f"h{chapter_level}"
        # Track whether this chapter's own heading was consumed separately from
        # the title value. An empty heading must still count as consumed,
        # otherwise it would be mistaken for a sibling and close the chapter
        # before any of its content is read.
        heading_consumed = chapter_struct["title"] is not None

        while index < len(tokens):
            token = tokens[index]

            if token.type == "heading_open":
                level = self.get_init_chartlevel(token_tag=token.tag)

                if token.tag == own_tag and not heading_consumed:
                    # This chapter's own heading: the title text lives in the
                    # inline token that follows the heading_open token.
                    heading_consumed = True
                    has_inline = index + 1 < len(tokens) and tokens[index + 1].type == "inline"
                    if has_inline:
                        chapter_struct["title"] = tokens[index + 1].content
                        index += 2
                    else:
                        chapter_struct["title"] = ""
                        index += 1
                    continue

                if level is not None:
                    if level > chapter_level:
                        # Deeper heading: recurse to collect the sub-chapter.
                        index, next_chapter_struct = self.token2struct(
                            tokens=tokens, index=index, chapter_level=level
                        )
                        # The same dict is referenced from both lists.
                        chapter_struct["content"].append(next_chapter_struct)
                        chapter_struct["sub_chapter"].append(next_chapter_struct)
                        continue
                    # Same or higher level: this chapter ends here and the
                    # caller handles the heading.
                    return index, chapter_struct

            elif token.type == "paragraph_open" and index + 1 < len(tokens):
                index += 1
                chapter_struct["content"].extend(self._inline_to_content(tokens[index]))

            index += 1
        return index, chapter_struct

    def md2json(self, markdown_path: str, if_save=True, output_dir="") -> Optional[Union[Dict, List]]:
        """Parse a Markdown file into the chapter tree and optionally save it as JSON.

        Args:
            markdown_path: Path of the Markdown file, read as UTF-8.
            if_save: When true, write the result to ``<stem>.json``.
            output_dir: Directory for the JSON file; created if missing. An
                empty string writes into the current working directory.

        Returns:
            The root chapter dict, or ``None`` when ``markdown_path`` is not an
            existing file.
        """
        if not os.path.isfile(markdown_path):
            logger.warning(f"file not exists {markdown_path}")
            return None

        with open(markdown_path, "r", encoding="utf-8") as f:
            content = f.read()
        tokens = self.md.parse(content)
        _, all_chapters = self.token2struct(
            tokens=tokens, index=self.init_token_index, chapter_level=self.init_chapter_level
        )

        if if_save:
            json_file_name = Path(markdown_path).stem + ".json"
            # os.makedirs("") raises FileNotFoundError, so only create the
            # directory when one was actually given.
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            output_path = os.path.join(output_dir, json_file_name)
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(all_chapters, f, ensure_ascii=False, indent=4)
            logger.info(f"saved json to: {output_path}")
        else:
            logger.info(f"processed {markdown_path}")
        return all_chapters


if __name__ == "__main__":
    # Batch conversion: for every document folder under
    # <root_dir>/<dataset>/, convert the Markdown files found in its "auto"
    # subdirectory and write the JSON into <output_root>/<dataset>/.
    md = MD()
    root_dir = "data/chartdata/processed/mineru_python"
    # Datasets to process; uncomment an entry to include it in the run.
    input_dirs = [
        # "CNNIC",
        # "GlobalTerrorismDatabase",
        # "Guizhou",
        # "NationalBureauOfStatistics",
        # "telecomworld",
        # "worldbank",
        "oced"
    ]
    output_root = "data/chartdata/processed_json"
    for input_dir in input_dirs:
        input_root_dir = os.path.join(root_dir, input_dir)
        # All documents of one dataset share a single output directory.
        output_dir = os.path.join(output_root, input_dir)
        with os.scandir(input_root_dir) as entries:
            for entry in entries:
                if not entry.is_dir():
                    continue
                input_auto_dir = os.path.join(entry.path, "auto")
                if not os.path.isdir(input_auto_dir):
                    # Skip this folder instead of aborting the whole batch.
                    logger.warning(f"missing auto dir, skipped: {input_auto_dir}")
                    continue
                with os.scandir(input_auto_dir) as md_entries:
                    for md_entry in md_entries:
                        if md_entry.is_file() and md_entry.name.endswith((".md", ".markdown")):
                            # DirEntry.path already contains the scanned
                            # directory; joining the entry onto that directory
                            # again would duplicate the prefix for relative
                            # paths and point at a file that does not exist.
                            md.md2json(markdown_path=md_entry.path, if_save=True, output_dir=output_dir)
