import glob
import json
import os
import re
from typing import Dict, List

def read_json(file_path):
    """
    Read a JSON file and return its decoded content.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's default locale encoding (the other readers in this
    module also read their inputs as UTF-8).

    :param file_path: Path to the JSON file.
    :return: The decoded JSON value (a dictionary when the top-level JSON
        value is an object).
    :raises OSError: If the file cannot be opened.
    :raises json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def merge_book_documents(book_folder):
    """
    Concatenate every non-empty ``*.txt`` file directly inside a folder.

    Files are visited in sorted path order, read as UTF-8 and stripped of
    surrounding whitespace. Files that fail to read are reported on stdout
    and skipped. Progress and error messages are printed as a side effect.

    :param book_folder: Directory holding the ``.txt`` files of one book.
    :return: The stripped file contents joined by a blank line, or ``""``
        when the folder is missing, holds no ``.txt`` files, or every file
        is empty or unreadable.
    """
    if not os.path.exists(book_folder):
        print(f"错误: 文件夹 {book_folder} 不存在")
        return ""

    paths = glob.glob(os.path.join(book_folder, "*.txt"))
    if len(paths) == 0:
        print(f"错误: 在 {book_folder} 中未找到.txt文件")
        return ""

    print(f"找到 {len(paths)} 个文件:")
    paths.sort()

    pieces = []
    for path in paths:
        print(f"  - {os.path.basename(path)}")
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                text = handle.read().strip()
        except Exception as err:
            # Best effort: report the unreadable file and keep going.
            print(f"读取文件 {path} 失败: {err}")
        else:
            if text:
                pieces.append(text)

    if pieces:
        merged = "\n\n".join(pieces)
        print(f"文档合并完成，总长度: {len(merged)} 字符")
        return merged

    print("错误: 所有文件都为空")
    return ""

def merge_books(book_folders):
    """
    Merge the ``*.txt`` files of several book folders into one string.

    Each folder is processed independently: its ``.txt`` files are read in
    sorted path order as UTF-8, stripped, and joined by a blank line.
    Folders that are missing, contain no ``.txt`` files, or yield only empty
    content are reported on stdout and skipped. Progress messages are
    printed as a side effect.

    :param book_folders: Iterable of directory paths, one per book.
    :return: The per-book texts joined by a blank line (``""`` when no
        folder contributed any content).
    """
    books = []
    for folder in book_folders:
        if not os.path.exists(folder):
            print(f"错误: 文件夹 {folder} 不存在")
            continue

        paths = glob.glob(os.path.join(folder, "*.txt"))
        if len(paths) == 0:
            print(f"错误: 在 {folder} 中未找到.txt文件")
            continue

        print(f"找到 {len(paths)} 个文件:")
        paths.sort()

        chapters = []
        for path in paths:
            print(f"  - {os.path.basename(path)}")
            try:
                with open(path, 'r', encoding='utf-8') as handle:
                    text = handle.read().strip()
            except Exception as err:
                # Best effort: report the unreadable file and keep going.
                print(f"读取文件 {path} 失败: {err}")
            else:
                if text:
                    chapters.append(text)

        if chapters:
            books.append("\n\n".join(chapters))
        else:
            print("错误: 所有文件都为空")

    combined = "\n\n".join(books)
    print(f"所有文档合并完成，总长度: {len(combined)} 字符")
    return combined

def convert_to_tokenizer_format(claude_api_payload):
    """
    Translate Claude-API style messages into tokenizer-style messages.

    * A ``user`` message must carry exactly one content block. A ``text``
      block becomes a ``user`` message; a ``tool_result`` block becomes a
      ``tool`` message carrying ``tool_call_id``. Any other block type
      produces no output message.
    * An ``assistant`` message has its ``text`` blocks concatenated into
      ``content`` and its ``tool_use`` blocks converted to OpenAI-style
      function calls, stored under the ``function_call`` key.
    * Messages with any other role are dropped.

    :param claude_api_payload: Iterable of message dicts with ``role`` and
        ``content`` (a list of content-block dicts).
    :return: List of converted message dicts, in input order.
    :raises AssertionError: If a user message does not contain exactly one
        content block.
    """
    converted = []
    for message in claude_api_payload:
        role = message["role"]

        if role == "user":
            blocks = message["content"]
            assert len(blocks) == 1, "User message content should be a single text block"
            block = blocks[0]
            kind = block["type"]
            if kind == "text":
                converted.append({"role": "user", "content": block["text"]})
            elif kind == "tool_result":
                call_id = block["tool_use_id"]
                result = block["content"]
                converted.append({
                    "role": "tool",
                    "content": result,
                    "tool_call_id": call_id,
                })
            continue

        if role != "assistant":
            continue

        text_parts = []
        calls = []
        for block in message["content"]:
            kind = block["type"]
            if kind == "text":
                text_parts.append(block["text"])
            elif kind == "tool_use":
                # OpenAI-style tool call: arguments are a JSON string.
                calls.append({
                    "id": block["id"],
                    "function": {
                        "name": block["name"],
                        "arguments": json.dumps(block["input"], ensure_ascii=False),
                    },
                    "type": "function",
                })
        converted.append({
            "role": "assistant",
            "content": "".join(text_parts),
            "function_call": calls,
        })
    return converted

def get_finished_questions_type(result_base_dir: str, book_id_list: List[str]) -> Dict[str, Dict[str, List[str]]]:
    """
    Report which question parts already have a final result on disk.

    The layout scanned is ``<result_base_dir>/<book_id>/<qid>/<part>/`` with
    ``part`` being ``"mc"`` or ``"oe"``. A part counts as finished when its
    directory contains at least one file whose name starts with
    ``final_result`` and ends with ``.json``.

    :param result_base_dir: Root directory of the results tree.
    :param book_id_list: Book IDs to inspect.
    :return: ``{book_id: {qid: [finished_parts]}}``. Every requested book ID
        is present (possibly with an empty dict); questions without any
        finished part are omitted. Parts are listed in ``mc``, ``oe`` order.
    """

    def _has_final_result(part_dir: str) -> bool:
        # True when the part directory exists and holds a final_result*.json.
        if not os.path.exists(part_dir):
            return False
        return any(
            name.startswith("final_result") and name.endswith(".json")
            for name in os.listdir(part_dir)
        )

    summary: Dict[str, Dict[str, List[str]]] = {}
    for book_id in book_id_list:
        per_question: Dict[str, List[str]] = {}
        summary[book_id] = per_question

        book_dir = os.path.join(result_base_dir, book_id)
        if not os.path.exists(book_dir):
            continue

        for qid in os.listdir(book_dir):
            question_dir = os.path.join(book_dir, qid)
            if not os.path.isdir(question_dir):
                continue
            done = [
                part
                for part in ("mc", "oe")
                if _has_final_result(os.path.join(question_dir, part))
            ]
            if done:
                per_question[qid] = done

    return summary


def find_bxx_txt_files(directory, reverse=False):
    """
    Collect the names of all ``BNN.txt`` files under a directory tree.

    The tree is walked recursively; a file qualifies when its name is ``B``
    followed by exactly two digits and the ``.txt`` extension. Only bare
    file names (not paths) are returned, so the same name found in several
    sub-directories appears once per occurrence.

    :param directory: Root directory to walk.
    :param reverse: Sort the names in descending order when true.
    :return: Sorted list of matching file names.
    """
    is_bxx = re.compile(r'^B\d{2}\.txt$').match
    names = [
        name
        for _root, _dirs, filenames in os.walk(directory)
        for name in filenames
        if is_bxx(name)
    ]
    names.sort(reverse=reverse)
    return names

def build_choice_postprompt(choices: list[str]):
    """
    Build the instruction telling the model how to format its final answer.

    Options are lettered ``A``, ``B``, ... by position; only the number of
    choices matters, not their text. With two or more choices the result
    reads ``\\boxed{A}, \\boxed{B}, or \\boxed{C}``. With a single choice the
    lone option is emitted without a leading "or", which previously produced
    the ungrammatical "in the form of or \\boxed{A}".

    :param choices: The answer choices of a multiple-choice question.
    :return: The prompt suffix, starting with a newline.
    """
    boxed = [f"\\boxed{{{chr(65 + index)}}}" for index in range(len(choices))]
    prompt = "\nPlease write your final answer in the form of "
    if len(boxed) <= 1:
        # Zero options leaves the bare lead-in; one option needs no "or".
        return prompt + "".join(boxed)
    return prompt + ", ".join(boxed[:-1]) + ", or " + boxed[-1]


def render_one_question_nqa(data, as_mcq=True):
    """
    Render one question record as prompt text plus its reference answer.

    :param data: Question record. ``"Question"`` is required; ``"Options"``
        (a mapping of option key to option text) is required when ``as_mcq``
        is true; ``"Gold"`` and ``"Answer"`` are optional and default to
        ``"unknown"``.
    :param as_mcq: When true, append one ``"<key>. <text> "`` line per option
        and return the ``"Gold"`` answer; otherwise return the bare question
        with the open-ended ``"Answer"``.
    :return: Tuple ``(question_text, reference_answer)``.
    :raises KeyError: If a required key is missing from ``data``.
    """
    header = f"Question: {data['Question']}"
    reference_open = data.get("Answer", "unknown")
    reference_mcq = data.get("Gold", "unknown")

    if not as_mcq:
        return header, reference_open

    option_lines = [f"{key}. {val} \n" for key, val in data["Options"].items()]
    return header + '\n' + "".join(option_lines), reference_mcq

def contains_keyword(text: str, kw: str) -> bool:
    """
    Case-insensitively test whether ``kw`` occurs in ``text``.

    Keywords containing a character in the ``\\u4e00``-``\\u9fff`` range are
    matched as plain substrings. All other keywords must match between
    regex word boundaries (``\\b``).

    :param text: Text to search.
    :param kw: Keyword to look for.
    :return: True when the keyword is found.
    """
    needle = kw.lower()
    haystack = text.lower()

    if re.search(r"[\u4e00-\u9fff]", kw) is None:
        whole_word = rf"\b{re.escape(needle)}\b"
        return re.search(whole_word, haystack, flags=re.IGNORECASE) is not None

    return needle in haystack

def sanitize_for_json(self, obj):
    """
    Recursively convert SDK / complex objects to plain JSON-safe types.

    Conversion rules, applied in order:

    1. Objects exposing ``model_dump()`` (or, failing that, ``dict()``) are
       replaced by the result of that call.
    2. Dicts are rebuilt with sanitized values (keys are passed through
       unchanged); lists, tuples and sets become lists of sanitized items.
    3. ``bytes`` / ``bytearray`` are decoded as UTF-8 with replacement
       characters.
    4. Anything else is returned as-is when ``json.dumps`` accepts it, and
       as ``str(obj)`` otherwise.

    The recursion goes through this function directly. It previously went
    through ``self._sanitize_for_json``, which does not exist at module
    level, so any nested container raised ``AttributeError`` unless ``self``
    happened to provide that method.

    :param self: Unused placeholder kept for backward compatibility with
        existing call sites; it is only forwarded to recursive calls.
    :param obj: The value to sanitize.
    :return: A structure made of dicts, lists, strings and JSON scalars.
    """
    # 1) Prefer the object's own plain-data export when it has one.
    if hasattr(obj, "model_dump"):
        obj = obj.model_dump()
    elif hasattr(obj, "dict"):
        obj = obj.dict()

    # 2) Recurse into containers.
    if isinstance(obj, dict):
        return {k: sanitize_for_json(self, v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple, set)):
        return [sanitize_for_json(self, v) for v in obj]

    # 3) Bytes-like values become text.
    if isinstance(obj, (bytes, bytearray)):
        try:
            return obj.decode("utf-8", errors="replace")
        except Exception:
            return str(obj)

    # 4) Last resort: keep the value only if json can serialize it.
    try:
        json.dumps(obj, ensure_ascii=False)
        return obj
    except TypeError:
        return str(obj)

import re
import string
from typing import Dict, List, Tuple
import json5


# Matches a single character in the \u4e00-\u9fff range.
CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fff]')


def has_chinese_chars(data) -> bool:
    """
    Tell whether the text form of ``data`` contains a Chinese character.

    ``data`` may be any object; it is formatted to a string first, so a
    list or dict is searched through its printed representation.

    :param data: Value to inspect.
    :return: True when at least one character in the ``\\u4e00``-``\\u9fff``
        range is present.
    """
    return CHINESE_CHAR_RE.search(format(data)) is not None


# Tokens dropped from keyword lists by tokenize_and_filter, string_tokenizer,
# split_text_into_keywords and parse_keyword. The list mixes:
#   * English function words and contraction fragments,
#   * whitespace / escape-sequence tokens and ASCII + full-width punctuation,
#   * Chinese particles and question words,
#   * generic request words about the document itself (e.g. 'summarize',
#     'article', 'pdf' and their Chinese counterparts).
# NOTE(review): this is a list, so every `x in WORDS_TO_IGNORE` test is a
# linear scan; callers that only need membership could use a set copy.
WORDS_TO_IGNORE = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it',
    "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
    'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before',
    'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
    'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
    "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
    "wouldn't", '', '\\t', '\\n', '\\\\', '\n', '\t', '\\', ' ', ',', '，', ';', '；', '/', '.', '。', '-', '_', '——', '的',
    '吗', '是', '了', '啊', '呢', '怎么', '如何', '什么', '(', ')', '（', '）', '【', '】', '[', ']', '{', '}', '？', '?', '！', '!',
    '“', '”', '‘', '’', "'", '"', ':', '：', '讲了', '描述', '讲', '总结', 'summarize', '总结下', '总结一下', '文档', '文章', 'article',
    'paper', '文稿', '稿子', '论文', 'PDF', 'pdf', '这个', '这篇', '这', '我', '帮我', '那个', '下', '翻译', '说说', '讲讲', '介绍', 'summary'
]

# Punctuation character sets used by the tokenizer helpers below:
# clean_en_token strips these characters from both ends of a token, and
# tokenize_and_filter / string_tokenizer discard tokens made up solely of them.
# ASCII punctuation from string.punctuation, minus '%', '.' and '@'.
ENGLISH_PUNCTUATIONS = string.punctuation.replace('%', '').replace('.', '').replace(
    '@', '')  # English punctuations to remove. We're separately handling %, ., and @
# Full-width / CJK punctuation marks, plus the ASCII underscore.
CHINESE_PUNCTUATIONS = '。？！，、；：“”‘’（）《》【】……—『』「」_'
# Combined set: every character here is treated as punctuation.
PUNCTUATIONS = ENGLISH_PUNCTUATIONS + CHINESE_PUNCTUATIONS


def clean_en_token(token: str) -> str:
    """
    Strip leading and trailing punctuation from a token.

    Tokens recognised as special cases are returned untouched. The check is
    anchored at the start of the token and accepts any of:

    * one or more ``<letter>.`` pairs at the start (abbreviations such as
      ``U.S.A.``),
    * a ``word@word.word`` shape at the start (e-mail-like),
    * digits followed by ``%`` at the end of the token (percentages),
    * a token made entirely of ``\\u4e00``-``\\u9fff`` characters.

    :param token: Raw token.
    :return: The token unchanged for special cases, otherwise the token with
        every leading/trailing character found in ``PUNCTUATIONS`` removed.
    """
    special_case = re.match(
        r'^(?:[A-Za-z]\.)+|\w+[@]\w+\.\w+|\d+%$|^(?:[\u4e00-\u9fff]+)$',
        token,
    )
    if special_case is not None:
        return token

    return token.strip(PUNCTUATIONS)


def tokenize_and_filter(input_text: str) -> List[str]:
    """
    Split text into lower-cased tokens and drop stop words and punctuation.

    Tokens are extracted with a verbose regex whose alternatives are tried
    in order: abbreviations, numbers (optionally with a decimal part and a
    trailing ``%``), words (allowing inner hyphens/apostrophes), and an
    e-mail-like shape. Each token is passed through ``clean_en_token`` and
    lower-cased; it is kept unless it appears in ``WORDS_TO_IGNORE`` or is
    made up solely of ``PUNCTUATIONS`` characters.

    The return annotation previously said ``str``; the function has always
    returned a list of tokens.

    NOTE(review): because earlier alternatives win, the e-mail alternative
    can only match text starting with ``-`` or ``'`` -- confirm whether full
    e-mail addresses are meant to survive as single tokens.

    :param input_text: Text to tokenize.
    :return: The surviving tokens, lower-cased, in order of appearance.
    """
    patterns = r"""(?x)                    # Enable verbose mode, allowing regex to be on multiple lines and ignore whitespace
                (?:[A-Za-z]\.)+          # Match abbreviations, e.g., U.S.A.
                |\d+(?:\.\d+)?%?         # Match numbers, including percentages
                |\w+(?:[-']\w+)*         # Match words, allowing for hyphens and apostrophes
                |(?:[\w\-\']@)+\w+       # Match email addresses
                """

    cleaned_tokens = (
        clean_en_token(token).lower() for token in re.findall(patterns, input_text)
    )
    return [
        token
        for token in cleaned_tokens
        if token not in WORDS_TO_IGNORE
        and not all(char in PUNCTUATIONS for char in token)
    ]


def string_tokenizer(text: str) -> List[str]:
    """
    Tokenize text into stemmed keywords.

    The text is lower-cased and stripped first. Text containing Chinese
    characters is segmented with ``jieba`` and segments consisting only of
    punctuation are dropped; any other text goes through
    ``tokenize_and_filter``, falling back to whitespace splitting if that
    raises. Entries of ``WORDS_TO_IGNORE`` are then removed and the rest is
    stemmed with the English Snowball stemmer.

    :param text: Text to tokenize.
    :return: List of stemmed tokens.
    """
    normalized = text.lower().strip()

    if not has_chinese_chars(normalized):
        try:
            candidates = tokenize_and_filter(normalized)
        except Exception:
            candidates = normalized.split()
    else:
        import jieba
        candidates = [
            segment
            for segment in jieba.lcut(normalized)
            if not all(char in PUNCTUATIONS for char in segment)
        ]

    kept = [word for word in candidates if word not in WORDS_TO_IGNORE]

    import snowballstemmer
    return snowballstemmer.stemmer('english').stemWords(kept)


def split_text_into_keywords(text: str) -> List[str]:
    """
    Turn free text into a list of keywords.

    The text is tokenized and stemmed by ``string_tokenizer``; stems that
    appear in ``WORDS_TO_IGNORE`` are then filtered out.

    :param text: Text to split.
    :return: The remaining stemmed keywords, in order.
    """
    return [word for word in string_tokenizer(text) if word not in WORDS_TO_IGNORE]


def parse_keyword(text):
    """
    Extract keywords from either a JSON(5) payload or plain text.

    When ``text`` parses with ``json5``, the lists under ``keywords_zh`` and
    ``keywords_en`` (if present) are lower-cased, stemmed and filtered
    against ``WORDS_TO_IGNORE``; the keywords derived from the payload's
    ``text`` field are then appended. If ``text`` does not parse, or anything
    goes wrong while handling the parsed payload (for example the ``text``
    field is missing), the raw input is split into keywords instead.

    :param text: JSON(5) string or free text.
    :return: List of keywords.
    """
    try:
        parsed = json5.loads(text)
    except Exception:
        return split_text_into_keywords(text)

    import snowballstemmer
    stemmer = snowballstemmer.stemmer('english')

    # Structured payload: gather the explicit keyword lists, then the text.
    try:
        raw_keywords = []
        for field in ('keywords_zh', 'keywords_en'):
            if field in parsed and isinstance(parsed[field], list):
                raw_keywords += [kw.lower() for kw in parsed[field]]

        keywords = [
            stem
            for stem in stemmer.stemWords(raw_keywords)
            if stem not in WORDS_TO_IGNORE
        ]
        keywords += split_text_into_keywords(parsed['text'])
        return keywords
    except Exception:
        # TODO: This catch is too broad.
        return split_text_into_keywords(text)
