import fitz

from utils import simplify_string, check_if_section_title


# Headings that end the last listed section when no further title is expected.
_TERMINAL_SECTION_TITLES = (
    "reference",
    "references",
    "acknowledgement",
    "acknowledgements",
    "appendix",
)


def _merge_wrapped_title(lines, index, target):
    """Normalize lines[index], appending following lines while it may be a title wrapped over several lines.

    Following lines are appended (after ``simplify_string``) for as long as
    ``target`` starts with the accumulated text but does not end with it, or
    until the page runs out of lines.
    """
    merged = simplify_string(lines[index])
    follow = index + 1
    while (follow < len(lines)
           and target.startswith(merged)
           and not target.endswith(merged)):
        merged += simplify_string(lines[follow])
        follow += 1
    return merged


def _collect_section_lines(pages_lines, start_page, title_line, next_title):
    """Gather the raw lines of one section, starting at its title line.

    Collection begins at the first line equal to ``title_line`` on or after
    ``start_page`` (the title line itself is included) and ends at the next
    section title, at a terminal heading when ``next_title`` is None, or at a
    line whose normalized text starts with "references"/"acknowledgements".
    """
    collected = []
    started = False
    for page_index in range(start_page, len(pages_lines)):
        page_lines = pages_lines[page_index]
        for line_index, raw_line in enumerate(page_lines):
            if raw_line == title_line:
                started = True
            if not started:
                continue

            if next_title is None:
                normalized = simplify_string(raw_line)
                reached_end = any(
                    check_if_section_title(normalized, heading)[0]
                    for heading in _TERMINAL_SECTION_TITLES
                )
            else:
                # The next title may be wrapped across lines in the PDF text.
                normalized = _merge_wrapped_title(page_lines, line_index, next_title)
                reached_end = check_if_section_title(normalized, next_title)[0]

            # Returning here (rather than breaking the inner loop only) makes
            # the references/acknowledgements stop hold across page boundaries.
            if reached_end or normalized.startswith(("references", "acknowledgements")):
                return collected

            collected.append(raw_line)
    return collected


def extract_sections_content(pdf_path, section_titles: list) -> dict:
    """
    Extract the text of the given sections from a PDF.

    Titles are searched in the order given; each one is only looked for after
    the previous one has been found, so titles that never match stop the
    search for all later ones.

    Args:
        pdf_path: path of the PDF, passed to ``fitz.open``.
        section_titles: list of entries ``[level, title, page_num, ...]``
            (the TOC layout); only the first three items are used.

    Returns:
        dict mapping ``(level, title, page_num)`` tuples to the section text.
        The text is the collected lines joined with spaces, starting with the
        title line. If it does not end with ".", everything after the last
        "." is dropped (e.g. ``"a apple. 8"`` -> ``"a apple."``); text that
        contains no "." at all is kept unchanged.
    """
    doc = fitz.open(pdf_path)
    try:
        # Extract every page once; sections reuse these lines instead of
        # re-running text extraction for each section.
        pages_lines = [doc[page_num].get_text().split('\n') for page_num in range(len(doc))]
    finally:
        # Close the document even if text extraction fails.
        doc.close()

    # Normalized titles used for matching against normalized page lines.
    simplified_titles = [simplify_string(entry[1]) for entry in section_titles]
    total_titles = len(simplified_titles)

    sections_content = {}
    current_section_index = 0
    for page_num, lines in enumerate(pages_lines):
        if current_section_index >= total_titles:
            break  # every requested title has been located

        for line_index, line in enumerate(lines):
            if current_section_index >= total_titles:
                break

            title = simplified_titles[current_section_index]
            candidate = _merge_wrapped_title(lines, line_index, title)
            if not check_if_section_title(candidate, title)[0]:
                continue

            next_title = None
            if current_section_index + 1 < total_titles:
                next_title = simplified_titles[current_section_index + 1]

            body_lines = _collect_section_lines(pages_lines, page_num, line, next_title)
            text = ' '.join(body_lines).strip()

            # Drop trailing residue after the final sentence, but only when a
            # "." exists: rfind() == -1 would otherwise erase the whole text.
            if text and not text.endswith('.'):
                last_period = text.rfind('.')
                if last_period != -1:
                    text = text[:last_period + 1]

            current_section = section_titles[current_section_index]
            section_key = (current_section[0], current_section[1], current_section[2])
            sections_content[section_key] = text

            current_section_index += 1

    return sections_content


def get_sections_content(title, sections_content, bookmarks, index=None, page_num=None):
    """
    Return the text of the section matching ``title`` plus its sub-sections.

    ``sections_content`` keys and ``bookmarks`` entries are paired by position
    (assumes both are in the same order and aligned one-to-one — TODO confirm
    against callers). The first bookmark whose normalized title contains the
    normalized ``title`` selects the section; following entries with a larger
    level value are treated as its sub-sections.

    Args:
        title: section title to look for.
        sections_content: dict of section text, as built by
            ``extract_sections_content``.
        bookmarks: list of entries ``[level, title, ...]``.
        index: unused by this function; kept for interface compatibility.
        page_num: unused by this function; kept for interface compatibility.

    Returns:
        list of strings: the matched section's text with everything up to and
        including the first case-insensitive occurrence of ``title`` removed
        (left whole if ``title`` does not occur in it), followed by the text
        of each sub-section. Empty list when nothing matches.
    """
    if not sections_content or not bookmarks:
        return []

    wanted = simplify_string(title)
    title_lower = title.lower()

    answer = []
    level = -1  # -1 means the requested section has not been found yet
    for key, bookmark in zip(sections_content, bookmarks):
        if level == -1:
            if wanted not in simplify_string(bookmark[1]):
                continue
            level = bookmark[0]
            text = sections_content[key]
            # Strip the heading itself; if it is absent from the text, keep
            # the text intact instead of slicing from a -1 based offset.
            position = text.lower().find(title_lower)
            if position != -1:
                text = text[position + len(title):]
            answer.append(text.strip())
        elif bookmark[0] > level:
            answer.append(sections_content[key])
        else:
            # Same level or shallower: the matched section's subtree is over.
            break

    return answer
