import fitz
import re


# A dotted section number such as "1", "2.3" or "1.2.3." (trailing dot optional).
_BARE_NUMBER_RE = re.compile(r'(?:\d+\.)*\d+\.?')
# A section number, whitespace, then the heading text.
_HEADING_RE = re.compile(r'((?:\d+\.)*\d+\.?)\s+(.*)')
# ". " followed by a capital letter: the text reads like running prose
# (e.g. a numbered list item or a citation), not like a heading.
_SENTENCE_BREAK_RE = re.compile(r'\.\s+[A-Z]')


def _parse_heading(line):
    """Split *line* into ``(numbers, title)``, or return None if it is not a heading.

    ``numbers`` is the section number as a list of ints ("1.2." -> [1, 2]);
    ``title`` is the text after the number with surrounding whitespace removed.
    """
    match = _HEADING_RE.match(line)
    if not match:
        return None
    title = match.group(2).strip()
    # An empty title is useless as a bookmark; sentence-like text is body prose.
    if not title or _SENTENCE_BREAK_RE.search(title):
        return None
    # At most one trailing dot can be present; drop it before splitting.
    numbers = [int(part) for part in match.group(1).rstrip('.').split('.')]
    return numbers, title


def _is_next_heading(prev, cur):
    """Return True if section number *cur* may directly follow *prev* in an outline.

    *prev* is None when no heading has been accepted yet; the outline must
    then start at section 1.
    """
    if prev is None:
        return cur == [1]
    depth = len(cur)
    if depth > len(prev):
        # Going deeper: only the first child of the previous heading is valid
        # (1.2 -> 1.2.1). Skipping a level fails the length comparison.
        return cur == prev + [1]
    # Same level or moving back up (1.2.3 -> 1.2.4, 1.3 or 2): the prefix must
    # match the previous heading's ancestors and the last number advances by 1.
    return cur[:-1] == prev[:depth - 1] and cur[-1] == prev[depth - 1] + 1


def _collect_headings(page_texts):
    """Build ``[level, title, page_index]`` entries from an iterable of page texts."""
    bookmarks = []
    prev_numbers = None
    for page_num, text in enumerate(page_texts):
        pending_number = ""
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if pending_number:
                # The previous line was only a section number; its heading
                # text is on this line, so join the two.
                line = pending_number + ' ' + line
                pending_number = ""
            elif _BARE_NUMBER_RE.fullmatch(line):
                # A number with nothing after it: wait for the next line.
                pending_number = line
                continue

            heading = _parse_heading(line)
            if heading is None:
                continue
            numbers, title = heading
            if not _is_next_heading(prev_numbers, numbers):
                continue

            bookmarks.append([len(numbers), title, page_num])
            prev_numbers = numbers
    return bookmarks


def extract_bookmarks(pdf_path):
    """Return the bookmarks (table of contents) of the PDF at *pdf_path*.

    If the document already carries an outline, the result of
    ``doc.get_toc()`` is returned unchanged. Otherwise headings are guessed
    from the page text: a line counts as a heading when it starts with a
    dotted section number ("1", "1.2", "1.2.3.") followed by a title, and its
    number continues the sequence of the previously accepted heading. The
    first accepted heading must be section 1.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list of ``[level, title, page]`` lists. For guessed headings,
        ``level`` is the number of components in the section number, ``title``
        is the heading text without the number, and ``page`` is the 0-based
        index of the page the heading was found on.

    NOTE(review): PyMuPDF documents ``get_toc()`` page numbers as 1-based,
    while the guessed entries use 0-based page indices, so the two return
    paths appear to disagree by one. Left as is because callers may rely on
    it; confirm against the call sites.
    """
    doc = fitz.open(pdf_path)
    try:
        # Prefer the outline embedded in the document, if there is one.
        bookmarks = doc.get_toc()
        if bookmarks:
            return bookmarks
        # No embedded outline: fall back to scanning the text of every page.
        page_texts = (
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )
        return _collect_headings(page_texts)
    finally:
        # Close the document even if text extraction or parsing raises.
        doc.close()



