import fitz, re

def _heading_follows(previous, current):
    """Return True if section number ``current`` may directly follow ``previous``.

    Both arguments are lists of ints, e.g. ``[2, 3, 1]`` for "2.3.1".
    """
    depth = len(current)
    if depth == len(previous) + 1:
        # First child of the previous heading: "2.3" -> "2.3.1".
        return current == previous + [1]
    if depth <= len(previous):
        # Next sibling of the previous heading or of one of its ancestors:
        # "2.3" -> "2.4", "2.3.1" -> "2.4", "2.3.1" -> "3".
        return (current[:-1] == previous[:depth - 1]
                and current[-1] == previous[depth - 1] + 1)
    # Skipping a level ("2" -> "2.1.1") is never accepted.
    return False


def _bookmarks_from_page_texts(page_texts):
    """Detect numbered headings ("1 Intro", "2.3. Setup", ...) in page texts.

    ``page_texts`` is a sequence with one text string per page. Returns a list
    of ``[level, title, page_index]`` entries, where ``page_index`` is the
    position of the page in ``page_texts``.
    """
    bare_number = re.compile(r'(\d+\.)*\d+\.?')
    numbered_heading = re.compile(r'((\d+\.)*\d+\.?)\s+(.*)')
    sentence_break = re.compile(r'\.\s+[A-Z]')

    bookmarks = []
    previous_number = None
    for page_num, text in enumerate(page_texts):
        # A section number that stood alone on the previous line of this page.
        pending_number = ""
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if pending_number:
                # The number and its title were split over two lines.
                line = pending_number + ' ' + line
                pending_number = ""
            elif bare_number.fullmatch(line):
                # Only a line that is *entirely* a section number is merged
                # with the next one; a complete heading must not swallow
                # the line that follows it.
                pending_number = line
                continue

            match = numbered_heading.match(line)
            if not match:
                continue
            title = match.group(3).rstrip()
            # An empty title is useless, and ". X" inside the text means this
            # is a sentence that merely starts with a number.
            if not title or sentence_break.search(title):
                continue

            number_text = match.group(1).rstrip('.')
            number = [int(part) for part in number_text.split('.')]

            if previous_number is None:
                # The very first heading has to be "1" or "1.".
                if number_text != '1':
                    continue
            elif not _heading_follows(previous_number, number):
                continue

            bookmarks.append([len(number), title, page_num])
            previous_number = number
    return bookmarks


def extract_bookmarks(pdf_path):
    """Return the table of contents of a PDF as ``[level, title, page]`` lists.

    If the document carries its own outline, the result of ``doc.get_toc()``
    is returned unchanged. Otherwise headings are recovered heuristically from
    the page text: a line counts as a heading when it starts with a dotted
    section number (optionally on a line of its own, directly before the
    title) and that number continues the numbering of the previously accepted
    heading. The first accepted heading must be numbered "1".

    In the heuristic branch the title excludes the section number and the page
    value is the 0-based page index.

    NOTE(review): PyMuPDF documents ``get_toc()`` page numbers as 1-based, so
    the two branches may use different page conventions - confirm what callers
    expect before relying on the page value.

    :param pdf_path: path of the PDF file to open.
    :return: list of ``[level, title, page]`` entries, possibly empty.
    """
    doc = fitz.open(pdf_path)
    try:
        toc = doc.get_toc()
        if toc:
            return toc
        page_texts = [doc.load_page(page_num).get_text()
                      for page_num in range(doc.page_count)]
    finally:
        # Close the document even if text extraction raises.
        doc.close()
    return _bookmarks_from_page_texts(page_texts)


def _merge_wrapped_lines(lines, start, target):
    """Join ``lines[start]`` with following lines while they spell a prefix of ``target``.

    Works on simplified text (see ``simplify_string``). Lines are appended one
    after another as long as the accumulated text is a proper prefix of
    ``target``; the walk always advances, so it stops at the end of ``lines``
    at the latest.
    """
    merged = simplify_string(lines[start])
    following = start + 1
    while (following < len(lines)
           and target.startswith(merged)
           and not target.endswith(merged)):
        merged += simplify_string(lines[following])
        following += 1
    return merged


def _looks_like_heading(raw_line, merged, target):
    """Decide whether a line is the heading whose simplified title is ``target``.

    ``merged`` is the simplified text of the line (possibly joined with the
    lines after it), ``raw_line`` the untouched line.
    """
    if merged.endswith(target):
        # At most two extra characters in front of the title ...
        if len(merged) - len(target) <= 2:
            return True
        # ... or the raw line starts with a dotted section number.
        return re.match(r'((\d+\.)*\d+\.?)\s+(.*)', raw_line) is not None
    # "Title: text" with at most two characters in front of the title.
    with_colon = target + ':'
    return with_colon in merged and merged.index(with_colon) <= 2


def _collect_section_lines(pages, start_page, start_line, next_title):
    """Collect raw lines of a section starting at ``pages[start_page][start_line]``.

    Collection stops before the heading of the next section (``next_title``,
    simplified, or ``None`` if there is none), before a line starting with
    "references" or "acknowledgements", or at the end of the document.
    """
    collected = []
    for page_index in range(start_page, len(pages)):
        page_lines = pages[page_index]
        first = start_line if page_index == start_page else 0
        for line_index in range(first, len(page_lines)):
            raw_line = page_lines[line_index]
            if next_title is not None:
                merged = _merge_wrapped_lines(page_lines, line_index, next_title)
                if _looks_like_heading(raw_line, merged, next_title):
                    return collected
            if simplify_string(raw_line).startswith(("references", "acknowledgements")):
                return collected
            collected.append(raw_line)
    return collected


def extract_sections_content(pdf_path, section_titles: list) -> dict:
    """
    Extract the text of the given sections from a PDF.

    The section headings are searched in document order: once heading ``i`` is
    found, only heading ``i + 1`` is looked for. A heading that is never found
    therefore also prevents all later sections from being extracted.

    Parameters:
    - pdf_path: path of the PDF file to open.
    - section_titles: list of ``[level, title, page_num]`` entries, in the
      order in which the headings appear in the document.
    Returns:
    - dict mapping ``(level, title, page_num)`` tuples (taken from
      ``section_titles``) to the section text, i.e. the collected lines joined
      with single spaces. The text starts with the heading line itself.
    """
    doc = fitz.open(pdf_path)
    try:
        # Read every page once; the text is needed repeatedly below.
        pages = [doc[page_num].get_text().split('\n')
                 for page_num in range(len(doc))]
    finally:
        # Close the document even if text extraction raises.
        doc.close()

    # Titles are compared in simplified form (lower case, no spaces).
    wanted_titles = [simplify_string(entry[1]) for entry in section_titles]

    sections_content = {}
    current_section_index = 0
    for page_num, lines in enumerate(pages):
        for line_index, line in enumerate(lines):
            # Nothing left to look for once every heading has been found.
            if current_section_index >= len(wanted_titles):
                break

            wanted = wanted_titles[current_section_index]
            merged = _merge_wrapped_lines(lines, line_index, wanted)
            if not _looks_like_heading(line, merged, wanted):
                continue

            if current_section_index + 1 < len(wanted_titles):
                next_title = wanted_titles[current_section_index + 1]
            else:
                next_title = None
            content = _collect_section_lines(pages, page_num, line_index, next_title)

            # Lists are not hashable, so the entry is turned into a tuple key.
            current_section = section_titles[current_section_index]
            section_tuple = (current_section[0], current_section[1], current_section[2])
            sections_content[section_tuple] = ' '.join(content).strip()
            current_section_index += 1

    return sections_content


def get_sections_content(title, sections_content, bookmarks, index=None, page_num=None):
    """Return the text of the section named ``title`` and of its subsections.

    ``sections_content`` (in insertion order) and ``bookmarks`` are walked in
    lockstep. The first bookmark whose simplified title contains the
    simplified ``title`` is the match. Its text is returned first, with
    everything up to and including the first case-insensitive occurrence of
    ``title`` removed; if ``title`` does not occur literally in the text, the
    text is kept whole. The texts of the following deeper-level bookmarks are
    appended until a bookmark at the same or a shallower level is reached.

    :param title: section title to look for.
    :param sections_content: dict of section texts, ordered like ``bookmarks``.
    :param bookmarks: list of ``[level, title, page]`` entries.
    :param index: unused; kept for interface compatibility.
    :param page_num: unused; kept for interface compatibility.
    :return: list of strings, empty if no bookmark matches.
    """
    wanted = simplify_string(title)
    answer = []
    level = -1  # -1 means the section has not been found yet
    for key, bookmark in zip(sections_content, bookmarks):
        if level == -1:
            if wanted not in simplify_string(bookmark[1]):
                continue
            level = bookmark[0]
            text = sections_content[key]
            # Drop the heading only when it is actually present; a failed
            # search must not cut characters off the start of the text.
            position = text.lower().find(title.lower())
            if position != -1:
                text = text[position + len(title):]
            answer.append(text.strip())
        elif bookmark[0] > level:
            answer.append(sections_content[key])
        else:
            # Same or shallower level: the matched section has ended.
            break

    return answer



def simplify_string(s):
    """Normalize text for loose comparison.

    Strips surrounding whitespace, lower-cases the text, removes every space
    character and replaces the typographic apostrophe with a plain one.
    """
    trimmed = s.strip()
    lowered = trimmed.lower()
    without_spaces = lowered.replace(" ", "")
    return without_spaces.replace("’", "'")


def extract_rw(pdf_path):
    """Return the "related work" text of a PDF, or '' if none is found.

    Runs extract_bookmarks, extract_sections_content and get_sections_content
    on ``pdf_path`` and returns the first entry of the resulting list.
    """
    toc = extract_bookmarks(pdf_path)
    contents = extract_sections_content(pdf_path, toc)
    related = get_sections_content(
        title='related work', sections_content=contents, bookmarks=toc)
    if not related:
        return ''
    return related[0]



def extract_text_and_links_from_pdf(pdf_path):
    """
    Extract the text and the hyperlinks of a PDF file.

    :param pdf_path: path of the PDF file
    :return: tuple ``(text, links)``: ``text`` is the text of all pages
             concatenated in page order, ``links`` is a dict that maps every
             link URI found on a page to itself
    """
    page_texts = []
    links = {}

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            page_texts.append(page.get_text())

            # Keep only links that carry a non-empty URI.
            for link in page.get_links():
                uri = link.get('uri')
                if uri:
                    links[uri] = uri
    finally:
        # Release the document even if a page fails to load.
        doc.close()

    return ''.join(page_texts), links


if __name__ == '__main__':
    # Manual smoke test: dump the text and links of a sample PDF.
    #
    # Example of the section-extraction pipeline (disabled):
    # bookmarks = extract_bookmarks('target7.pdf')
    # section_content = extract_sections_content(pdf_path='target7.pdf', section_titles=bookmarks)
    # rw = get_sections_content(title='related work', sections_content=section_content, bookmarks=bookmarks)
    # print(rw)
    extracted = extract_text_and_links_from_pdf('pdfs/1/source.pdf')
    print(extracted)