from .extractor import *
from .openai import get_chat_completion
from .prompts import CLEAR_LAST_REF_ENTRY, CLEAR_RELATED_WORK
import fitz

def _select_main_body_titles(section_titles):
    """Return the bookmark entries considered part of the paper's main body.

    Each entry is indexed as ``entry[1]`` to obtain its title text.  Two
    bookmark styles are handled:

    * If the first title contains a digit, the bookmarks are treated as
      numbered, and the first title whose first non-blank character is a
      letter marks the boundary.
    * Otherwise the first title containing "appendix" (case-insensitive,
      spaces ignored) marks the boundary.

    When no usable boundary is found the full list is returned.
    """
    # Imported locally so the helper does not depend on ``re`` leaking in
    # through the module's ``from .extractor import *``.
    import re

    if not section_titles:
        return []

    if re.search(r'\d', section_titles[0][1]):
        # The bookmark has a number index, so we can drop the appendix.
        def is_boundary(title):
            # ``[:1]`` instead of ``[0]`` so a blank title is simply not a
            # boundary rather than an IndexError.
            return title.strip()[:1].isalpha()
    else:
        def is_boundary(title):
            return "appendix" in title.lower().replace(" ", "")

    for ind, unit in enumerate(section_titles):
        cut = ind - 1
        # NOTE(review): slicing at ``ind - 1`` also drops the entry directly
        # before the boundary; kept as in the original -- confirm intended.
        if is_boundary(unit[1]) and cut > 0:
            return section_titles[:cut]

    # No boundary with a positive cut index: keep everything.  (Previously a
    # boundary found only at index 1 left the result unassigned.)
    return section_titles


def _clean_last_reference(references):
    """Ask the chat model to vet an unusually long final reference entry.

    Entries of at most 1000 characters are left alone.  Longer ones are
    truncated to 2000 characters and sent to the model together with
    ``CLEAR_LAST_REF_ENTRY``.  The reply is interpreted as follows:

    * ``'True'``  -- keep the (possibly truncated) entry;
    * ``'False'`` -- drop the entry;
    * empty reply -- keep the entry rather than overwrite it with nothing;
    * anything else -- replace the entry with the reply.

    ``references`` must be non-empty.  Returns the resulting list.
    """
    if len(references[-1]) <= 1000:
        return references

    if len(references[-1]) > 2000:
        references[-1] = references[-1][:2000]

    verdict = get_chat_completion(messages=[
        {
            "role": "user",
            "content": CLEAR_LAST_REF_ENTRY + references[-1],
        }
    ])

    if not verdict or verdict == 'True':
        return references
    if verdict == 'False':
        return references[:-1]
    references[-1] = verdict
    return references


def process_pdf(pdf_file):
    """Extract the related-work text and the reference list from a PDF.

    Args:
        pdf_file: Whatever ``fitz.open`` accepts as its first argument.

    Returns:
        A dict with the keys ``"isAPA"``, ``"related work"`` and
        ``"reference"``.

    Raises:
        AssertionError: If the main-body bookmarks contain no
            "related work" title, if no references are found, or if the
            citation-format flag returned by ``extract_references`` is falsy.
        ValueError: If the related-work content cannot be extracted.
    """
    doc = fitz.open(pdf_file)
    try:
        # Get the directory (bookmarks) and keep only the main-body part.
        section_titles = extract_section_titles(doc)
        main_body_title = _select_main_body_titles(section_titles)
        title_str = " ".join(unit[1].lower() for unit in main_body_title)
        # Explicit raises instead of ``assert`` so the checks survive
        # ``python -O``; the exception type and messages are unchanged.
        if 'related work' not in title_str:
            raise AssertionError("No related work part in this paper.")

        # Locate the reference; the related work should be before reference.
        references, is_APA, start_page = extract_references(doc, section_titles)
        # Validate before touching ``references[-1]`` so an empty result gives
        # the intended message instead of an IndexError.
        if not references:
            raise AssertionError("Cannot get references from PDF")
        references = _clean_last_reference(references)
        if not references:
            raise AssertionError("Cannot get references from PDF")
        # NOTE(review): a falsy ``is_APA`` is rejected here; confirm that
        # False means "unknown" rather than "not APA".
        if not is_APA:
            raise AssertionError("Cannot get citation format information from PDF")

        # Get related work.
        try:
            rw_content = extract_related_work(doc[:start_page + 1], section_titles)
            # NOTE(review): ``rw_content`` is measured with ``len`` and then
            # joined with spaces -- presumably a list of text pieces; confirm.
            if len(rw_content) <= 100:
                raise AssertionError("Try to get related work, but get wrong content.")
            related_work = get_chat_completion(messages=[
                {
                    "role": "user",
                    "content": CLEAR_RELATED_WORK + ' '.join(rw_content),
                }
            ])
            if not related_work:
                # Fall back to the raw text when the model returns nothing.
                related_work = ' '.join(rw_content)
        except Exception as err:
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate; chaining keeps the root cause visible.
            raise ValueError("Cannot get the content of related work from PDF") from err
    finally:
        # Release the document handle on every exit path.
        doc.close()

    return {
        "isAPA": is_APA,
        "related work": related_work,
        "reference": references,
    }