import requests
from bs4 import BeautifulSoup
import json


# Scheme -> proxy URL mapping, in the shape accepted by the ``proxies=``
# argument of ``requests.get``; both schemes go through the same endpoint.
proxies = dict.fromkeys(('http', 'https'), 'http://10.16.4.198:7890')

def get_info_from_arxivhtml(arxiv_id):
    """Extract the related-work text and references from arXiv's HTML view.

    Downloads ``https://arxiv.org/html/<arxiv_id>`` through the module-level
    ``proxies`` (at most three attempts), looks for a bookmark whose title
    contains "related work", and collects the ``ltx_p`` paragraphs of the
    matching section together with every ``ltx_bibitem`` entry.

    Args:
        arxiv_id: arXiv identifier, e.g. ``"2306.11027"``.

    Returns:
        A dict with the keys ``"isAPA"`` (``True`` when the page contains no
        ``ltx_tag ltx_tag_bibitem`` span), ``"related work"`` (section
        paragraphs joined with newlines) and ``"reference"`` (list of
        bibliography entry strings).

    Raises:
        ValueError: if the page is not served with HTTP 200, has no
            bookmarks, or the related-work section cannot be located.
        AssertionError: if no bookmark mentions related work, or the
            extracted text or the reference list is empty.
        requests.RequestException: if the last download attempt fails.
    """
    arxiv_html_url = f"https://arxiv.org/html/{arxiv_id}"
    max_attempts = 3
    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            # A timeout keeps a stalled proxy/connection from hanging forever.
            response = requests.get(arxiv_html_url, proxies=proxies, timeout=30)
        except requests.RequestException as e:
            if attempt == max_attempts:
                raise
            print(e)
            continue
        if response.status_code == 200:
            break

    if response.status_code == 404:
        raise ValueError(f"cannot get html file for {arxiv_id} directly from arxiv")
    if response.status_code != 200:
        # Do not try to parse an error page as if it were the paper.
        raise ValueError(
            f"cannot get html file for {arxiv_id} directly from arxiv "
            f"(HTTP {response.status_code})"
        )

    soup = BeautifulSoup(response.content, 'html.parser')

    # get bookmark and check this paper contains related work
    bookmarks = soup.find_all('span', class_='ltx_text ltx_ref_title')
    if not bookmarks:
        raise ValueError("No bookmark in this html. Maybe an old version...")
    rw_title = next(
        (
            title
            for title in (bookmark.get_text() for bookmark in bookmarks)
            if "related work" in title.lower()
        ),
        None,
    )
    if rw_title is None:
        # Raised explicitly (instead of ``assert``) so the check survives
        # ``python -O``; the exception type is kept for existing callers.
        raise AssertionError("No related work in this paper")

    # The bookmark title is expected to start with the section number
    # (e.g. "2 Related Work"); take all leading digits so that sections
    # numbered 10 and above resolve to the right id.
    stripped_title = rw_title.lstrip()
    digit_count = len(stripped_title) - len(stripped_title.lstrip('0123456789'))
    section_number = stripped_title[:digit_count]
    if not section_number:
        raise ValueError(
            f"cannot determine section number from bookmark {rw_title!r}"
        )

    # get related work and reference
    rw_content = soup.find('section', id=f'S{section_number}')
    if rw_content is None:
        raise ValueError(f"cannot find section S{section_number} for {arxiv_id}")
    paragraphs = rw_content.find_all('p', class_='ltx_p')
    related_work_without_subtitles = '\n'.join(
        p.get_text(strip=True) for p in paragraphs
    )

    references = [
        ' '.join(
            block.get_text(strip=True)
            for block in item.find_all('span', class_='ltx_bibblock')
        )
        for item in soup.find_all('li', class_="ltx_bibitem")
    ]

    # check is APA format: the absence of a bibitem tag span is taken to
    # mean author-year style citations.
    isAPA = soup.find('span', class_="ltx_tag ltx_tag_bibitem") is None

    if not related_work_without_subtitles:
        raise AssertionError("Failed to get related work content")
    if not references:
        raise AssertionError("Failed to get references")

    return {
        "isAPA": isAPA,
        "related work": related_work_without_subtitles,
        "reference": references,
    }

def get_info_from_ar5xivhtml(arxiv_id):
    """Extract the related-work text and references from the ar5iv rendering.

    Downloads ``https://ar5iv.org/abs/<arxiv_id>`` (at most three attempts),
    finds the first ``ltx_section`` whose ``h2`` heading contains
    "related work", and collects the ``ltx_p`` paragraphs found inside the
    section's ``div`` elements whose id contains the section id, together
    with every non-empty ``ltx_bibitem`` entry.

    Args:
        arxiv_id: arXiv identifier, e.g. ``"2306.11027"``.

    Returns:
        A dict with the keys ``"isAPA"`` (``True`` when the page contains no
        ``ltx_tag ltx_tag_bibitem`` span), ``"related work"`` (paragraph
        blocks joined with newlines) and ``"reference"`` (list of
        bibliography entry strings).

    Raises:
        ValueError: if the request ends on the arXiv abstract page or is not
            served with HTTP 200.
        AssertionError: if no section heading mentions related work, or the
            extracted text or the reference list is empty.
        requests.RequestException: if the last download attempt fails.
    """
    ar5iv_html_url = f"https://ar5iv.org/abs/{arxiv_id}"
    max_attempts = 3
    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            # A timeout keeps a stalled connection from hanging forever.
            response = requests.get(ar5iv_html_url, timeout=30)
        except requests.RequestException as e:
            if attempt == max_attempts:
                raise
            print(e)
            continue
        if response.status_code == 200:
            break

    # A final URL equal to the arXiv abstract page is treated as "no HTML
    # available" (presumably ar5iv redirects there when it has no rendering).
    if response.url == f"https://arxiv.org/abs/{arxiv_id}" or response.status_code == 404:
        raise ValueError(f"cannot get html file for {arxiv_id}")
    if response.status_code != 200:
        # Do not try to parse an error page as if it were the paper.
        raise ValueError(
            f"cannot get html file for {arxiv_id} (HTTP {response.status_code})"
        )

    soup = BeautifulSoup(response.content, 'html.parser')

    related_work = None
    for section in soup.find_all('section', class_="ltx_section"):
        heading = section.find('h2')
        # Sections without an <h2> heading cannot be matched by title.
        if heading is None or "related work" not in heading.get_text().lower():
            continue

        section_id = section.get('id')
        paragraph_texts = []
        for div in section.find_all('div'):
            div_id = div.get('id')
            # Only divs whose id embeds the section id are considered;
            # divs (or a section) without an id are skipped instead of
            # raising a TypeError.
            if not (section_id and div_id and section_id in div_id):
                continue
            # Read the actual <p class="ltx_p"> elements rather than the
            # div's raw children, which may include bare text nodes.
            block_text = ' '.join(
                p.get_text(strip=True)
                for p in div.find_all('p', class_='ltx_p')
            ).strip()
            if block_text:
                paragraph_texts.append(block_text)

        # Newline-separated so consecutive paragraphs are not glued together.
        related_work = '\n'.join(paragraph_texts)
        break

    if related_work is None:
        # Raised explicitly (instead of ``assert``) so the check survives
        # ``python -O``; the exception type is kept for existing callers.
        raise AssertionError("No related work in this paper")

    references = []
    for item in soup.find_all('li', class_="ltx_bibitem"):
        blocks = item.find_all('span', class_="ltx_bibblock")
        ref_content = ' '.join(
            block.get_text(strip=True).replace("\n", " ") for block in blocks
        ).strip()
        if ref_content:
            references.append(ref_content)

    # The absence of a bibitem tag span is taken to mean author-year style
    # citations.
    isAPA = soup.find('span', class_="ltx_tag ltx_tag_bibitem") is None

    if not related_work:
        raise AssertionError("Failed to get related work content")
    if not references:
        raise AssertionError("Failed to get references")

    return {
        "isAPA": isAPA,
        "related work": related_work,
        "reference": references,
    }
            
def get_info_from_html(arxiv_id):
    """Extract related work and references, preferring arXiv's own HTML.

    Tries ``get_info_from_arxivhtml`` first and returns its result on
    success. Only when that attempt raises is the error printed and
    ``get_info_from_ar5xivhtml`` used as a fallback.

    Args:
        arxiv_id: arXiv identifier, e.g. ``"2306.11027"``.

    Returns:
        The result dict produced by whichever extractor succeeded.

    Raises:
        Exception: whatever ``get_info_from_ar5xivhtml`` raises when the
            fallback fails as well.
    """
    try:
        # Return immediately so a successful arXiv result is not overwritten
        # by an unnecessary (and possibly failing) ar5iv request.
        return get_info_from_arxivhtml(arxiv_id)
    except Exception as e:
        # Any failure of the primary source triggers the fallback.
        print(e)
    return get_info_from_ar5xivhtml(arxiv_id)

if __name__ == "__main__":
    # Demo run: extract one paper and store the result as indented JSON.
    # Any failure is reported on stdout rather than raised.
    try:
        res = get_info_from_html("2306.11027")
        with open('./extraction.json', 'w', encoding='utf-8') as out_file:
            out_file.write(json.dumps(res, indent=4))
    except Exception as err:
        print(err)
