import requests
from bs4 import BeautifulSoup


def download_arxiv_pdf(arxiv_id, save_path, timeout=30):
    """Download the PDF of an arXiv paper and write it to ``save_path``.

    The function is best-effort: any failure (HTTP error status, network
    problem, timeout, file-system error) is reported on stdout instead of
    being raised, and the function returns ``None`` in every case.

    Args:
        arxiv_id: arXiv identifier such as ``"2405.10548"``. It is
            interpolated into the download URL unchanged.
        save_path: Destination file path. Missing parent directories are
            created before the file is written.
        timeout: Seconds to wait on the server before giving up; forwarded
            to ``requests.get``. Without a timeout a stalled connection
            would block forever.
    """
    # Function-scope import so this block does not depend on the file header.
    import os

    # Build the PDF download URL (HTTPS avoids a plaintext request/redirect).
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}"

    try:
        # Fetch the PDF; raise_for_status turns 4xx/5xx responses into HTTPError.
        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()

        # Make sure the target directory exists; a bare file name has no
        # directory component, in which case there is nothing to create.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Write the PDF bytes to disk.
        with open(save_path, 'wb') as file:
            file.write(response.content)

        print(f"PDF downloaded successfully and saved to {save_path}")

    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        # Deliberate catch-all: connection errors, timeouts and OS errors are
        # reported rather than propagated, matching the best-effort contract.
        print(f"Other error occurred: {err}")



def extract_arxiv_paper_info(html_content, source=True):
    """Extract basic paper metadata from an HTML page.

    The page is expected to carry ``citation_arxiv_id``, ``citation_title``
    and ``citation_date`` ``<meta>`` tags and a ``<blockquote
    class="abstract">`` element (presumably an arXiv abstract page -- confirm
    against the caller). Every field falls back to a ``'No ... found'``
    placeholder string when its element is missing, so a partially matching
    page never raises.

    Args:
        html_content: HTML markup to parse.
        source: When truthy the result includes the ``'abstract'`` key;
            otherwise the abstract is omitted.

    Returns:
        dict with keys ``'arxiv_id'``, ``'title'``, ``'date'`` and, when
        ``source`` is truthy, ``'abstract'``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    def _meta_content(name, default):
        # Return the ``content`` attribute of <meta name=...>, or ``default``
        # when the tag (or its ``content`` attribute) is absent.
        tag = soup.find('meta', {'name': name})
        if tag is None:
            return default
        return tag.get('content', default)

    # arXiv id, title and submission date all come from citation_* meta tags.
    arxiv_id = _meta_content('citation_arxiv_id', 'No arXiv id found')
    title = _meta_content('citation_title', 'No title found')
    submission_date = _meta_content('citation_date', 'No submission date found')

    # The abstract lives in a blockquote; drop the leading "Abstract:" label
    # if it is part of the extracted text.
    abstract_tag = soup.find('blockquote', {'class': 'abstract'})
    abstract = abstract_tag.get_text(strip=True) if abstract_tag else 'No abstract found'
    if abstract.startswith('Abstract:'):
        abstract = abstract[len('Abstract:'):].strip()

    # NOTE: related-work extraction is intentionally not implemented; it
    # would require knowledge of a specific HTML structure that these pages
    # may not always provide.

    # Insert keys in the same order as before: id, title, [abstract], date.
    info = {'arxiv_id': arxiv_id, 'title': title}
    if source:
        info['abstract'] = abstract
    info['date'] = submission_date
    return info












if __name__ == '__main__':
    # Disabled batch example, kept for reference. NOTE(review):
    # update_json_with_arxiv_info is not defined in this file.
    # json_file_path = 'topic_paper_db.json'
    # arxiv_urls = {
    # '1': '2201.11903',  # replace with a real arXiv ID
    # '2': '2201.11904',  # replace with a real arXiv ID
    # # add the arXiv IDs of other target papers here
    # }
    # update_json_with_arxiv_info(json_file_path, arxiv_urls)

    # Example usage: download a single paper's PDF.
    arxiv_id = "2405.10548"  # replace with the real arXiv ID
    save_path = "pdfs/1/source.pdf"  # replace with the real destination path

    download_arxiv_pdf(arxiv_id, save_path)



