import os

import requests
from bs4 import BeautifulSoup
import pandas as pd


def crawl_cnki(url):
    """Scrape one arXiv search-results page and append the papers to an Excel file."""
    # Send the request and parse the returned HTML
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract titles, authors, submission dates, and links from the
    # arXiv search-result markup
    titles = soup.find_all('p', class_='title is-5 mathjax')
    authors = soup.find_all('p', class_='authors')
    dates = soup.find_all('span', class_='has-text-black-bis has-text-weight-semibold')
    points = soup.find_all('p', class_='list-title is-inline-block')
    # Each list-title paragraph holds several links; keep only the first
    # one (the abstract page)
    hrefs = []
    for point in points:
        a = point.find('a', href=True)
        if a:
            hrefs.append(a['href'])
    norm_date = [date.text.strip() for date in dates]
    norm_title = [title.text.strip() for title in titles]
    norm_author = [author.text.strip() for author in authors]

    assert len(norm_author) == len(norm_title), "mismatched title/author counts"

    # Build one row per paper; the question pairs each title with its
    # authors for a Q&A-style dataset
    rows = []
    for title, author, date, href in zip(norm_title, norm_author, norm_date, hrefs):
        rows.append({
            "question": "Who are the authors of the paper " + title + " published on arXiv?",
            "author": author,
            "date": date,
            "download_url": href,
        })

    # Write all rows in one pass, appending to the spreadsheet if it
    # already exists instead of re-reading the file for every row
    file_path = 'output_paper.xlsx'
    new_df = pd.DataFrame(rows)
    if os.path.exists(file_path):
        new_df = pd.concat([pd.read_excel(file_path), new_df], ignore_index=True)
    new_df.to_excel(file_path, index=False)
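

# A minimal sketch of paging through additional result pages with the
# function above. The helper name is hypothetical, and it assumes the
# search URL accepts a `start` offset and serves 50 results per page;
# both the parameter name and the page size are assumptions about the
# site's pagination, not something verified in this script.
def crawl_more_pages(base_url, pages=3, page_size=50):
    for page in range(pages):
        crawl_cnki(base_url + '&start=' + str(page * page_size))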


# Example: arXiv full-text search for "segmentation"
if __name__ == '__main__':
    url = 'https://arxiv.org/search/?query=segmentation&searchtype=all&source=header'
    crawl_cnki(url)
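

# Note: scraping result HTML is brittle against markup changes. arXiv also
# publishes an Atom feed API; below is a minimal sketch using only the
# standard library, assuming the public endpoint and query parameters
# shown are still current. The function name is hypothetical.
def fetch_via_api(query, max_results=10):
    # Imports kept local so the sketch stays self-contained
    import urllib.request
    import xml.etree.ElementTree as ET

    api_url = ('http://export.arxiv.org/api/query?search_query=all:' + query
               + '&start=0&max_results=' + str(max_results))
    with urllib.request.urlopen(api_url) as resp:
        feed = ET.fromstring(resp.read())
    # Entries are Atom XML; titles and author names live under the Atom namespace
    ns = {'atom': 'http://www.w3.org/2005/Atom'}
    for entry in feed.findall('atom:entry', ns):
        title = entry.find('atom:title', ns).text.strip()
        names = [a.find('atom:name', ns).text for a in entry.findall('atom:author', ns)]
        print(title, '--', ', '.join(names))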
