
import requests
from bs4 import BeautifulSoup

# `start` is the offset of the first result to request; results come 10 per page.
def search_google_scholar(keyword, start=0, articles=None):
    """Search Google Scholar for a keyword and collect one page of results.

    Args:
        keyword (str): Search query; it is URL-encoded before being sent.
        start (int): Offset of the first result to request. Values greater
            than 0 are sent as the ``start`` query parameter; otherwise the
            parameter is omitted.
        articles (list | None): Optional list to append results to, so that
            several pages can be accumulated across calls. The list is
            modified in place. When omitted, a fresh list is created for
            this call (a shared ``[]`` default would leak results between
            unrelated calls).

    Returns:
        list: ``articles`` (or the newly created list) with one dict
        appended per result found on the page. Each dict has the keys
        'title', 'url' and 'abstract'. 'title' and 'url' are "N/A" when
        the result has no title link; 'abstract' is None when the result
        has no snippet. Request and parsing errors are printed rather than
        raised, and whatever was collected so far is returned.
    """
    if articles is None:
        articles = []

    base_url = "https://scholar.google.com/scholar?"
    if start > 0:
        base_url += f"start={start}&"
    # URL-encode the keyword so quotes, spaces and '+' survive in the query.
    search_url = f"{base_url}q={requests.utils.quote(keyword)}"

    # Send a browser-like User-Agent so the request is less likely to be
    # rejected as coming from a crawler.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions.

        soup = BeautifulSoup(response.text, 'html.parser')

        # Each result is expected in a div whose class attribute is
        # 'gs_r gs_or gs_scl'.
        # NOTE(review): this depends on Google Scholar's current markup —
        # confirm the selector still matches if no results come back.
        for article_div in soup.find_all('div', class_='gs_r gs_or gs_scl'):
            title_tag = article_div.find('h3', class_='gs_rt')
            abstract_tag = article_div.find('div', class_='gs_rs')

            # The title text and the article URL both live on the <a>
            # inside the heading; results without that link get "N/A".
            link = title_tag.a if title_tag else None
            if link:
                title = link.get_text(strip=True)
                url = link.get('href')
            else:
                title = "N/A"
                url = "N/A"

            abstract = abstract_tag.get_text(strip=True) if abstract_tag else None
            # Drop a leading "... " from the snippet if present.
            if abstract and abstract.startswith("... "):
                abstract = abstract[4:]

            articles.append({'title': title, 'url': url, 'abstract': abstract})

    except requests.exceptions.RequestException as e:
        print(f"请求错误: {e}")
    except Exception as e:
        # Best-effort scraper: report the failure and keep what was collected.
        print(f"解析错误: {e}")

    return articles

if __name__ == "__main__":
    # Script entry point: fetch up to four result pages for one query,
    # accumulating them in a single list, then print the titles.
    import time
    import random

    search_keyword = """ "foundational model" + "scientific hypothesis generation" + "biases" """
    page_starts = [0, 10, 20, 30]
    articles = []
    for page_index, start in enumerate(page_starts):
        articles = search_google_scholar(search_keyword, start, articles)
        print(articles)
        # Fewer accumulated results than a full set of pages up to this
        # offset means there is nothing more to fetch.
        if len(articles) < start + 10:
            print('Not enough articles')
            break
        # Pause a random 20-40 s between requests; skip the pause after the
        # last page since no further request follows.
        if page_index < len(page_starts) - 1:
            time.sleep(random.randint(20, 40))

    print(articles)
    print("\n".join(ii['title'] for ii in articles))