import requests
from bs4 import BeautifulSoup
import os
import json
from playwright.sync_api import sync_playwright

# Output directories; the crawl loop writes one file per issue into each.
path_article = './scienceArticle/'
path_editorial = './scienceEditorial/'
path_cover = './scienceCover/'
# Proxy map and request headers. NOTE(review): neither is referenced by the
# code visible in this file - confirm before removing.
proxies = {
  'http': '',
  'https': '',
}
kv = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}
# exist_ok=True makes the creation idempotent without the check-then-create
# race of a separate os.path.exists() test.
for output_dir in (path_article, path_editorial, path_cover):
    os.makedirs(output_dir, exist_ok=True)

# First and last year to process; the crawl loop counts down from start_year
# to end_year (inclusive).
start_year = 2023
end_year = 2009
# nature_url = 'https://www.nature.com/ng/volumes/' # nature genetics
science_url = 'https://www.science.org/loi/science/group/'  # base URL of the Science per-year issue listing



def request_web(year_url):
    """Open ``year_url`` in a Playwright-driven Chromium page and return its HTML.

    Args:
        year_url: Address of the page to load. The crawl loop passes both
            year-listing URLs and individual issue URLs.

    Returns:
        The page markup as returned by ``page.content()`` after a fixed
        5000 ms wait following navigation.
    """
    extra_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    }
    # A fresh Playwright instance and browser are started for every call.
    with sync_playwright() as playwright:
        chromium_browser = playwright.chromium.launch()
        tab = chromium_browser.new_page()
        tab.set_extra_http_headers(extra_headers)
        # Navigate to the target page.
        tab.goto(year_url)
        # Pause so the page has time to finish loading before it is read.
        tab.wait_for_timeout(5000)
        html = tab.content()
        chromium_browser.close()
    return html


def request_web_pic(url, path):
    """Open ``url`` in a Playwright-driven Chromium page and screenshot it.

    Args:
        url: Address to navigate to (the crawl loop passes a cover-image URL).
        path: File path the screenshot is written to.

    Returns:
        None. The only result is the screenshot file at ``path``.
    """
    extra_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    }
    # A fresh Playwright instance and browser are started for every call.
    with sync_playwright() as playwright:
        chromium_browser = playwright.chromium.launch()
        tab = chromium_browser.new_page()
        tab.set_extra_http_headers(extra_headers)
        # Navigate to the target page.
        tab.goto(url)
        # Pause so the page has time to finish loading before the capture.
        tab.wait_for_timeout(5000)
        tab.screenshot(path=path)
        chromium_browser.close()
    return None

# Crawl every year from start_year down to end_year (inclusive). For each
# issue linked from a year page, save a screenshot of the cover, the editorial
# text and the list of article links.
while start_year >= end_year:
    year = start_year
    # Step the counter up front so that every `continue` below still moves on
    # to the next year instead of retrying the same one forever.
    start_year -= 1

    # Fetch the year page, which links to each issue published that year.
    # The URL is <science_url>d<decade>.y<year>.
    year_url = science_url + 'd' + str(year // 10 * 10) + '.y' + str(year)
    print(year_url)
    try:
        page_content = request_web(year_url)
    except Exception as e:
        print(str(year) + " missing", e)
        # Skip this year: falling through would parse the previous year's
        # HTML again (or raise NameError if this is the first year).
        continue

    issue_url_soup = BeautifulSoup(page_content, 'html.parser')
    issue_cards = issue_url_soup.find_all('div', class_='col-12 col-sm-3 col-lg-2 mb-4 mb-sm-3')
    issue_urls_list = []
    for card in issue_cards:
        link = card.find('a')
        # A card without a usable link is ignored rather than aborting the run.
        if link is not None and link.get('href'):
            issue_urls_list.append(link['href'])

    # For each issue page: save the cover, the editorial and the article links.
    for issue_url in issue_urls_list:
        # The 4th and 5th '/'-separated segments of the href are used as the
        # volume and issue identifiers that name the output files.
        numbers = issue_url.split("/")[3:]
        if len(numbers) < 2:
            print(issue_url + " skipped: unexpected issue link format")
            continue
        volume, issue = numbers[0], numbers[1]
        filename = volume + "_" + issue

        issue_url = 'https://www.science.org' + issue_url
        print(issue_url)
        try:
            page_content = request_web(issue_url)
            issue_soup = BeautifulSoup(page_content, 'html.parser')
        except Exception as e:
            print(filename + " missing", e)
            # Skip this issue: falling through would store the previous
            # issue's content under this issue's filename.
            continue

        # Cover: screenshot the cover image URL into the cover directory.
        try:
            cover_url = 'https://www.science.org' + issue_soup.find('div', class_='cover-image__image').find('img').get('src')
            path = path_cover + filename + '.png'
            request_web_pic(cover_url, path)
        except Exception as e:
            print(filename + " cover missing", e)
            # Leave an empty marker file for the failed cover; the `with`
            # block closes the handle instead of leaking it.
            with open(path_cover + filename + '_failed' + '.png', 'wb'):
                pass

        # The editorial text and the article links come from the same <div>.
        # If it is absent this is None and both steps below report "missing".
        article_source = issue_soup.find('div', class_='text-ellipses collapse truncated')

        # Editorial: store the text content of that element.
        try:
            editorial = article_source.get_text()
            with open(path_editorial + filename + '.txt', 'w', encoding='utf-8') as f:
                f.write(editorial)
            print(filename + " editorial saved")
        except Exception as e:
            print(filename + " editorial missing", e)

        # Articles: store one href per line.
        try:
            # href=True keeps only anchors that carry an href attribute, so a
            # bare <a> cannot abort the file halfway through with a KeyError.
            article_links = article_source.find_all('a', href=True)
            for article_link in article_links:
                print(article_link)
            # Explicit UTF-8, matching the editorial file, so the output does
            # not depend on the platform's default encoding.
            with open(path_article + filename + '.txt', 'w', encoding='utf-8') as f:
                f.writelines(str(article_link['href']) + '\n' for article_link in article_links)
            print(filename + " article saved")
        except Exception as e:
            print(filename + " article missing", e)