import requests
from bs4 import BeautifulSoup
import os
import re
from playwright.sync_api import sync_playwright
import openai
import pandas as pd
import logging
from tqdm import tqdm
import wandb
import time
from functools import wraps
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# from bert_score import score
import json

# wandb.login(key=os.environ["WANDB_API_KEY"])  # key redacted: never commit credentials; rotate the key that was hard-coded here
# wandb.init(project="CNS_cover", name="ACS-Spider")


# proxy_host = "client.shinjikun.net"
# proxy_port = "30003"
# proxy_password = os.environ["PROXY_PASSWORD"]  # password redacted: never commit credentials; rotate the one that was hard-coded here

# proxy_url = f"trojan://:{proxy_password}@{proxy_host}:{proxy_port}"

# proxies = {
#     'http': proxy_url,
#     'https': proxy_url
# }

# proxies = {
#   'http': '',
#   'https': '',
# }

# Minimal header set: only a desktop Chrome-on-Windows User-Agent string.
kv = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
    ),
}

# Browser-like request headers (Chrome 131 on macOS, referer pubs.acs.org).
# Applied with `page.set_extra_http_headers(headers)` in get_earliest_year,
# request_web_pic (first page) and download_cover.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'cache-control': 'max-age=0',
    # 'cookie': '<redacted>'  -- a captured browser session cookie used to be
    # pasted here (MAID, visid_incap_*, incap_ses_*, nlbi_*, ACSEnt, ACSPubs2,
    # JSESSIONID, cf_clearance, MACHINE_LAST_SEEN, SnapABug*). Session cookies
    # expire and should not be committed; inject one at runtime if needed.
    'priority': 'u=0, i',
    'referer': 'https://pubs.acs.org/',
    'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'sec-ch-ua-arch': '"arm"',
    'sec-ch-ua-bitness': '"64"',
    'sec-ch-ua-full-version': '"131.0.6778.265"',
    'sec-ch-ua-full-version-list': '"Google Chrome";v="131.0.6778.265", "Chromium";v="131.0.6778.265", "Not_A Brand";v="24.0.0.0"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-model': '""',
    'sec-ch-ua-platform': '"macOS"',
    'sec-ch-ua-platform-version': '"14.4.0"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
}

# Second browser-like header set. Compared with `headers` it omits 'priority'
# and spells the user-agent key as 'User-Agent'. Applied with
# `page.set_extra_http_headers(headers2)` in request_web and on the second
# page opened by request_web_pic.
headers2 = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'cache-control': 'max-age=0',
    # 'cookie': '<redacted>'  -- a captured browser session cookie used to be
    # pasted here (MAID, visid_incap_*, incap_ses_*, nlbi_*, ACSEnt, ACSPubs2,
    # JSESSIONID, __cf_bm, cf_clearance, MACHINE_LAST_SEEN, SnapABug*). Session
    # cookies expire and should not be committed; inject one at runtime if needed.
    # 'priority': 'u=0, i',
    'referer': 'https://pubs.acs.org/',
    'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'sec-ch-ua-arch': '"arm"',
    'sec-ch-ua-bitness': '"64"',
    'sec-ch-ua-full-version': '"131.0.6778.265"',
    'sec-ch-ua-full-version-list': '"Google Chrome";v="131.0.6778.265", "Chromium";v="131.0.6778.265", "Not_A Brand";v="24.0.0.0"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-model': '""',
    'sec-ch-ua-platform': '"macOS"',
    'sec-ch-ua-platform-version': '"14.4.0"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
}

# headers2 = {
#     'sec-ch-ua-full-version-list': '"Google Chrome";v="131.0.6778.265", "Chromium";v="131.0.6778.265", "Not_A Brand";v="24.0.0.0"',
#     'sec-ch-ua-platform': '"macOS"',
#     # 'cookie': 'MAID=w/usEcGEvtLzdlCOhQS6iQ==; visid_incap_2209364=286vqbY7TcC/ra0EBv+x9VexYmcAAAAAQUIPAAAAAABWHBI786I/w8MLtuWgjRBx; ACSEnt=537846_6105_1739878040544; SnapABugHistory=3#; affPopupAlreadySeen=true; nlbi_2209364=mBjsCOfmmEqaim9Y/VWePAAAAADCWYahhzHZCA5DS9euqQdz; incap_ses_4556_2209364=+js8R/9H0hoTL0Bh6Ck6P3CStWcAAAAAKTlwge+FVrH+m/JM0Wytfg==; ACSPubs2=ZXJyb3I6IDEyNy4wLjAuMQ%3D%3D; incap_ses_171_2209364=wFELbgqIIHrqjOIy3oNfAoTOtmcAAAAAgDSvUCEi1pP1P/byggRsDQ==; incap_ses_173_2209364=DaYSELtRy32CGqjQ9J5mAsspt2cAAAAAk4a7OLywxv36V5yAuurvfA==; MACHINE_LAST_SEEN=2025-02-21T20%3A50%3A21.347-08%3A00; JSESSIONID=69BFFD694D7180729B3437ACC8260956; __cf_bm=YM6LZoOtvenPu1y4vqSbe74.cheKyNO2JDd.vYeWsRg-1740199821-1.0.1.1-ljs7ddEGGt10jBUIHmfFnbLHhDeGZa7dUNdKogr2gMKMaK4fcbUdS_3r41Ywo14M.JaVQHqmAfAdx7X4wC4CoQ; cf_clearance=CpZkol83uWskRPD._5olphj5Z96Nm.AXhKncRea3A2U-1740199822-1.2.1.1-Uvjl8OCDPx6raYXmIwTujyoIVmdPG62H6R.31bdxmhc2UyVjV8Nqxnaxy0tvaa5gGv.SBF9aYz.ilx5ONE6cDdi2gzCFpGKpnbBSeLmR.wQEL887GLX3NgxTTYgn09hUKmzOpW6MxbG.2xbnugQ3D3efiTrbVIj98_YaVcdJvpOTnb_iyraYwV4e5bR0xvI6JKtD68yclpZ_utZHGlSfVykrQ6MvSrvGQOBa6zfv7ZLC6EKHmdom.900RpplFhelvvseEEEKmlEpPYLptP94hTkuW17IEzUTOB06QM5f_A0; SnapABugRef=https%3A%2F%2Fpubs.acs.org%2Floi%2Fachre4%2Fgroup%2Fd2020.y2025%20; SnapABugUserAlias=%23; SnapABugVisit=438#1739878044',
#     'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
#     'sec-ch-ua-bitness': '"64"',
#     'sec-ch-ua-model': '""',
#     'sec-ch-ua-mobile': '?0',
#     'sec-ch-ua-arch': '"arm"',
#     'sec-ch-ua-full-version': '"131.0.6778.265"',
#     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
#     'sec-ch-ua-platform-version': '"14.4.0"',
# }

# Shared logger for the whole script; setup_logger() attaches its file handler.
logger = logging.getLogger('journal_processor')

# Proxy settings handed to Playwright's `chromium.launch(proxy=...)` by every
# browser helper in this file. Those helpers read this global, but its only
# definitions were commented out, so each call failed with
# "NameError: name 'proxies' is not defined".
# None means "connect directly"; to use a proxy, set a dict such as
# {'server': 'http://host:port', 'username': '...', 'password': '...'}.
proxies = None


def retry_on_failure(max_retries=3, delay=2):
    """Build a decorator that re-runs a function when it raises.

    Args:
        max_retries: Total number of attempts (first call included). Values
            below 1 are treated as 1 so the wrapped function is always called
            at least once; previously such values skipped the call entirely
            and ended in ``raise None`` (a TypeError).
        delay: Seconds to sleep between two attempts.

    Returns:
        A decorator. The decorated function returns the first successful
        result, or re-raises the exception of the final attempt.
    """
    attempts = max(1, max_retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, attempts + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if attempt == attempts:
                        # Out of attempts: report and let the caller see the
                        # original exception with its traceback intact.
                        print(f"Failed after {attempts} retries: {str(e)}")
                        raise
                    print(f"Error: {str(e)}. Retrying {attempt}/{attempts} in {delay} seconds...")
                    time.sleep(delay)

        return wrapper
    return decorator

def read_journal_data(csv_path: str) -> list:
    """Load the journal list CSV and keep only the rows hosted on pubs.acs.org.

    The CSV is read with pandas and must provide ``Journal``, ``Link`` and
    ``Number`` columns. ``Journal`` and ``Link`` are whitespace-trimmed.

    Returns:
        A list of ``{'journal', 'link', 'number'}`` dicts, one per row whose
        link contains ``pubs.acs.org``, in file order.
    """
    frame = pd.read_csv(csv_path)

    # Trim stray whitespace around the two text columns.
    for column in ('Journal', 'Link'):
        frame[column] = frame[column].str.strip()

    acs_journals = []
    for record in frame.to_dict('records'):
        # str() turns a missing link (NaN) into 'nan', which the filter rejects.
        url = str(record['Link'])
        if 'pubs.acs.org' not in url:
            continue
        acs_journals.append({
            'journal': record['Journal'],
            'link': url,
            'number': record['Number'],
        })
    return acs_journals

def setup_logger(log_file: str = "./debug/journal_processing.log") -> None:
    """Point the module logger at ``log_file`` (UTF-8, INFO level).

    Creates the log file's directory when needed and replaces any handlers
    already attached to the logger, so calling this twice does not duplicate
    log lines.

    Args:
        log_file: Path of the log file. A bare file name (no directory part)
            is accepted and written to the current working directory.
    """
    # os.makedirs('') raises, so only create a directory when there is one.
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logger.setLevel(logging.INFO)

    # Detach *and close* old handlers; clearing the list alone would leave
    # their file handles open.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))

    logger.addHandler(file_handler)


def llm_call(question: str) -> str:
    """Send ``question`` to the chat model and return the reply text.

    The request carries a fixed system prompt that instructs the model to
    match an editorial to an article by author name and to answer in a
    prescribed format. Token usage is written to the module logger when the
    server reports it.

    Args:
        question: The user message (editorial plus candidate articles).

    Returns:
        The content of the first completion choice.
    """
    # NOTE(review): base_url / api_key are placeholders; supply real values
    # through configuration rather than committing them.
    client = openai.OpenAI(base_url="xxx",api_key="sk-xxx")
    system_prompt = """# Requirement
You are a text comparison and selection expert. Please think step by step. 
- First, extract the names from the editorial. In the editorial, the name that typically appears is the author's last name. For example, the full name in article "Tomi K. Baikie" might appear in the form of "Baikie et al." in the editorial.
- Then search for the corresponding names among the authors in the articles and present the full name of the author in the article. The full name MUST have the name you extracted from the editorial.
- And then select the href of the article which hasthe name you extracted in the "Author names" part, analyze the similarity between its abstract and the editorial, and provide your analysis. If you find multiple articles with the name, please return all the hrefs.
- Finally, return the href of the article you selected. The href is in the format of 'https://www.cell.com/cell/fulltext/S0092-8674(23)00979-0'.
- If you cannot find the corresponding name in the article, you can return the href of the article based on the similarity between the article abstract and the editorial.
Note: You should rely on the author name in the editorial to find the article. Please analyze this carefully, think step by step, and ensure accurate identification. MAKE SURE you return in the format specified in the format. You MUST begin your answer with 'Name in the editorial is:' and end with 'The href is:'. 

# Response Format
Name in the editorial is : last name; Full name of the author in the article is: full name; Analysis between the article abstract and the editorial is: analysis; The href is: ['href']"""
    response = client.chat.completions.create(
        model = "Qwen/Qwen2.5-7B-Instruct",
        messages =[
            # System instructions go first, ahead of the user turn: that is the
            # conventional order and some chat templates reject a system
            # message that appears after a user message.
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': question},
        ],
    )

    # `usage` is optional in the response; skip the log line when it is absent.
    usage = response.usage
    if usage is not None:
        logger.info(f"Token used: {usage.total_tokens},prompt_tokens: {usage.prompt_tokens},completion_tokens: {usage.completion_tokens}")

    return response.choices[0].message.content

def article_extraction(editorial, articles):
    """Ask the LLM which of ``articles`` the given ``editorial`` is about.

    Builds the user prompt from the editorial text and the article listing,
    forwards it to ``llm_call`` and returns the model's raw answer.
    """
    # Assembled line by line; joined with newlines this yields exactly the
    # prompt text (leading newline, 4-space indents, trailing indent) that is
    # sent to the model.
    prompt_lines = [
        "",
        "    One editorial summaries one article. I am searching the article related to the editorial:",
        f"    {editorial}",
        "",
        "    The atricles and their correspending hrefs are shown as follows:",
        f"    {articles}",
        "",
        "    Please directly output the href of the article in square bracket and add 'https://www.cell.com' in front of it. ",
        "    ",
    ]
    return llm_call("\n".join(prompt_lines))


def extract_years(content):
    """Collect the years offered by the tab navigation of an issue-list page.

    Looks at every ``<li role="presentation" class="tab__nav__item">`` and
    reads the ``title`` of its ``a.tab__nav__item__link``.

    Args:
        content: HTML markup of the page.

    Returns:
        The titles that parse as integers, as ``int``, in document order.
        Tabs without a link, without a title, or with a non-numeric title
        (presumably decade labels such as "2020s") are skipped.
    """
    soup = BeautifulSoup(content, 'html.parser')

    tabs = soup.find_all('li', {'role': 'presentation', 'class': 'tab__nav__item'})
    years_list = []
    for tab in tabs:
        link = tab.find('a', {'class': 'tab__nav__item__link'})
        title = link.get('title') if link else None
        if not title:
            continue
        try:
            years_list.append(int(title))
        except ValueError:
            # Not a plain year. Parsing instead of testing for the letter "s"
            # also keeps unexpected labels from crashing the whole extraction.
            continue
    return years_list

@retry_on_failure(max_retries=5, delay=10)
def get_earliest_year(url):
    """Return the earliest year listed on a journal's list-of-issues page.

    The ``/group`` part is removed from ``url``, the page is loaded in headless
    Chromium (using the module-level ``proxies`` and ``headers``), a debug
    screenshot is written to ``debug/content.png`` and the year tabs are parsed
    with ``extract_years``.

    Args:
        url: Journal link, e.g. ``https://pubs.acs.org/loi/chreay/group``.

    Returns:
        The smallest year found in the tab navigation.

    Raises:
        ValueError: If the page exposes no year tabs (for example because a
            block/challenge page was served). Like any other failure, this is
            retried by the decorator before it reaches the caller.
    """
    url = url.replace('/group','')
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True,proxy=proxies)
        try:
            context = browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                accept_downloads=True,
                # User Agent
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',

                extra_http_headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    'Sec-Fetch-Dest': 'document',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-Site': 'none',
                    'Sec-Fetch-User': '?1'
                },

                locale='en-US',
                timezone_id='America/New_York',
                geolocation={'latitude': 40.7128, 'longitude': -74.0060},
                permissions=['geolocation']
            )
            page = context.new_page()
            page.set_extra_http_headers(headers)

            page.goto(url, wait_until='networkidle', timeout=60000)
            logger.info("页面初始加载完成")
            content = page.content()

            # Keep a screenshot of what was actually served, for debugging.
            page.screenshot(path='debug/content.png')
        finally:
            # Close the browser even when navigation or the screenshot fails.
            browser.close()

    years_list = extract_years(content)
    logger.info(f"years_list: {years_list}")
    if not years_list:
        raise ValueError(f"No year tabs found at {url}")
    # min() rather than the last element: the result must not depend on the
    # order in which the page happens to list its tabs.
    return min(years_list)

def article_url_reformat(article_url):
    """Rewrite a ``https://doi.org/`` link as the matching pubs.acs.org DOI link."""
    doi_prefix, acs_prefix = "https://doi.org/", "https://pubs.acs.org/doi/"
    return article_url.replace(doi_prefix, acs_prefix)

def reformat_cover_url(issue_url,current_year):
    """Derive the extra-large cover image URL from an issue's table-of-contents URL.

    Args:
        issue_url: URL containing ``/toc/<journal_code>/<volume>/<issue>``.
        current_year: Year inserted into the asset name.

    Returns:
        The cover image URL, or ``None`` when ``issue_url`` has no such
        ``/toc/...`` segment.
    """
    found = re.search(r'/toc/([^/]+)/(\d+)/(\d+)', issue_url)
    if found is None:
        return None

    journal_code, volume, issue = found.groups()
    # The same asset identifier appears twice in the URL: as folder and as file stem.
    asset = f"{journal_code}.{current_year}.{volume}.issue-{issue}"
    return f"https://pubs.acs.org/cms/10.1021/{asset}/asset/{asset}.xlargecover.jpg"
@retry_on_failure(max_retries=5, delay=10)
def request_web(year_url):
    """Load ``year_url`` in headless Chromium and return the page's HTML.

    Uses the module-level ``proxies`` and ``headers2``. After navigation the
    cookie-consent banner is dismissed when present (best effort).

    Args:
        year_url: URL of the page to fetch.

    Returns:
        The page markup, or ``""`` when reading the page content fails (a
        debug screenshot is attempted in that case). Navigation errors are not
        swallowed: they propagate and are retried by the decorator.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True,proxy=proxies)
        try:
            context = browser.new_context(
                # Basic configuration
                viewport={'width': 1920, 'height': 1080},
                accept_downloads=True,

                # User Agent
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',

                extra_http_headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    'Sec-Fetch-Dest': 'document',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-Site': 'none',
                    'Sec-Fetch-User': '?1'
                },

                locale='en-US',
                timezone_id='America/New_York',
                geolocation={'latitude': 40.7128, 'longitude': -74.0060},
                permissions=['geolocation']
            )
            page = context.new_page()
            page.set_extra_http_headers(headers2)

            page.goto(year_url, timeout=60000)

            try:
                # Best effort: the consent banner is optional, so a failure
                # here must not abort the fetch.
                try:
                    cookie_button = page.locator('#onetrust-accept-btn-handler')
                    if cookie_button.is_visible(timeout=5000):
                        cookie_button.click()
                except Exception:
                    pass

                page_content = page.content()
            except Exception as e:
                print(f"Error loading page: {e}")
                page_content = ""
                # The raw URL contains '/' and ':' and is not usable as a
                # single file name, so reduce it to safe characters first.
                safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', year_url)
                try:
                    page.screenshot(path=f"debug/page_{safe_name}.png", full_page=True)
                except Exception as screenshot_error:
                    # The screenshot is only a debugging aid.
                    print(f"Error saving debug screenshot: {screenshot_error}")
        finally:
            # Close the browser on every path, including navigation errors.
            browser.close()
    return page_content

def request_web_pic(url,path):
    """Find the cover image on an issue page and save a screenshot of it.

    Opens ``url`` in headless Chromium (module-level ``proxies`` / ``headers``),
    dismisses the cookie banner when present, writes a debug screenshot to
    ``debug/debug.png``, locates the ``<img>`` inside the
    ``niHeader_covers-lg pull-left`` div, then opens the image URL in a second
    page (``headers2``) and screenshots that page to ``path``.

    Args:
        url: Issue page URL.
        path: Destination file for the cover screenshot.

    Raises:
        Exception: "No cover image found" when the cover container, its
            ``<img>`` or the image's ``src`` is missing.
    """
    # Local import: resolves the image `src` against the site root.
    from urllib.parse import urljoin

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True,proxy=proxies)
        try:
            context = browser.new_context(
                # Basic configuration
                viewport={'width': 1920, 'height': 1080},
                accept_downloads=True,

                # User Agent
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',

                # Extra HTTP headers
                extra_http_headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    'Sec-Fetch-Dest': 'document',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-Site': 'none',
                    'Sec-Fetch-User': '?1'
                },

                # Locale, time zone and geolocation
                locale='en-US',
                timezone_id='America/New_York',
                geolocation={'latitude': 40.7128, 'longitude': -74.0060},
                permissions=['geolocation']
            )
            page = context.new_page()
            page.set_extra_http_headers(headers)
            page.goto(url, timeout=60000)

            # Best effort: the consent banner is optional.
            try:
                cookie_button = page.locator('#onetrust-accept-btn-handler')
                if cookie_button.is_visible(timeout=5000):
                    cookie_button.click()
            except Exception:
                pass
            page.screenshot(path='debug/debug.png')

            # Locate the cover image. Check the container before descending
            # into it so a missing div yields the intended error rather than
            # an AttributeError on None.
            soup = BeautifulSoup(page.content(), 'html.parser')
            cover_div = soup.find('div', class_='niHeader_covers-lg pull-left')
            cover_img = cover_div.find('img') if cover_div else None
            cover_src = cover_img.get('src') if cover_img else None
            if not cover_src:
                raise Exception("No cover image found")

            # urljoin gives the same result as plain prefixing for
            # root-relative paths and also copes with absolute or relative src.
            cover_url = urljoin('https://pubs.acs.org', cover_src)
            page = context.new_page()
            page.set_extra_http_headers(headers2)
            page.goto(cover_url, wait_until='networkidle', timeout=60000)

            # Screenshot of the page showing the image
            page.screenshot(path=path)
        finally:
            # Close the browser on every path, including the "not found" error.
            browser.close()
    return

def download_cover(url,path):
    """Open a cover image URL in headless Chromium and screenshot it to ``path``.

    Uses the module-level ``proxies`` and ``headers``. The saved file is a
    screenshot of the rendered page (1920x1080 viewport), not the raw image
    bytes.

    Args:
        url: Direct URL of the cover image.
        path: Destination file for the screenshot.

    Raises:
        Exception: When the server answers with a non-2xx status. Without this
            check an error or block page would be saved as if it were the
            cover, and the existing file would then prevent any later retry.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True,proxy=proxies)
        try:
            context = browser.new_context(
                # Basic configuration
                viewport={'width': 1920, 'height': 1080},
                accept_downloads=True,

                # User Agent
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',

                # Extra HTTP headers
                extra_http_headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    'Sec-Fetch-Dest': 'document',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-Site': 'none',
                    'Sec-Fetch-User': '?1'
                },

                # Locale, time zone and geolocation
                locale='en-US',
                timezone_id='America/New_York',
                geolocation={'latitude': 40.7128, 'longitude': -74.0060},
                permissions=['geolocation']
            )
            page = context.new_page()
            page.set_extra_http_headers(headers)

            # goto() returns the main response (or None); refuse to save
            # anything when the server reported an error.
            response = page.goto(url, timeout=60000)
            if response is not None and not response.ok:
                raise Exception(f"Cover request failed with HTTP {response.status}: {url}")
            page.screenshot(path=path)
        finally:
            # Close the browser on every path.
            browser.close()
    return

def _add_issue_row(table, data):
    """Append one issue's record to the results table; no-op when no table was given."""
    if table is None:
        return
    table.add_data(data['Journal'],
                   data['Volume'],
                   data['Issue'],
                   data['cover image path'],
                   data['cover story path'],
                   data['cover article path'],
                   data['articles path'])


def _collect_other_articles(issue_soup):
    """Map each listed article's URL to its abstract, skipping entries that lack either."""
    articles = {}
    for div in issue_soup.find_all('div', class_='issue-item_metadata'):
        heading = div.find('h5', class_='issue-item_title')
        title = heading.find('a') if heading is not None else None
        if title is None or not title.get('href'):
            continue
        abstract_span = div.find('span', class_='hlFld-Abstract')
        paragraph = abstract_span.find('p') if abstract_span is not None else None
        if paragraph is None:
            continue
        articles['https://pubs.acs.org'+title['href']] = paragraph.text.strip()
    return articles


def run_one_journal(start_volume, end_volume, issues_num,journal,journal_url,table=None):
    """Crawl one journal year by year and save cover, cover story and article data.

    Starting at ``start_volume`` and counting down to ``end_volume`` (both are
    used as calendar years in the listing URL), the issue list of each year is
    fetched and every issue is processed until at least ``issues_num`` issues
    have been seen. For each issue the function writes, under the current
    working directory:

    * ``./Cover/<journal>/<year>_<n>.png``: cover image
    * ``./Story/<journal>/<year>_<n>.txt``: cover-story text
    * ``./Article/<journal>/<year>_<n>.txt``: URL of the cover article
    * ``./Other_Articles/<journal>/<year>_<n>.json``: other articles' URL -> abstract

    Args:
        start_volume: First (latest) year to crawl.
        end_volume: Last (earliest) year to crawl, inclusive.
        issues_num: Stop starting new years once this many issues were listed.
        journal: Journal name; used for folder names and table rows.
        journal_url: Base list-of-issues URL that ``/d<decade>.y<year>`` is
            appended to.
        table: Optional object with an ``add_data`` method (a ``wandb.Table``
            in this script) that receives one row per issue. When ``None``,
            nothing is recorded.

    Raises:
        Exception: Whatever ``request_web`` raises for an issue page after its
            own retries; issue-list failures are logged and the year skipped.
    """
    issues = 0
    path_article = f'./Article/{journal}/'
    path_editorial = f'./Story/{journal}/'
    path_cover = f'./Cover/{journal}/'
    path_other_articles = f'./Other_Articles/{journal}/'
    for directory in (path_article, path_editorial, path_cover, path_other_articles):
        os.makedirs(directory, exist_ok=True)
    logger.info(f"======Running journal: {journal}, start_volume: {start_volume}, end_volume: {end_volume}, issues_num: {issues_num}=======")
    while start_volume >= end_volume and issues < issues_num:
        logger.info(f"current start_volume: {start_volume}")
        # Decade bucket of the listing URL, e.g. 2025 -> d2020, 1995 -> d1990.
        # Computed instead of hard-coded so years before 2000 and after 2029
        # also land in their own decade.
        years = (start_volume // 10) * 10
        target_url = journal_url + f'/d{years}.y{start_volume}'
        logger.info(f"target_url: {target_url}")
        try:
            retry_count = 0
            while retry_count <= 2:
                issues_content = request_web(target_url)
                issue_url_soup = BeautifulSoup(issues_content, 'html.parser')
                issue_urls = issue_url_soup.find_all('div', class_='loi__issue')
                issue_urls = ['https://pubs.acs.org'+issue.find('a')['href'] for issue in issue_urls]
                logger.info(f"issue_urls: {issue_urls}")
                issues += len(issue_urls)
                logger.info(f"volume: {start_volume}, issues: {issues}")
                if len(issue_urls) > 0:
                    break
                else:
                    retry_count += 1
                    logger.info(f"retry_count: {retry_count}")
        except Exception as e:
            logger.error(f"Error getting issue urls for the {journal} volume {start_volume}: {e}")
            start_volume -= 1
            continue

        for issue_num,issue_url in enumerate(issue_urls):
            data = {'Journal':journal,'Volume':start_volume,'Issue':issue_num+1,'cover image path':'','cover story path':'','cover article path':'','articles path':''}
            filename = str(start_volume)+ "_" + str(issue_num+1)
            cover_file = path_cover+filename+'.png'
            editorial_file = path_editorial+filename+'.txt'
            article_file = path_article+filename+'.txt'
            other_articles_file = path_other_articles+filename+'.json'

            # Check for a complete issue *before* fetching its page, so reruns
            # do not spend a browser session on issues that are already done.
            if os.path.exists(article_file) and os.path.exists(editorial_file) and os.path.exists(cover_file) and os.path.exists(other_articles_file):
                logger.info(f"{filename} already downloaded")
                data['cover image path'] = cover_file
                data['cover story path'] = editorial_file
                data['cover article path'] = article_file
                data['articles path'] = other_articles_file
                _add_issue_row(table, data)
                continue

            page_content = request_web(issue_url)
            issue_soup = BeautifulSoup(page_content, 'html.parser')

            # download cover
            try:
                if not os.path.exists(cover_file):
                    cover_url = reformat_cover_url(issue_url, start_volume)
                    if cover_url:
                        logger.info(f"cover_url: {cover_url}")
                        download_cover(cover_url,cover_file)
                    else:
                        logger.info(f"cover_url is issue_url: {issue_url}")
                        request_web_pic(issue_url,cover_file)

                    logger.info(f"{filename} cover saved")
                    data['cover image path'] = cover_file
                else:
                    logger.info(f"{filename} cover already exists")
                    data['cover image path'] = cover_file
            except Exception as e:
                logger.error(f"{filename} cover missing:{e}")

            # download editorials and article link
            try:
                if not os.path.exists(editorial_file) or not os.path.exists(article_file) or not os.path.exists(other_articles_file):
                    editorial_div = issue_soup.find('div', class_='niHeader_about-caption-content')

                    try:
                        other_articles_dict = _collect_other_articles(issue_soup)
                    except Exception as e:
                        logger.error(f"{filename} other articles missing:{e}")
                        other_articles_dict = {}

                    if editorial_div:
                        try:
                            with open(editorial_file, 'w',encoding='utf-8') as f:
                                f.write(editorial_div.text.strip().replace('\n', ' '))
                            logger.info(f"{filename} editorial saved")
                            data['cover story path'] = editorial_file
                        except Exception as e:
                            logger.error(f"{filename} editorial missing:{e}")
                            data['cover story path'] = editorial_file
                            _add_issue_row(table, data)
                            continue

                        try:
                            # An article link can only exist when there is a cover story.
                            article_url = article_url_reformat(editorial_div.find('a')['href'])
                            with open(article_file, 'w',encoding='utf-8') as f:
                                f.write(article_url)
                            logger.info(f"{filename} article saved")
                            data['cover article path'] = article_file
                            # The cover article itself is not one of the "other" articles.
                            if article_url in other_articles_dict:
                                del other_articles_dict[article_url]
                            with open(other_articles_file, 'w',encoding='utf-8') as f:
                                json.dump(other_articles_dict, f, ensure_ascii=False, indent=4)
                            logger.info(f"{filename} other articles saved")
                            data['articles path'] = other_articles_file
                        except Exception as e:
                            logger.info(f"the cover story of {filename} donot have article link:{e}")
                            # TODO: handle the case where the cover story has no
                            # article link (other articles are not saved here);
                            # this needs the articles prompt to be built earlier.
                    else:
                        logger.error(f"{filename} editorial and article missing")
                        with open(other_articles_file, 'w',encoding='utf-8') as f:
                            json.dump(other_articles_dict, f, ensure_ascii=False, indent=4)
                        logger.info(f"{filename} other articles saved")
                        # Record the file that was just written, as the branch
                        # with a cover story does.
                        data['articles path'] = other_articles_file
                else:
                    logger.info(f"{filename} editorial, article and other articles already exist")
                    data['cover story path'] = editorial_file
                    data['cover article path'] = article_file
                    data['articles path'] = other_articles_file
            except Exception as e:
                logger.error(f"{filename} editorial div or article div missing:{e}")
            _add_issue_row(table, data)
        start_volume -= 1
        logger.info(f"current start_volume: {start_volume}, end_volume: {end_volume}, issues_num: {issues_num}")
    # wandb.log({f'{journal}': table})



# Script entry point: crawl every ACS journal listed in the dataset CSV, from
# `start_volume` (a year) back to the earliest year each journal's page lists.
if __name__ == "__main__":
    setup_logger()
    # url = "https://pubs.acs.org/loi/chreay/group"
    # end_year = get_earliest_year(url)
    # run_one_journal(2025,end_year,1000,'test','https://pubs.acs.org/loi/chreay/group')
    start_volume = 2025
    cell_journals = read_journal_data('./CNS_cover/dataset_list.csv')
    all_issues = 0
    count = 0
    columns = ['Journal','Volume','Issue','cover image path','cover story path','cover article path','articles path']
    try:
        for journal in tqdm(cell_journals):
            # if count < 24:
            #     count += 1
            #     continue
            # if not journal['journal'] == 'One Earth':
            #     continue
            print(journal['journal'])
            table = wandb.Table(columns=columns)
            logger.info(f"+++++++++Current count: {count}+++++++++")
            try:
                end_volume = get_earliest_year(journal['link'])
            except Exception as e:
                logger.error(f"Error getting volumes for {journal['journal']}: {e}")
                continue

            logger.info(f"*******************End Volume: {end_volume}, URL: {journal['link']}*******************")
            try:
                run_one_journal(start_volume,end_volume,int(journal['number']),journal['journal'],journal['link'],table)
            except Exception as e:
                logger.error(f"Error running one journal for {journal['journal']}: {e}")
                # Uploading the partial table is best effort. wandb.log raises
                # when no run is active, and an exception escaping this
                # handler would abort the remaining journals.
                try:
                    wandb.log({f'{journal["journal"]}': table})
                except Exception as log_error:
                    logger.error(f"Could not upload partial table for {journal['journal']}: {log_error}")
                continue
            all_issues += journal['number']
            logger.info(f"*******************Completed: {all_issues}**********************")

            count += 1
    except KeyboardInterrupt:
        # wandb.log({'ACS': table})
        raise
    except Exception as e:
        logger.error(f"Error running journals: {e}")
        # wandb.log({'ACS': table})
        raise

    
