import requests
from bs4 import BeautifulSoup
import os
import json
from playwright.sync_api import sync_playwright
import logging
import wandb
from tqdm import tqdm
import pandas as pd
# Initialise Weights & Biases.
# The API key is taken from the WANDB_API_KEY environment variable so that no
# credential lives in the source tree. When the variable is unset, key=None is
# passed and wandb falls back to its own credential lookup.
# NOTE(review): the key that used to be committed on this line should be
# revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
wandb.init(project="CNS_cover", name="Science-Spider")


# Extra HTTP headers attached to every Playwright page through
# page.set_extra_http_headers() in request_web() and request_web_pic().
# NOTE(review): the Cookie value looks like a browser session captured by hand
# (it includes a cf_clearance token) - presumably it expires and has to be
# refreshed; confirm before relying on it.
# NOTE(review): the User-Agent below (Chrome/93) differs from the user_agent
# passed to browser.new_context() (Chrome/120) in the request helpers - verify
# which one is meant to be sent.
headers2 = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Cookie': 'OptanonAlertBoxClosed=2024-12-02T05:43:25.876Z; MAID=7SztV6y3n2LkV/lHzOiE7A==; at_check=true; AMCVS_4D6368F454EC41940A4C98A6%40AdobeOrg=1; MACHINE_LAST_SEEN=2024-12-19T18%3A39%3A13.105-08%3A00; JSESSIONID=aaadJ4s6vmRN7a364Fypz; cf_clearance=jrsR.Lt3e4UwkSzoTuiikVsXb0dX.Fx4kyI5gRKLxME-1734663979-1.2.1.1-J_ZXvzYVlcnKoNM.bU1mOYTPMdiNPZgifmULkfUQObI.bFOA2haCW9UIwd1L9VW5dPfTVmBXcS5BDgnytt.UNRRwKv7I46td7WYvs8iyTgRSmB1DC.AqZrOjof560ahX3FrxIN_DxRvlPbJ8EtU1g2c18lgAi1qlZoWoW24cjgmk.FWiHsqKaE1UeIAmGslp7qe8XtRL1J3m0YvgZRJ0InyogMFNDr.yo405QqonbDaA6GhHA341imAEH0fSWjys4oaLeCRagyCpnEXTzDfvennRsb21peKoEk5btedrVp8ldB8eIcwp8GPsnUIaig5G2n6rBR4FCxNgF9pN2geg.WJv3oLQ3qdhXYPhsFB28i2UIXO9zN0iTwcyZDuxMptNoaXEH52s82uw18xBFd.6Gg; AMCV_4D6368F454EC41940A4C98A6%40AdobeOrg=179643557%7CMCIDTS%7C20078%7CMCMID%7C82070259187192443663734658279114766445%7CMCAID%7CNONE%7CMCOPTOUT-1734671189s%7CNONE%7CvVersion%7C5.5.0; OptanonConsent=isGpcEnabled=0&datestamp=Fri+Dec+20+2024+11%3A06%3A29+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202402.1.0&browserGpcFlag=0&isIABGlobal=false&hosts=&consentId=a7b9ace1-acfe-4d1b-933f-36af08d827f1&interactionCount=1&isAnonUser=1&landingPath=NotLandingPage&groups=1%3A1%2C3%3A1%2C2%3A1%2C4%3A1&geolocation=%3B&AwaitingReconsent=false'
}

# Create directories (done per journal inside run_one_journal)


# Module-wide logger; its file handler is attached by setup_logger().
logger = logging.getLogger('science_processor')

def setup_logger(log_file: str = "./debug/science_processing.log") -> None:
    """Configure the 'science_processor' logger to write to ``log_file``.

    Handlers already attached to the logger are closed and removed, then a
    single UTF-8 ``FileHandler`` at INFO level is attached, so calling this
    function repeatedly never duplicates log lines.

    Args:
        log_file: Path of the log file. Its parent directory is created when
            missing; a bare file name (no directory part) is written to the
            current working directory.
    """
    # Same object as the module-level ``logger``: logging.getLogger returns
    # one shared instance per name.
    log = logging.getLogger('science_processor')

    log_dir = os.path.dirname(log_file)
    if log_dir:
        # os.makedirs('') raises, so only create a directory when there is one.
        os.makedirs(log_dir, exist_ok=True)
    log.setLevel(logging.INFO)

    # Close the old handlers before dropping them; clearing the list alone
    # would leave their log files open.
    for old_handler in list(log.handlers):
        log.removeHandler(old_handler)
        old_handler.close()

    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
    log.addHandler(file_handler)

def read_journal_data(csv_path: str) -> list:
    """Load the journal list and keep the entries hosted on science.org.

    The CSV must provide the columns ``Journal``, ``Link`` and ``Number``.

    Args:
        csv_path: Path of the CSV file.

    Returns:
        list: One dict per kept row, in file order, with the keys
        ``journal`` (whitespace-stripped when it is a string), ``link``
        (whitespace-stripped string) and ``number`` (the ``Number`` cell).
    """
    df = pd.read_csv(csv_path)

    science_journals = []
    # Iterating the columns directly (instead of the ``.str`` accessor) keeps
    # this working when a column holds no strings at all, e.g. every link is
    # empty and pandas reads the column as float NaN.
    for journal, link, number in zip(df['Journal'], df['Link'], df['Number']):
        # A missing link becomes the text 'nan', which never matches below.
        link = str(link).strip()
        if 'science.org' not in link:
            continue
        if isinstance(journal, str):
            journal = journal.strip()
        science_journals.append({
            'journal': journal,
            'link': link,
            'number': number,
        })

    return science_journals

def request_web(url):
    """Load ``url`` in headless Chromium (Playwright) and return its HTML.

    A fresh browser is launched for every call and always closed again. The
    page is given the ``headers2`` extra headers and up to 60 seconds to reach
    the 'networkidle' state.

    Args:
        url: Address of the page to load.

    Returns:
        str: The page markup, or "" when loading failed. On failure a
        full-page screenshot is attempted under ``debug/`` as a debugging
        aid; problems while taking it are logged and never raised.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                viewport={'width': 1920, 'height': 1080},
            )
            page = context.new_page()
            page.set_extra_http_headers(headers2)

            try:
                page.goto(url, wait_until='networkidle', timeout=60000)
                page_content = page.content()
            except Exception as e:
                logger.error(f"Error loading page: {e}")
                page_content = ""
                # Best-effort debug screenshot: a failure here must not turn
                # the documented "" result into an exception.
                try:
                    os.makedirs("debug", exist_ok=True)
                    page.screenshot(path=f"debug/page_{url.split('/')[-1]}.png", full_page=True)
                except Exception as shot_error:
                    logger.error(f"Error saving debug screenshot: {shot_error}")
        finally:
            # Runs on every path, including errors while creating the context.
            browser.close()
    return page_content

def request_web_pic(url, path):
    """Save a screenshot of the image page at ``url`` to ``path``.

    The image is not downloaded as a file: ``url`` is opened in headless
    Chromium with a 1920x1080 viewport and the ``headers2`` extra headers,
    the page is given 5 seconds to settle, and a screenshot is written.

    Args:
        url: Address of the cover image.
        path: Destination file for the screenshot.

    Returns:
        bool: True when the screenshot was written, False when loading or
        capturing failed (the error is logged, not raised).
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                viewport={'width': 1920, 'height': 1080},
            )
            page = context.new_page()
            page.set_extra_http_headers(headers2)

            try:
                page.goto(url)
                page.wait_for_timeout(5000)
                page.screenshot(path=path)
            except Exception as e:
                logger.error(f"Error downloading image: {e}")
                return False
            return True
        finally:
            # Runs on every path, including the early return above.
            browser.close()

def _cover_story_block(issue_soup):
    """Return the cover-story <div> of an issue page, or None when absent."""
    if issue_soup is None:
        return None
    block = issue_soup.find('div', class_='text-ellipses collapse truncated')
    if block is None:
        block = issue_soup.find('div', class_='text-ellipses collapse')
    return block


def _save_cover(issue_soup, cover_file, filename):
    """Capture the issue cover into ``cover_file``; return the path or ''."""
    if os.path.exists(cover_file):
        logger.info(f"Cover {filename} already exists")
        return cover_file
    try:
        cover_url = 'https://www.science.org' + issue_soup.find('div', class_='cover-image__image').find('img').get('src')
        request_web_pic(cover_url, cover_file)
    except Exception as e:
        logger.error(f"Error when getting issue {filename} cover: {e}")
        return ''
    # request_web_pic logs its own failures instead of raising, so the file
    # on disk is the only reliable success signal.
    if not os.path.exists(cover_file):
        logger.error(f"Error when getting issue {filename} cover: screenshot was not written")
        return ''
    return cover_file


def _save_cover_story(story_block, story_file, filename):
    """Write the cover-story text to ``story_file``; return the path or ''."""
    if os.path.exists(story_file):
        logger.info(f"Editorial {filename} already exists")
        return story_file
    try:
        story_text = story_block.get_text()
        with open(story_file, 'w', encoding='utf-8') as f:
            f.write(story_text)
    except Exception as e:
        logger.error(f"Error when getting issue {filename} editorial: {e}")
        return ''
    logger.info(f"{filename} editorial saved")
    return story_file


def _save_articles(issue_soup, story_block, article_file, others_file, filename):
    """Write cover-article links and article abstracts; return both paths.

    Returns the pair ``(cover article path, articles path)``; both are ''
    when anything failed.
    """
    if os.path.exists(article_file) and os.path.exists(others_file):
        logger.info(f"Article {filename} already exists")
        logger.info(f"Other articles {filename} already exists")
        return article_file, others_file
    try:
        if story_block is None:
            raise ValueError("cover story block not found")
        # Links inside the cover story point at the cover article(s).
        cover_links = [str(a['href']) for a in story_block.find_all('a') if a.get('href')]

        content_dict = {}
        for card in issue_soup.find_all('div', class_='card border-bottom pb-3 mb-3'):
            anchor = card.find('a')
            abstract = card.find('div', class_='accordion__content')
            # Cards without a link or without an abstract are skipped.
            if anchor is None or not anchor.get('href') or abstract is None:
                continue
            content_dict["https://www.science.org" + anchor['href']] = abstract.text.strip()

        # Everything is parsed before a file is opened, so a parsing error
        # cannot leave a half-written file that a later run treats as done.
        with open(article_file, 'w', encoding='utf-8') as f:
            f.writelines(link + '\n' for link in cover_links)
        with open(others_file, 'w', encoding='utf-8') as f:
            json.dump(content_dict, f, indent=4)
    except Exception as e:
        logger.error(f"Error when getting issue {filename} article: {e}")
        return '', ''
    logger.info(f"{filename} article saved")
    logger.info(f"{filename} other articles saved")
    return article_file, others_file


def run_one_journal(start_year, end_year, issues_num, journal, journal_url):
    """Scrape covers, cover stories and abstracts for one science.org journal.

    Years are visited from ``start_year`` down to ``end_year`` (inclusive).
    For every issue linked from a year page, four files named
    ``<volume>_<issue>`` are produced under ``./Cover``, ``./Story``,
    ``./Article`` and ``./Other_Articles`` (each in a ``<journal>``
    sub-directory). One row per issue is added to a wandb table that is
    logged under the journal name at the end. Files that already exist are
    reused, so an interrupted run can be resumed.

    Args:
        start_year: Most recent year to visit.
        end_year: Oldest year to visit.
        issues_num: No further year is started once at least this many issue
            links have been collected.
        journal: Journal name; used for the output directories and as the
            wandb log key.
        journal_url: URL prefix to which ``d<decade>.y<year>`` is appended to
            form the address of a year page.
    """
    path_article = f'./Article/{journal}/'
    path_editorial = f'./Story/{journal}/'
    path_cover = f'./Cover/{journal}/'
    path_other_articles = f'./Other_Articles/{journal}/'
    for out_dir in (path_article, path_editorial, path_cover, path_other_articles):
        os.makedirs(out_dir, exist_ok=True)

    columns = ['Journal', 'Year', 'Issue', 'cover image path', 'cover story path', 'cover article path', 'articles path']
    table = wandb.Table(columns=columns)
    issues = 0

    while start_year >= end_year and issues < issues_num:
        # The year page lists the links to every issue of that year.
        year_url = journal_url + 'd' + str(start_year // 10 * 10) + '.y' + str(start_year)
        logger.info(f"target url: {year_url}, cur year: {start_year}")
        try:
            page_content = request_web(year_url)
        except Exception as e:
            logger.error(f"Error when getting year {str(start_year)}: {e}")
            start_year -= 1
            continue

        year_soup = BeautifulSoup(page_content, 'html.parser')
        issue_urls_list = []
        for cell in year_soup.find_all('div', class_='col-12 col-sm-3 col-lg-2 mb-4 mb-sm-3'):
            anchor = cell.find('a')
            # A cell without a usable link must not abort the whole journal.
            if anchor is not None and anchor.get('href'):
                issue_urls_list.append(anchor['href'])
        logger.info(f"issue urls: {issue_urls_list}")
        issues += len(issue_urls_list)

        for issue_url in issue_urls_list:
            # The 4th and 5th path segments of the href are volume and issue.
            numbers = issue_url.split("/")[3:]
            if len(numbers) < 2:
                logger.error(f"Unexpected issue url format: {issue_url}")
                continue
            volume, issue = numbers[0], numbers[1]
            filename = volume + "_" + issue

            cover_file = path_cover + filename + '.png'
            story_file = path_editorial + filename + '.txt'
            article_file = path_article + filename + '.txt'
            others_file = path_other_articles + filename + '.json'

            # Checked before any network access: a finished issue costs
            # nothing on a re-run.
            # NOTE(review): the 'Year' column receives the volume number, as
            # in the original rows - confirm whether the calendar year was
            # intended.
            if all(os.path.exists(f) for f in (cover_file, story_file, article_file, others_file)):
                logger.info(f"Issue {filename} already exists")
                table.add_data(journal, volume, issue, cover_file, story_file, article_file, others_file)
                continue

            full_issue_url = 'https://www.science.org' + issue_url
            logger.info(f"issue url: {full_issue_url}")
            try:
                issue_soup = BeautifulSoup(request_web(full_issue_url), 'html.parser')
            except Exception as e:
                logger.error(f"Error when getting issue {filename}: {e}")
                # Never fall back to the page of the previous issue.
                issue_soup = None

            # Looked up once per issue and shared by the two steps that need
            # it, so the article step never sees a block from another issue.
            story_block = _cover_story_block(issue_soup)

            cover_path = _save_cover(issue_soup, cover_file, filename)
            story_path = _save_cover_story(story_block, story_file, filename)
            cover_article_path, articles_path = _save_articles(
                issue_soup, story_block, article_file, others_file, filename)

            table.add_data(journal, volume, issue, cover_path, story_path, cover_article_path, articles_path)
        start_year -= 1

    wandb.log({f'{journal}': table})

if __name__ == "__main__":
    # Entry point: scrape every science.org journal listed in the dataset CSV.
    setup_logger()
    cell_journals = read_journal_data('/home/ubuntu/scratch/mhjiang/CNS_cover/dataset_list.csv')

    # The year window is the same for every journal (newest first).
    start_year = 2025
    end_year = 2005

    all_issues = 0  # sum of the 'number' field over the journals that finished
    count = 0       # journals that finished without an uncaught error
    for journal in tqdm(cell_journals):
        print(journal['journal'])
        logger.info(f"+++++++++Current count: {count}+++++++++")
        logger.info(f"*******************Start Years: {start_year}, End Year: {end_year}, URL: {journal['link']}*******************")
        try:
            run_one_journal(start_year, end_year, journal['number'], journal['journal'], journal['link'])
        except Exception as e:
            # Top-level boundary: keep going with the next journal, but keep
            # the traceback in the log so the failure can be diagnosed.
            logger.exception(f"Error running one journal for {journal['journal']}: {e}")
            continue
        all_issues += journal['number']
        logger.info(f"*******************Completed: {all_issues}**********************")
        count += 1