import requests
from bs4 import BeautifulSoup
import os
import re
from playwright.sync_api import sync_playwright
import openai
import pandas as pd
import logging
from tqdm import tqdm
import wandb
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# from bert_score import score
import json
# Supply your own Weights & Biases API key through the WANDB_API_KEY
# environment variable rather than committing it to source control.
# When the variable is unset, key=None is passed and wandb is left to resolve
# credentials by its own means.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
wandb.init(project="CNS_cover", name="Cell-Spider")

def llm_call(question: str) -> str:
    """Send *question* to the chat model and return the text of its reply.

    The API key is read from the ``SILICONFLOW_API_KEY`` environment variable
    (``OPENAI_API_KEY`` is accepted as a fallback) so that no credential lives
    in the source file.

    Args:
        question: The user prompt (editorial text plus candidate articles).

    Returns:
        The content of the first completion choice, or an empty string when
        the service returns no text.

    Raises:
        RuntimeError: If neither environment variable provides an API key.
    """
    api_key = os.environ.get("SILICONFLOW_API_KEY") or os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No API key configured: set the SILICONFLOW_API_KEY environment variable"
        )
    client = openai.OpenAI(base_url="https://api.siliconflow.cn/v1", api_key=api_key)
    systemPrompt = """# Requirement
You are a text comparison and selection expert. Please think step by step. 
- First, extract the names from the editorial. In the editorial, the name that typically appears is the author’s last name. For example, the full name in article “Tomi K. Baikie” might appear in the form of “Baikie et al.” in the editorial.
- Then search for the corresponding names among the authors in the articles and present the full name of the author in the article. The full name MUST have the name you extracted from the editorial.
- And then select the href of the article which hasthe name you extracted in the "Author names" part, analyze the similarity between its abstract and the editorial, and provide your analysis. If you find multiple articles with the name, please return all the hrefs.
- Finally, return the href of the article you selected. The href is in the format of 'https://www.cell.com/cell/fulltext/S0092-8674(23)00979-0'.
- If you cannot find the corresponding name in the article, you can return the href of the article based on the similarity between the article abstract and the editorial.
Note: You should rely on the author name in the editorial to find the article. Please analyze this carefully, think step by step, and ensure accurate identification. MAKE SURE you return in the format specified in the format. You MUST begin your answer with 'Name in the editorial is:' and end with 'The href is:'. 

# Response Format
Name in the editorial is : last name; Full name of the author in the article is: full name; Analysis between the article abstract and the editorial is: analysis; The href is: ['href']"""
    response = client.chat.completions.create(
        model="Qwen/Qwen2.5-7B-Instruct",
        # The system instruction is placed before the user turn, which is the
        # conventional ordering for chat-completion requests.
        messages=[
            {'role': 'system', 'content': systemPrompt},
            {'role': 'user', 'content': question},
        ],
    )
    usage = response.usage
    if usage is not None:
        logger.info(f"Token used: {usage.total_tokens},prompt_tokens: {usage.prompt_tokens},completion_tokens: {usage.completion_tokens}")

    # The message content can be absent; keep the declared ``str`` return type.
    return response.choices[0].message.content or ""

def article_extraction(editorial, articles):
    """Ask the LLM which listed article the given editorial is about.

    Args:
        editorial: Text of the cover editorial.
        articles: Text block listing the candidate articles and their hrefs.

    Returns:
        Whatever ``llm_call`` returns for the assembled prompt.
    """
    # The prompt is assembled line by line; the resulting text is identical to
    # a single indented triple-quoted block (including the trailing space
    # after "in front of it.").
    prompt = (
        "\n"
        "    One editorial summaries one article. I am searching the article related to the editorial:\n"
        f"    {editorial}\n"
        "\n"
        "    The atricles and their correspending hrefs are shown as follows:\n"
        f"    {articles}\n"
        "\n"
        "    Please directly output the href of the article in square bracket and add 'https://www.cell.com' in front of it. \n"
        "    "
    )
    return llm_call(prompt)

def extract_answer_pattern(answer):
    """Extract the article URLs from the ``The href is: [...]`` part of a reply.

    Args:
        answer: Raw text returned by the LLM. ``None``, non-string values and
            empty strings are tolerated.

    Returns:
        A list of the URLs found inside every ``The href is: [...]`` bracket,
        in order of appearance; an empty list when nothing matches.
    """
    # A missing reply (None) must yield "no URLs" rather than a TypeError.
    if not isinstance(answer, str) or not answer:
        return []

    # Optional whitespace around the colon and DOTALL let the bracketed list
    # span several lines; single-line replies match exactly as before.
    href_pattern = r"The href is\s*:\s*\[(.*?)\]"
    url_pattern = r'https?://[^\s,\'\"]+(?:\.\w+)+(?:/[^\s,\'\"]*)*'

    urls = []
    for bracket_content in re.findall(href_pattern, answer, flags=re.DOTALL):
        urls.extend(re.findall(url_pattern, bracket_content))
    return urls

# start_volume = get_volumes(cell_url)
# request_web_in_volume(cell_url)

# Site root that request_web_pic prepends to the cover image's src.
url_base = 'https://www.cell.com'
# Replaces the original static volume-number definition

# Proxy mapping with empty entries; not referenced anywhere else in the visible source.
proxies = {
  'http': '',
  'https': '',
}
# Minimal User-Agent header set; not referenced anywhere else in the visible source.
kv = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}
# Extra HTTP headers that get_volumes attaches to its Playwright page.
# NOTE(review): the Cookie value looks like a captured browser session
# (cf_clearance, JSESSIONID, December 2024 timestamps) and has presumably
# expired - confirm and refresh before relying on it.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Cookie': 'OptanonAlertBoxClosed=2024-12-02T05:43:25.876Z; MAID=7SztV6y3n2LkV/lHzOiE7A==; at_check=true; AMCVS_4D6368F454EC41940A4C98A6%40AdobeOrg=1; cf_clearance=Dg69vf_atkTIx0BbnMWFTLzyHz9e8Mps0YTAWtrxAfM-1734589507-1.2.1.1-Yt4So.wzyw3gkPK3LrMvS.SxGK9y2_ToLUKLMQjaPtR0GpMux_7LR5GCX_trXmxXVB_g70_6GuGzgAk.uODonFxFPulJD3ugl6e7xU1PE1uifXa3phlJD.Kwv3PmLtPe32iMyIVPOY2hgV7tQwo_dkGV3htYso4lowSmnDF2ACU9IkAm074_0sSC7J7ytBSTDCWomUCb5_WvskWm6cpnN1fmcJRsIPj68EwOKTLSnXkdFzZfXaK6h9_O5xXep_VAN3wlbPJEPq5En.dPGjfwRgDrDAoE57K60K4NRvryHZXAZyWz3gnPOw3pRRijwhaSyhAPkwvxKkzqE.hJgKwh6bhCavxXO31Q6qtyCOzy9NcsZgi1spoga9JtEKc9VdpqFzlvjgUTK5QoNG8ixGGStQ; AMCV_4D6368F454EC41940A4C98A6%40AdobeOrg=179643557%7CMCIDTS%7C20076%7CMCMID%7C82070259187192443663734658279114766445%7CMCAID%7CNONE%7CMCOPTOUT-1734596878s%7CNONE%7CvVersion%7C5.5.0; OptanonConsent=isGpcEnabled=0&datestamp=Thu+Dec+19+2024+14%3A27%3A58+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202402.1.0&browserGpcFlag=0&isIABGlobal=false&hosts=&consentId=a7b9ace1-acfe-4d1b-933f-36af08d827f1&interactionCount=1&isAnonUser=1&landingPath=NotLandingPage&groups=1%3A1%2C3%3A1%2C2%3A1%2C4%3A1&geolocation=%3B&AwaitingReconsent=false; MACHINE_LAST_SEEN=2024-12-18T23%3A16%3A22.569-08%3A00; JSESSIONID=aaafGU2xWAcd3_4_zpypz; __cf_bm=_klulnadNjuIJ2Hm5Zk8Sr9CdmSgKtP6cwEMF9gCHbU-1734592582-1.0.1.1-Ww9iU86rdW4X5hJ0Fv565NkVVownF6Q2rirjMrR.D223DtDh_eaxjK5HkT3AsXIG5DggKWkF4SPab5PvrZO4xA'
}
# Extra HTTP headers that request_web and request_web_pic attach to their pages.
# Same header names as `headers`, with a different captured Cookie value.
headers2 = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Cookie': 'OptanonAlertBoxClosed=2024-12-02T05:43:25.876Z; MAID=7SztV6y3n2LkV/lHzOiE7A==; at_check=true; AMCVS_4D6368F454EC41940A4C98A6%40AdobeOrg=1; MACHINE_LAST_SEEN=2024-12-19T18%3A39%3A13.105-08%3A00; JSESSIONID=aaadJ4s6vmRN7a364Fypz; cf_clearance=jrsR.Lt3e4UwkSzoTuiikVsXb0dX.Fx4kyI5gRKLxME-1734663979-1.2.1.1-J_ZXvzYVlcnKoNM.bU1mOYTPMdiNPZgifmULkfUQObI.bFOA2haCW9UIwd1L9VW5dPfTVmBXcS5BDgnytt.UNRRwKv7I46td7WYvs8iyTgRSmB1DC.AqZrOjof560ahX3FrxIN_DxRvlPbJ8EtU1g2c18lgAi1qlZoWoW24cjgmk.FWiHsqKaE1UeIAmGslp7qe8XtRL1J3m0YvgZRJ0InyogMFNDr.yo405QqonbDaA6GhHA341imAEH0fSWjys4oaLeCRagyCpnEXTzDfvennRsb21peKoEk5btedrVp8ldB8eIcwp8GPsnUIaig5G2n6rBR4FCxNgF9pN2geg.WJv3oLQ3qdhXYPhsFB28i2UIXO9zN0iTwcyZDuxMptNoaXEH52s82uw18xBFd.6Gg; AMCV_4D6368F454EC41940A4C98A6%40AdobeOrg=179643557%7CMCIDTS%7C20078%7CMCMID%7C82070259187192443663734658279114766445%7CMCAID%7CNONE%7CMCOPTOUT-1734671189s%7CNONE%7CvVersion%7C5.5.0; OptanonConsent=isGpcEnabled=0&datestamp=Fri+Dec+20+2024+11%3A06%3A29+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202402.1.0&browserGpcFlag=0&isIABGlobal=false&hosts=&consentId=a7b9ace1-acfe-4d1b-933f-36af08d827f1&interactionCount=1&isAnonUser=1&landingPath=NotLandingPage&groups=1%3A1%2C3%3A1%2C2%3A1%2C4%3A1&geolocation=%3B&AwaitingReconsent=false'
}

# Module-wide logger object shared by the functions in this file.
logger = logging.getLogger('journal_processor')

def setup_logger(log_file: str = "./debug/journal_processing.log") -> None:
    """Configure the module-wide logger to write INFO-level records to a file.

    Any handlers already attached to the logger are removed and closed first,
    so calling this function repeatedly neither duplicates log lines nor
    leaves earlier log files open.

    Args:
        log_file: Path of the log file. Its parent directory is created when
            the path has one; a bare file name is written to the current
            working directory.
    """
    # os.makedirs("") raises, so only create a directory when one is named.
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logger.setLevel(logging.INFO)

    # Detach and close previous handlers instead of merely forgetting them.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setLevel(logging.INFO)

    formatter = logging.Formatter('%(asctime)s - %(message)s')
    file_handler.setFormatter(formatter)

    logger.addHandler(file_handler)


# start and end volume
# start_volume= 186
# end_volume = 140



def extract_volumes(html_content):
    """Extract the volume numbers and journal metadata from an archive page.

    Args:
        html_content: HTML of the journal's archive ("list of issues") page.

    Returns:
        A ``(volumes, journal_info)`` tuple, also when nothing is found:

        * ``volumes`` - ascending list of volume numbers (ints).
        * ``journal_info`` - dict with ``'years'``: the first volume number
          seen each time the four-digit group year in ``data-groupid`` drops
          below the previous one (starting from 2020), and, when a link
          carries it, ``'publication_code'`` taken from the link's
          ``publicationCode`` query parameter.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    volumes = []
    journal_info = {'years': []}

    # Locate the list-of-issues container; without it there is nothing to
    # parse, but the return shape stays the same so callers can unpack it.
    issues_container = soup.find(class_="list-of-issues")
    if not issues_container:
        return volumes, journal_info

    volume_wrappers = issues_container.find_all("div", class_="list-of-issues__group__wrapper--issues")
    former_years = 2020
    for wrapper in volume_wrappers:
        # Each wrapper is expected to hold an <h3> whose <a> describes the volume.
        volume_header = wrapper.find("h3")
        volume_link = volume_header.find('a') if volume_header else None
        if not volume_link:
            continue

        # Take the publication code from the first link that provides one,
        # rather than only from the wrapper at index 0.
        if 'publication_code' not in journal_info:
            code_match = re.search(r'publicationCode=([^&]+)', volume_link.get('href', ''))
            if code_match:
                journal_info['publication_code'] = code_match.group(1)

        # Skip links whose data-groupid does not look like "d<year>.v<volume>".
        group_match = re.search(r'd(\d{4})\.v(\d+)', volume_link.get('data-groupid', ''))
        if not group_match:
            continue
        cur_years, volume = (int(part) for part in group_match.groups())
        volumes.append(volume)
        if cur_years < former_years:
            # Record the volume at which the group year changes.
            journal_info['years'].append(volume)
            former_years = cur_years

    return sorted(volumes), journal_info

def request_web(year_url):
    """Load *year_url* in headless Chromium and return the rendered HTML.

    Args:
        year_url: URL of the page to load.

    Returns:
        The page's HTML, or an empty string when reading the content fails
        after navigation (a debug screenshot is attempted in that case).

    Raises:
        Exception: Errors raised while launching the browser or navigating
            propagate to the caller.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                viewport={'width': 1920, 'height': 1080},
            )
            page = context.new_page()
            page.set_extra_http_headers(headers2)
            page.goto(year_url, wait_until='networkidle', timeout=60000)

            try:
                # Dismissing the cookie banner is best effort: a missing or
                # unclickable button must not prevent reading the page.
                try:
                    cookie_button = page.locator('#onetrust-accept-btn-handler')
                    if cookie_button.is_visible(timeout=5000):
                        cookie_button.click()
                except Exception:
                    pass

                page_content = page.content()
            except Exception as e:
                logger.error(f"Error loading page: {e}")
                # URLs contain "/", ":" and "?", so reduce them to a flat,
                # filesystem-safe name before using them in the path.
                safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', year_url)[:150]
                try:
                    os.makedirs("debug", exist_ok=True)
                    page.screenshot(path=f"debug/page_{safe_name}.png", full_page=True)
                except Exception as shot_error:
                    logger.error(f"Error saving debug screenshot: {shot_error}")
                page_content = ""
        finally:
            # Close the browser even when navigation raises.
            browser.close()
    return page_content

def get_volumes(url):
    """Open a journal's archive page and return its newest volume number.

    Args:
        url: URL of the journal's archive page.

    Returns:
        A ``(latest_volume, journal_info)`` tuple, where ``latest_volume`` is
        the largest volume number found and ``journal_info`` is the metadata
        dict produced by ``extract_volumes``.

    Raises:
        ValueError: If no volume could be extracted from the page.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            logger.info("正在访问网址...")

            page.set_extra_http_headers(headers)
            page.goto(url, wait_until='networkidle', timeout=60000)
            logger.info("页面初始加载完成")

            # Wait until the accordion controls are present before reading.
            page.wait_for_selector(".accordion__control", timeout=60000)
            logger.info("等待内容容器加载完成")

            content = page.content()
        finally:
            # Close the browser even when loading the page raises.
            browser.close()

    # Accept both an empty result and a (volumes, info) pair from the parser,
    # and report "nothing found" explicitly instead of failing on unpacking.
    extracted = extract_volumes(content)
    if not extracted or not extracted[0]:
        raise ValueError(f"No volumes found at {url}")
    volumes, journal_info = extracted
    return max(volumes), journal_info

# 使用方法



def request_web_pic(url,path):
    """Save a screenshot of an issue's full cover image to *path*.

    The page at *url* is loaded, the ``<img>`` inside the element with id
    ``fullCover`` is located, the image URL is opened in a second page and a
    screenshot of that page is written to *path*.

    Args:
        url: URL of the page that contains the ``fullCover`` element.
        path: Destination file for the screenshot.

    Raises:
        Exception: "No cover image found" when the container or the image
            source is missing; browser and navigation errors also propagate.
    """
    from urllib.parse import urljoin

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                viewport={'width': 1920, 'height': 1080},
                accept_downloads=True
            )
            page = context.new_page()
            page.set_extra_http_headers(headers2)
            page.goto(url, wait_until='networkidle', timeout=60000)
            # Dismissing the cookie banner is best effort only.
            try:
                cookie_button = page.locator('#onetrust-accept-btn-handler')
                if cookie_button.is_visible(timeout=5000):
                    cookie_button.click()
            except Exception:
                pass

            # Key step: find the cover image URL in the rendered page.
            soup = BeautifulSoup(page.content(), 'html.parser')
            cover_container = soup.find('div', id='fullCover')
            cover_img = cover_container.find('img') if cover_container else None
            if not (cover_img and 'src' in cover_img.attrs):
                raise Exception("No cover image found")

            # urljoin gives the same result as prefixing url_base for
            # root-relative sources and also copes with absolute or
            # protocol-relative ones.
            cover_url = urljoin(url_base, cover_img['src'])

            page = context.new_page()
            page.set_extra_http_headers(headers2)
            page.goto(cover_url, wait_until='networkidle', timeout=60000)

            # Capture the image page.
            page.screenshot(path=path)
        finally:
            # Close the browser even when a step above raises.
            browser.close()
    return

def _normalize_thresholds(threshold_years):
    """Return the first two boundary volumes, padded with 0 to length two."""
    final_threshold = [0, 0]
    for slot, boundary in enumerate(threshold_years[:2]):
        final_threshold[slot] = boundary
    return final_threshold


def _archive_decade(volume, final_threshold):
    """Return the archive group year (2020, 2010 or 2000) used for *volume*."""
    if volume > final_threshold[0]:
        return 2020
    if volume > final_threshold[1]:
        return 2010
    return 2000


def _collect_toc_articles(issue_soup):
    """Collect prompt text and abstracts for the issue's table-of-contents items.

    Returns a ``(prompt_entries, content_dict)`` pair: the per-article text
    blocks handed to the LLM, and a mapping of article URL to abstract text.
    Items without an author list or without a link are skipped; a missing
    abstract is treated as an empty string.
    """
    prompt_entries = []
    content_dict = {}
    for toc_item in issue_soup.find_all('div', class_='toc__item__detials col-md-9 col-lg-10'):
        authors_ul = toc_item.find('ul', class_='toc__item__authors loa rlist--inline')
        link_tag = toc_item.find('a')
        if not authors_ul or link_tag is None or not link_tag.get('href'):
            continue
        authors = authors_ul.find_all('li', class_='loa__item')
        author_names = ", ".join(author.text.strip().rstrip(',') for author in authors)
        url = "https://www.cell.com" + link_tag['href']
        brief = toc_item.find('div', class_='toc__item__brief')
        abstract = brief.text.strip() if brief is not None else ""
        prompt_entries.append(
            f"- Author names: {author_names}; \n"
            f"- Article link: {url}\n"
            f"- Abstract of the article: {abstract}\n"
        )
        content_dict[url] = abstract
    return prompt_entries, content_dict


def run_one_journal(start_volume, end_volume, issues_num,journal,journal_url,publication_code,threshold_years:list,table=None):
    """Download cover, editorial and article data for one journal.

    Volumes are walked downwards from *start_volume* until *end_volume* is
    passed or at least *issues_num* issues have been seen. For every issue the
    cover screenshot, the editorial text, the LLM-selected cover article
    link(s) and a JSON file with the remaining articles are written below
    ``./Cover``, ``./Story``, ``./Article`` and ``./Other_Articles``. Files
    that already exist are not fetched again.

    Args:
        start_volume: Volume number to start from (the highest one).
        end_volume: Lowest volume number to process.
        issues_num: Stop once this many issues have been counted.
        journal: Journal name, used in output paths and log messages.
        journal_url: Journal base URL ending in "/".
        publication_code: Value for the ``publicationCode`` query parameter.
        threshold_years: Boundary volume numbers; volumes above the first use
            group year 2020, volumes above the second 2010, the rest 2000.
        table: Optional object with an ``add_data`` method that receives one
            row per issue; nothing is recorded when it is ``None``.
    """
    # Pre-process threshold_years into exactly two boundary values.
    final_threshold = _normalize_thresholds(threshold_years)
    logger.info(f"final_threshold: {final_threshold}")
    issues = 0
    path_article = f'./Article/{journal}/'
    path_editorial = f'./Story/{journal}/'
    path_cover = f'./Cover/{journal}/'
    path_other_articles = f'./Other_Articles/{journal}/'
    for directory in (path_article, path_editorial, path_cover, path_other_articles):
        os.makedirs(directory, exist_ok=True)
    logger.info(f"======Running journal: {journal}, start_volume: {start_volume}, end_volume: {end_volume}, issues_num: {issues_num}=======")
    while start_volume >= end_volume and issues < issues_num:
        logger.info(f"current start_volume: {start_volume}")
        years = _archive_decade(start_volume, final_threshold)
        # Fetch the links of the volume's issue pages from the archive page.
        target_url = journal_url + f'archive?issueGroupId=d{years}.v{start_volume}&publicationCode={publication_code}'
        logger.info(f"target_url: {target_url}")
        try:
            page_content = request_web(target_url)
            issue_url_soup = BeautifulSoup(page_content, 'html.parser')
            issue_urls = issue_url_soup.find_all('div', attrs={'data-groupid': f'd{years}.v{start_volume}'})
            # The issue links are read from the second element carrying this
            # data-groupid; fewer than two matches is handled as an error below.
            issue_urls = issue_urls[1].find_all('a')
            logger.info(f"issue_urls: {issue_urls}")
            issues += len(issue_urls)
            logger.info(f"volume: {start_volume}, issues: {issues}")
        except Exception as e:
            logger.error(f"Error getting issue urls for {journal}: {e}")
            start_volume -= 1
            continue

        # Download cover, editorial and article links from each issue page.
        for issue_num, issue_url in enumerate(issue_urls):
            data = {'Journal': journal, 'Volume': start_volume, 'Issue': issue_num + 1}
            filename = str(start_volume) + "_" + str(issue_num + 1)
            cover_file = path_cover + filename + '.png'
            editorial_file = path_editorial + filename + '.txt'
            article_file = path_article + filename + '.txt'
            others_file = path_other_articles + filename + '.json'

            # Skip issues for which every output file already exists.
            if (os.path.exists(article_file) and os.path.exists(editorial_file)
                    and os.path.exists(cover_file) and os.path.exists(others_file)):
                logger.info(f"{filename} already downloaded")
                if table is not None:
                    table.add_data(data['Journal'],
                                   data['Volume'],
                                   data['Issue'],
                                   cover_file,
                                   editorial_file,
                                   article_file,
                                   others_file)
                continue

            # Reset per issue so that a failed fetch can never fall back to
            # the page parsed for the previous issue.
            issue_soup = None
            try:
                issue_url = 'https://www.cell.com' + issue_url['href']
                logger.info(f"issue_url: {issue_url}")
                page_content = request_web(issue_url)
                issue_soup = BeautifulSoup(page_content, 'html.parser')
            except Exception as e:
                logger.error(f"{filename} missing:{e}")

            # Download the cover.
            try:
                if not os.path.exists(cover_file):
                    if issue_soup is None:
                        raise RuntimeError("issue page could not be loaded")
                    cover_url = issue_url + issue_soup.find('a', class_='toc-cover__wrapper--image')['href']
                    # The image is captured with Playwright.
                    request_web_pic(cover_url, cover_file)
                    data['cover image path'] = cover_file
                    logger.info(f"{filename} cover saved")
                else:
                    data['cover image path'] = cover_file
                    logger.info(f"{filename} cover already exists")
            except Exception as e:
                data['cover image path'] = ''
                logger.error(f"{filename} cover missing:{e}")

            # Download the editorial.
            try:
                if not os.path.exists(editorial_file):
                    if issue_soup is None:
                        raise RuntimeError("issue page could not be loaded")
                    editorial = issue_soup.find('div', class_='article-header__info__scrollable')
                    editorial = editorial.get_text()
                    with open(editorial_file, 'w', encoding='utf-8') as f:
                        f.write(editorial)
                    data['cover story path'] = editorial_file
                    logger.info(f"{filename} editorial saved")
                else:
                    data['cover story path'] = editorial_file
                    logger.info(f"{filename} editorial already exists")
            except Exception as e:
                data['cover story path'] = ''
                logger.error(f"{filename} editorail missing:{e}")

            # Identify the cover article and store the remaining articles.
            try:
                if not os.path.exists(article_file) or not os.path.exists(others_file):
                    if issue_soup is None:
                        raise RuntimeError("issue page could not be loaded")
                    editorial = issue_soup.find('div', class_='article-header__info__scrollable')
                    editorial_text = editorial.get_text()
                    prompt_entries, content_dict = _collect_toc_articles(issue_soup)
                    article_href = "\n\n".join(prompt_entries)

                    # Ask the LLM which article the editorial refers to.
                    answer = article_extraction(editorial_text, article_href)
                    logger.info(f"answer: {answer}")

                    matches = extract_answer_pattern(answer)
                    with open(article_file, "w", encoding='utf-8') as file:
                        for matched_url in matches:
                            file.write(matched_url + '\n')
                            # Selected articles are removed from the "other
                            # articles" mapping; unknown URLs are only logged.
                            try:
                                del content_dict[matched_url]
                            except KeyError as e:
                                logger.error(f"Error deleting content_dict: {e}")
                                continue

                    with open(others_file, "w", encoding='utf-8') as json_file:
                        json.dump(content_dict, json_file, indent=4)
                    data['cover article path'] = article_file
                    data['articles path'] = others_file

                    logger.info(f"{filename} article success")
                else:
                    data['cover article path'] = article_file
                    data['articles path'] = others_file
                    logger.info(f"{filename} article already exists")
            except Exception as e:
                data['cover article path'] = ''
                data['articles path'] = ''
                logger.error(f"{filename} article missing:{e}")

            if table is not None:
                table.add_data(data['Journal'],
                               data['Volume'],
                               data['Issue'],
                               data['cover image path'],
                               data['cover story path'],
                               data['cover article path'],
                               data['articles path'])

        start_volume -= 1
        logger.info(f"current start_volume: {start_volume}, end_volume: {end_volume}, issues_num: {issues}")
    
    
def read_journal_data(csv_path: str) -> list:
    """Load the journal list from a CSV file and keep the cell.com entries.

    The file is read with pandas; the ``Journal`` and ``Link`` columns are
    stripped of surrounding whitespace and ``Number`` is passed through as is.

    Args:
        csv_path: Path of the CSV file.

    Returns:
        list: One dict per row whose link contains ``cell.com``, with the keys
        ``journal``, ``link`` and ``number``.
    """
    frame = pd.read_csv(csv_path)

    # Normalise the two text columns before filtering.
    for column in ('Journal', 'Link'):
        frame[column] = frame[column].str.strip()

    # str() turns a missing link (NaN) into "nan", which never contains
    # "cell.com" and is therefore filtered out.
    return [
        {
            'journal': record['Journal'],
            'link': str(record['Link']),
            'number': record['Number'],
        }
        for _, record in frame.iterrows()
        if 'cell.com' in str(record['Link'])
    ]

if __name__ == "__main__":
    # Script entry point: process every cell.com journal listed in the CSV
    # and log the collected rows to Weights & Biases as one table.
    setup_logger()
    end_volume = 1
    cell_journals = read_journal_data('/home/ubuntu/scratch/mhjiang/CNS_cover/dataset_list.csv')
    issues_completed = 0
    journals_completed = 0
    columns = ['Journal','Volume','Issue','cover image path','cover story path','cover article path','articles path']
    table = wandb.Table(columns=columns)
    # Captures the journal's base URL: everything before "/archive" or "/issues".
    base_link_pattern = re.compile(r'(https://www\.cell\.com/[^/]+(?:/[^/]+)?(?=/(?:archive|issues)))')
    for journal in tqdm(cell_journals):
        journal_name = journal['journal']
        print(journal_name)
        logger.info(f"+++++++++Current count: {journals_completed}+++++++++")

        # Step 1: newest volume number and journal metadata.
        try:
            latest_volume, journal_info = get_volumes(journal['link'])
        except Exception as e:
            logger.error(f"Error getting volumes for {journal_name}: {e}")
            continue

        # Step 2: reduce the archive link to the journal's base URL.
        try:
            link_match = base_link_pattern.search(journal['link'])
            journal['link'] = link_match.group(1) + '/'
        except Exception as e:
            logger.error(f"Error getting journal link for {journal_name}: {e}")
            continue
        logger.info(f"*******************Volumes: {latest_volume}, Journal Info: {journal_info}, URL: {journal['link']}*******************")

        # Step 3: download everything for this journal.
        try:
            run_one_journal(
                latest_volume,
                end_volume,
                int(journal['number']),
                journal_name,
                journal['link'],
                journal_info['publication_code'],
                journal_info['years'],
                table,
            )
        except Exception as e:
            logger.error(f"Error running one journal for {journal_name}: {e}")
            continue

        issues_completed += journal['number']
        logger.info(f"*******************Completed: {issues_completed}**********************")
        journals_completed += 1
    wandb.log({'Cell': table})
    # start the loop for each volume and download editorials in each volume
    