import requests
from bs4 import BeautifulSoup
import os
import re
import openai
import logging
import pandas as pd
from tqdm import tqdm
import json
import wandb
# wandb.login(key="75c71a00697e97575abad4cafddb5cfc37de3305")
# wandb.init(project="CNS_cover", name="Nature-Spider")

logger = logging.getLogger('journal_processor')

def setup_logger(log_file: str = "./debug/journal_processing.log") -> None:
    """Configure the module-level ``logger`` to write INFO records to *log_file*.

    Handlers already attached to the logger are closed and removed first, so
    calling this function repeatedly neither duplicates log lines nor leaves
    earlier log files open.

    Args:
        log_file: Path of the log file. Missing parent directories are
            created; a bare file name (no directory component) is written to
            the current working directory.
    """
    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when the path actually contains one.
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logger.setLevel(logging.INFO)

    # Close each old handler explicitly; handlers.clear() alone would drop the
    # references while the underlying files stay open.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
    logger.addHandler(file_handler)
# Proxy mapping handed to every requests.get call in this file; both schemes
# are mapped to an empty string.
proxies = dict.fromkeys(('http', 'https'), '')

# Request headers (a desktop Chrome User-Agent) sent with every requests.get
# call in this file.
kv = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.132 Safari/537.36'
    ),
}

def extract_volumes(journal_url):
    """Return the highest volume number listed on a journal's volumes page.

    Args:
        journal_url: URL of the page that lists the journal's volumes.

    Returns:
        int: The largest volume number found among the "view volume" links.

    Raises:
        requests.RequestException: If the page cannot be fetched or the server
            answers with an HTTP error status.
        ValueError: If no volume number can be extracted from the page.
    """
    journal_response = requests.get(url=journal_url, headers=kv, timeout=120, proxies=proxies)
    # Fail early on 4xx/5xx instead of parsing an error page.
    journal_response.raise_for_status()
    journal_response.encoding = 'utf-8'
    journal_soup = BeautifulSoup(journal_response.text, 'html.parser')

    # Anchors that point at a volume page and are tagged as "view volume".
    volume_links = journal_soup.find_all(
        'a',
        attrs={
            'href': re.compile(r'/\w+/volumes/\d+'),
            'data-track': 'click',
            'data-track-action': 'view volume'
        }
    )

    volumes = set()
    for link in volume_links:
        # Prefer the visible label ("Volume 30"); tolerate extra text around
        # the number instead of letting int() fail on it.
        found = re.search(r'Volume\s*(\d+)', link.get_text())
        if found is None:
            # Fall back to the number embedded in the href.
            found = re.search(r'/volumes/(\d+)', link['href'])
        if found is not None:
            volumes.add(int(found.group(1)))

    if not volumes:
        # max() on an empty collection would raise an uninformative ValueError.
        raise ValueError(f"No volume links found on {journal_url}")

    return max(volumes)




# start and end volume

# nature_url = 'https://www.nature.com/ng/volumes/' # nature genetics


# Helper that calls the LLM API
def llm_call(question: str) -> str:
    """Send *question* to the chat-completion endpoint and return the reply text.

    The request carries a fixed system prompt that tells the model how to match
    an editorial to an article and in which format to answer. Token usage is
    written to the module logger when the response reports it.

    Args:
        question: The user message (editorial text plus candidate articles).

    Returns:
        The ``content`` of the first choice in the response.
    """
    # NOTE(review): a credential is committed in source. The environment
    # variable takes precedence; the literal is kept only so existing runs keep
    # working. Rotate the key and drop the fallback.
    api_key = os.environ.get(
        "SILICONFLOW_API_KEY",
        "sk-ogfnmwnolxpgzpisetqjbgikyqawdazfjuhcavykqyphvgvc",
    )
    client = openai.OpenAI(base_url="https://api.siliconflow.cn/v1", api_key=api_key)

    # Built from explicit pieces so that the trailing spaces that are part of
    # the prompt stay visible and cannot be stripped by an editor.
    system_prompt = (
        "# Requirement\n"
        "You are a text comparison and selection expert. Please think step by step. \n"
        "- First, extract the names from the editorial. In the editorial, the name that typically appears is the author’s last name. For example, the full name in article “Tomi K. Baikie” might appear in the form of “Baikie et al.” in the editorial.\n"
        "- Then search for the corresponding names among the authors in the articles and present the full name of the author in the article. The full name MUST have the name you extracted from the editorial.\n"
        '- And then select the href of the article which hasthe name you extracted in the "Author names" part, analyze the similarity between its abstract and the editorial, and provide your analysis. If you find multiple articles with the name, please return all the hrefs.\n'
        "- Finally, return the href of the article you selected. The href is in the format of 'https://www.cell.com/cell/fulltext/S0092-8674(23)00979-0'.\n"
        "- If you cannot find the corresponding name in the article, you can return the href of the article based on the similarity between the article abstract and the editorial.\n"
        "Note: You should rely on the author name in the editorial to find the article. Please analyze this carefully, think step by step, and ensure accurate identification. MAKE SURE you return in the format specified in the format. You MUST begin your answer with 'Name in the editorial is:' and end with 'The href is:'. \n"
        "\n"
        "# Response Format\n"
        "Name in the editorial is : last name; Full name of the author in the article is: full name; Analysis between the article abstract and the editorial is: analysis; The href is: ['href']"
    )

    response = client.chat.completions.create(
        model="Qwen/Qwen2.5-7B-Instruct",
        # The system message goes first so it frames the user message that
        # follows it.
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': question},
        ],
    )

    usage = response.usage
    if usage is not None:
        logger.info(f"Token used: {usage.total_tokens},prompt_tokens: {usage.prompt_tokens},completion_tokens: {usage.completion_tokens}")

    return response.choices[0].message.content

def article_extraction(editorial, articles):
    """Ask the LLM which of *articles* the *editorial* is about.

    Args:
        editorial: Text of the editorial / cover story.
        articles: Text block describing the candidate articles and their hrefs.

    Returns:
        Whatever ``llm_call`` returns for the assembled prompt.
    """
    # The prompt is assembled piece by piece; the leading indentation and the
    # trailing space after "it." are part of the text sent to the model.
    prompt = (
        "\n"
        "    One editorial summaries one article. I am searching the article related to the editorial:\n"
        f"    {editorial}\n"
        "\n"
        "    The atricles and their correspending hrefs are shown as follows:\n"
        f"    {articles}\n"
        "\n"
        "    Please directly output the href of the article in square bracket and add 'https://www.nature.com' in front of it. \n"
        "    "
    )
    return llm_call(prompt)

def issue_article_extration(issue_url,path_article,path_articles,filename):
    """Identify the cover article of one issue via the LLM and save the result.

    The issue page is fetched, the editorial paragraph and every article card
    that has an abstract are collected, and ``article_extraction`` is asked to
    pick the matching article. Two files are written:

    * ``path_article + filename + '.txt'``: every href found in the LLM
      answer in the form ``['...']``, one per line.
    * ``path_articles + filename + '.json'``: mapping of article URL to
      abstract for all collected cards.

    Nothing is done if the ``.txt`` file already exists. Errors are logged and
    swallowed; the function always returns ``None``.

    Args:
        issue_url: URL of the issue page.
        path_article: Directory prefix for the ``.txt`` result.
        path_articles: Directory prefix for the ``.json`` result.
        filename: Base file name without extension.
    """
    path = path_article+filename+'.txt'
    if os.path.exists(path):
        logger.info(f"{filename} article already exists")
        return

    try:
        issue_response = requests.get(url=issue_url, headers=kv, timeout=120, proxies=proxies)
        issue_response.encoding = 'utf-8'
        issue_soup = BeautifulSoup(issue_response.text, 'html.parser')

        # Check for the editorial before doing any other work: without it
        # there is nothing to match the articles against.
        editorial_tag = issue_soup.find('p', attrs={'data-promo-text-threshold': '560'})
        if editorial_tag is None:
            logger.error(f"{filename} article missing,Error: editorial paragraph not found on {issue_url}")
            return
        editorial = editorial_tag.text

        cards = issue_soup.find_all('div', attrs={'class': 'c-card__body u-display-flex u-flex-direction-column'})
        content_dict = {}
        summaries = []
        for card in cards:
            link_tag = card.find('a')
            abstract_tag = card.find('div', attrs={'itemprop': 'description'})
            # Skip cards that cannot be offered to the LLM (no link or no
            # abstract) instead of aborting the whole issue.
            if link_tag is None or not link_tag.get('href') or abstract_tag is None:
                continue
            article_url = "https://www.nature.com" + link_tag.get('href')
            article_abstract = abstract_tag.text.strip()
            article_authors = ", ".join(
                author.text.strip()
                for author in card.find_all('span', {'itemprop': 'name'})
            )
            summaries.append(f"Author names: {article_authors}; Article link: {article_url}; Abstract of the article: {article_abstract}")
            content_dict[article_url] = article_abstract

        if not summaries:
            # No candidates: do not spend an LLM call, and leave no file
            # behind so that a later run retries this issue.
            logger.error(f"{filename} article missing,Error: no article cards with abstracts found on {issue_url}")
            return

        articles = '\n\n'.join(summaries)
        url = article_extraction(editorial, articles)
        logger.info(f"url: {url}")

        matches = re.findall(r"\['(.*?)'\]", url)
        if not matches:
            logger.warning(f"{filename} LLM answer contained no href in ['...'] form")

        with open(path, "w", encoding="utf-8") as file:
            file.writelines(match + '\n' for match in matches)
        with open(path_articles+filename+'.json', "w", encoding="utf-8") as json_file:
            json.dump(content_dict, json_file, indent=4)
        logger.info(f"{filename} artcle success")
        logger.info(f"{filename} other articles success")

    except Exception as e:
        logger.error(f"{filename} article missing,Error: {e}")

            
        
       
    
# For TEST
# issue_article_extration('https://www.nature.com/nature/volumes/502/issues/7473','test.txt')

def issue_editorial_extration(issue_url,path_editorial,filename):
    """Download the cover-story text of one issue and save it as UTF-8 text.

    The text is taken from the ``div.app-volumes-cover__copy`` element of the
    issue page and written to ``path_editorial + filename``. Nothing is done
    if that file already exists. Errors are logged and swallowed; the function
    always returns ``None``.

    Args:
        issue_url: URL of the issue page.
        path_editorial: Directory prefix for the output file.
        filename: Output file name (including extension).
    """
    editorial_path = path_editorial+filename
    if os.path.exists(editorial_path):
        logger.info(f"{editorial_path} editorial already exists")
        return

    try:
        issue_response = requests.get(url=issue_url, headers=kv, timeout=120, proxies=proxies)
        issue_response.encoding = 'utf-8'
        issue_soup = BeautifulSoup(issue_response.text, 'html.parser')
        issue_editorial_html = issue_soup.find('div', class_='app-volumes-cover__copy')
        if issue_editorial_html is None:
            logger.error(f"{editorial_path} editorial missing")
            return
        issue_editorial = issue_editorial_html.get_text()
    except Exception as e:
        logger.error(f"{editorial_path} editorial missing,Error: {e}")
        return

    try:
        # Explicit UTF-8: the locale default may not be able to encode the
        # characters found in the editorial text.
        with open(editorial_path, 'w', encoding='utf-8') as file:
            file.write(issue_editorial)
    except Exception as e:
        logger.error(f"{editorial_path} editorial missing,Error: {e}")
        # A failed write can leave an empty or partial file, which the
        # existence check above would treat as "already downloaded" on the
        # next run. Remove it (best effort).
        try:
            os.remove(editorial_path)
        except OSError:
            pass
        return

    logger.info(f"{editorial_path} editorial success")
# For TEST
# issue_editorial_extration('https://www.nature.com/nature/volumes/539/issues/7630','test.txt')
        
def issue_extract_cover(img_tag,path_cover,filename):
    """Download the cover image referenced by *img_tag* and save it to disk.

    The image URL is the tag's ``src`` with ``"w200"`` replaced by ``"w1000"``.
    The bytes are written to ``path_cover + filename``. Nothing is done if
    that file already exists. Download and write errors are logged and
    swallowed.

    Args:
        img_tag: Object supporting ``img_tag['src']`` (the ``<img>`` element).
        path_cover: Directory prefix for the output file.
        filename: Output file name (including extension).

    Raises:
        Whatever ``img_tag['src']`` raises when the attribute is missing; that
        lookup is intentionally left outside the ``try`` so callers see it.
    """
    cover_path = path_cover+filename
    if os.path.exists(cover_path):
        logger.info(f"{cover_path} cover already exists")
        return

    img_url = img_tag['src'].replace("w200", "w1000")
    try:
        img_response = requests.get(img_url, timeout=240, headers=kv, proxies=proxies)
        # Without this, the body of a 4xx/5xx answer would be stored as the
        # image and then be skipped as "already exists" on every later run.
        img_response.raise_for_status()
        with open(cover_path, 'wb') as imgfile:
            imgfile.write(img_response.content)
    except Exception as e:
        logger.error(f"{cover_path} cover missing,Error: {e}")
        # Do not leave a partial file behind (best effort).
        try:
            os.remove(cover_path)
        except OSError:
            pass
        return

    logger.info(f"{cover_path} cover success")

 
def run_one_journal(start_volume,issues_num,journal,journal_url,table=None,end_volume=1):
    """Download cover stories, cover articles and cover images for one journal.

    Volumes are walked downwards from *start_volume*. The loop stops once a
    volume below *end_volume* is reached or at least *issues_num* issues have
    been counted. For every issue of a volume the cover story, the cover
    article (chosen by the LLM) and the cover image are fetched, and one row
    per issue is appended to *table*.

    Output directories (created if missing):
    ``./Article/<journal>/``, ``./Story/<journal>/``, ``./Cover/<journal>/``
    and ``Other_Articles/<journal>/``.

    Args:
        start_volume: Volume number to start from (walked downwards).
        issues_num: Stop after at least this many issues have been counted.
        journal: Journal name; used in directory names and table rows.
        journal_url: URL prefix to which the volume number is appended.
        table: Object with an ``add_data(...)`` method taking seven values
            (the script passes a ``wandb.Table``). ``None`` disables row
            collection.
        end_volume: Lowest volume number to process.
    """
    path_article = f'./Article/{journal}/'
    path_editorial = f'./Story/{journal}/'
    path_cover = f'./Cover/{journal}/'
    path_articles = f'Other_Articles/{journal}/'
    for directory in (path_article, path_editorial, path_cover, path_articles):
        os.makedirs(directory, exist_ok=True)

    issues = 0
    while start_volume >= end_volume and issues < issues_num:
        # NOTE(review): debugging leftover kept to preserve current behaviour:
        # every volume above 22 is skipped without being fetched or counted.
        # Remove (or parameterize) once the full range should be processed.
        if start_volume > 22:
            start_volume -= 1
            continue

        logger.info(f"Current volume: {start_volume}, issues: {issues}")
        try:
            volume_url = journal_url + str(start_volume)
            logger.info(f"Volume URL: {volume_url}")
            volume_response = requests.get(url=volume_url, headers=kv, timeout=240, proxies=proxies)
        except Exception as e:
            logger.error(f"Error when getting volume {str(start_volume)}: {e}")
            start_volume -= 1
            continue

        try:
            volume_response.encoding = 'utf-8'
            volume_soup = BeautifulSoup(volume_response.text, 'html.parser')
            # One anchor per issue of the volume.
            urls_ = volume_soup.find_all('a', class_='kill-hover flex-box-item')
            # The list whose <img> elements are the issue covers.
            ul_tag = volume_soup.find_all(
                'ul',
                class_='ma0 clean-list grid-auto-fill grid-auto-fill-w220 very-small-column medium-row-gap'
            )
        except Exception as e:
            logger.error(f"Error when getting info in volume {str(start_volume)}: {e}")
            start_volume -= 1
            continue

        if not urls_:
            logger.info(f"No issues in volume {str(start_volume)}")
            start_volume -= 1
            continue
        issues += len(urls_)

        # Cover story of each issue; this pass also creates the per-issue
        # record, so datas[i] always describes issue i+1.
        datas = []
        for i, url_ in enumerate(urls_):
            data = {
                'Journal': journal,
                'Volume': start_volume,
                'Issue': i+1,
                'cover story path': '',
            }
            try:
                issue_url = 'https://www.nature.com' + url_.get('href')
                filename = str(start_volume) + '_' + str(i+1)+'.txt'
                issue_editorial_extration(issue_url,path_editorial,filename)
                data['cover story path'] = path_editorial+filename
            except Exception as e:
                logger.error(f"Error when getting editorial in issue {str(i+1)}: {e}")
            datas.append(data)

        # Cover article of each issue.
        for i, url_ in enumerate(urls_):
            try:
                issue_url = 'https://www.nature.com' + url_.get('href')
                filename = str(start_volume) + '_' + str(i+1)
                issue_article_extration(issue_url,path_article,path_articles,filename)
                datas[i]['articles path'] = path_articles+filename+'.json'
                datas[i]['cover article path'] = path_article+filename+'.txt'
            except Exception as e:
                logger.error(f"Error when getting article in issue {str(i+1)}: {e}")
                datas[i]['articles path'] = ''
                datas[i]['cover article path'] = ''

        # Cover image of each issue. The image list is resolved separately so
        # that a missing <ul> is reported on its own and cannot touch loop
        # variables that were never assigned.
        try:
            img_list = ul_tag[0].find_all("img")
            logger.info(f"img_list:{img_list}")
        except Exception as e:
            logger.error(f"Error when getting cover list in volume {str(start_volume)}: {e}")
            img_list = []

        for issue_number, img_tag in enumerate(img_list, start=1):
            if issue_number > len(datas):
                # More images than issue links: there is no record to attach
                # the cover to.
                logger.error(f"cover Error index: {issue_number}, no matching issue in volume {str(start_volume)}")
                break
            filename = str(start_volume) + '_' + str(issue_number) + '.png'
            try:
                issue_extract_cover(img_tag,path_cover,filename)
                datas[issue_number-1]['cover image path'] = path_cover+filename
            except Exception as e:
                logger.error(f"Error when getting cover in issue {str(issue_number)}: {e}")
                datas[issue_number-1]['cover image path'] = ''

        logger.info(f"datas:{datas}")
        if table is not None:
            for issue_data in datas:
                # .get(): a path key is absent when the matching extraction
                # step never ran for this issue (e.g. fewer cover images than
                # issues).
                table.add_data(issue_data['Journal'],
                               issue_data['Volume'],
                               issue_data['Issue'],
                               issue_data.get('cover image path', ''),
                               issue_data['cover story path'],
                               issue_data.get('cover article path', ''),
                               issue_data.get('articles path', ''))
        start_volume -= 1
def read_journal_data(csv_path: str) -> list:
    """Load the journal list from a CSV file and keep the nature.com entries.

    The CSV must provide the columns ``Journal``, ``Link`` and ``Number``.
    Surrounding whitespace is stripped from ``Journal`` and ``Link``.

    Args:
        csv_path: Path of the CSV file.

    Returns:
        list: One dict per row whose link contains ``'nature.com'``, with the
        keys ``'journal'``, ``'link'`` and ``'number'``.
    """
    frame = pd.read_csv(csv_path)

    # Normalise the two text columns before inspecting them.
    for column in ('Journal', 'Link'):
        frame[column] = frame[column].str.strip()

    selected = []
    for record in frame.to_dict('records'):
        name = record['Journal']
        # str() turns a missing link (NaN) into 'nan', which never matches.
        link = str(record['Link'])
        number = record['Number']
        if 'nature.com' not in link:
            continue
        selected.append({'journal': name, 'link': link, 'number': number})

    return selected

if __name__ == "__main__":
    # Script entry point: read the journal list, download everything for the
    # selected journal(s) and, when a wandb run is active, log the result table.
    setup_logger()
    end_volume = 1
    journal_data = read_journal_data('/home/ubuntu/scratch/mhjiang/CNS_cover/dataset_list.csv')
    logger.info("=================Start downloading=================")
    columns = ['Journal','Volume','Issue','cover image path','cover story path','cover article path','articles path']
    table = wandb.Table(columns=columns)
    for journal in tqdm(journal_data):
        # NOTE(review): only this one journal is processed; looks like a
        # temporary filter. Confirm before running the full list.
        if journal['journal'] != 'NATURE MEDICINE':
            continue
        try:
            volumes = extract_volumes(journal['link'])
        except Exception as e:
            logger.error(f"Error getting volumes for {journal['journal']}: {e}")
            continue
        # run_one_journal appends the volume number directly to the link.
        if not journal['link'].endswith('/'):
            journal['link'] = journal['link'] + '/'
        logger.info(f"*******************Volumes: {volumes}, URL: {journal['link']}, Number: {journal['number']}*******************")
        try:
            run_one_journal(volumes,int(journal['number']),journal['journal'],journal['link'],table,end_volume)
        except Exception as e:
            logger.error(f"Error running one journal for {journal['journal']}: {e}")
            continue

    # wandb.log requires an active run. With wandb.init() not called, logging
    # would raise after all downloads have finished, so skip it instead.
    if wandb.run is not None:
        wandb.log({'Nature': table})
    else:
        logger.warning("wandb.init() was not called; skipping wandb.log of the result table")
    
