
import re
import os
import json
import logging
from bs4 import BeautifulSoup
import requests
from playwright.sync_api import sync_playwright
from tqdm import tqdm

# Configure logging: INFO level and above is written both to the file
# "article_extraction.log" and to the console stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("article_extraction.log"),
        logging.StreamHandler()
    ]
)
# Module-wide logger used by every function in this file.
logger = logging.getLogger("article_extractor")

def extract_abstract_from_Nature_html(html_content):
    """
    Extract the abstract text from a Nature article page.

    The function first looks for ``<section data-title="Abstract">`` and joins
    the text of every ``<p>`` inside its ``*-content`` div. If the page has no
    such section, it falls back to the ``<meta name="description">`` content,
    followed by the ``<p class="article__teaser">`` text when that teaser is
    at least 15 characters long.

    Args:
        html_content: HTML string of the article page.

    Returns:
        str: The extracted abstract text (may be empty), or None when no
        abstract-related element is found or an error occurs.
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        abstract_section = soup.find('section', attrs={'data-title': 'Abstract'})
        if abstract_section:
            # The abstract body lives in a div whose id ends with "-content".
            abstract_content = abstract_section.find('div', id=lambda x: x and x.endswith('-content'))
            if not abstract_content:
                logger.warning("在Nature文章中找到abstract section但没有找到content div")
                return None
            paragraphs = abstract_content.find_all('p')
            return ' '.join(p.get_text() for p in paragraphs)

        # Fallback for pages without an abstract section.
        teaser_tag = soup.find('p', class_='article__teaser')
        description_tag = soup.find('meta', attrs={'name': 'description'})
        if not teaser_tag and not description_tag:
            logger.warning("在Nature文章中没有找到任何abstract相关元素")
            return None

        teaser_text = teaser_tag.get_text(strip=True) if teaser_tag else ""
        # A <meta name="description"> tag without a content attribute yields
        # None; normalise it to "" so it can never break the concatenation
        # below and discard a usable teaser.
        description_text = (description_tag.get('content') or "") if description_tag else ""

        # Very short teasers are treated as noise and ignored.
        if len(teaser_text) < 15:
            return description_text

        # Separate the two parts with a space, like the paragraphs in the
        # main branch, instead of gluing the sentences together.
        return ' '.join(part for part in (description_text, teaser_text) if part)
    except Exception as e:
        logger.error(f"提取Nature abstract时出错: {str(e)}")
        return None

def extract_abstract_from_Cell_html(html_content):
    """
    Extract the author abstract from a Cell article page.

    Subscripts and superscripts are flattened into plain text:
    ``<sub>x</sub>`` becomes ``_x`` and ``<sup>x</sup>`` becomes ``^x``.

    Args:
        html_content: HTML string of the article page.

    Returns:
        str: The abstract text with sub/superscript markers, or None when the
        ``author-abstract`` section is missing or an error occurs.
    """
    try:
        # Swap <sub>/<sup> tags for text markers *before* parsing so they
        # survive get_text(). Regexes are used (rather than literal string
        # replacement) so that tags carrying attributes or written in upper
        # case are handled too; \b keeps tags such as <summary> untouched.
        marked_html = re.sub(r'<sub\b[^>]*>', '§SUB§', html_content, flags=re.IGNORECASE)
        marked_html = re.sub(r'</sub\s*>', '§/SUB§', marked_html, flags=re.IGNORECASE)
        marked_html = re.sub(r'<sup\b[^>]*>', '§SUP§', marked_html, flags=re.IGNORECASE)
        marked_html = re.sub(r'</sup\s*>', '§/SUP§', marked_html, flags=re.IGNORECASE)

        # A single parse of the marked HTML serves both extraction paths.
        soup = BeautifulSoup(marked_html, 'html.parser')

        abstract_section = soup.find('section', id='author-abstract')
        if not abstract_section:
            logger.warning("在Cell文章中没有找到author-abstract部分")
            return None

        abstract_divs = abstract_section.find_all('div', role='paragraph')
        if abstract_divs:
            pieces = [div.get_text() for div in abstract_divs]
        else:
            # No paragraph divs: fall back to the whole section text, minus
            # its headings.
            for heading in abstract_section.find_all(['h1', 'h2', 'h3']):
                heading.extract()
            pieces = [abstract_section.get_text()]

        # get_text(strip=True) would strip every text fragment and glue words
        # that sit next to inline tags (e.g. "the <i>x</i> gene" -> "thexgene"),
        # so take the raw text and collapse whitespace instead, as the Science
        # extractor does.
        abstract_text = ' '.join(re.sub(r'\s+', ' ', piece).strip() for piece in pieces)

        # Turn the markers into a readable plain-text notation.
        abstract_text = abstract_text.replace('§SUB§', '_').replace('§/SUB§', '')
        abstract_text = abstract_text.replace('§SUP§', '^').replace('§/SUP§', '')

        return abstract_text
    except Exception as e:
        logger.error(f"提取Cell abstract时出错: {str(e)}")
        return None

def extract_abstract_from_Science_html(html_content):
    """
    Return the text of the first ``<div role="paragraph">`` in the page.

    Args:
        html_content: HTML string of the article page.

    Returns:
        str: Text of the first paragraph div with runs of whitespace collapsed
        to single spaces, or None when no such div exists or an error occurs.
    """
    try:
        document = BeautifulSoup(html_content, 'html.parser')
        paragraph_divs = document.find_all('div', attrs={'role': 'paragraph'})

        # Guard clause: nothing to extract.
        if not paragraph_divs:
            logger.warning("在Science文章中没有找到paragraph元素")
            return None

        raw_text = paragraph_divs[0].get_text()
        # Collapse whitespace runs and trim both ends.
        return re.sub(r'\s+', ' ', raw_text).strip()
    except Exception as e:
        logger.error(f"提取Science abstract时出错: {str(e)}")
        return None

def extract_abstract_from_ACS_html(html_content):
    """
    Extract the abstract text from an ACS article page.

    Args:
        html_content: HTML string of the article page.

    Returns:
        str: The optional sub-title (``h6.article_abstract-sub-title``)
        followed by the abstract paragraphs, separated by blank lines, or
        None when the abstract containers are missing or an error occurs.
    """
    try:
        document = BeautifulSoup(html_content, 'html.parser')

        outer = document.find('div', class_='article_abstract')
        if not outer:
            logger.warning("在ACS文章中没有找到article_abstract div")
            return None

        inner = outer.find('div', class_='article_abstract-content')
        if not inner:
            logger.warning("在ACS文章中没有找到article_abstract-content div")
            return None

        # Optional sub-title heading (e.g. "Conspectus").
        heading = inner.find('h6', class_='article_abstract-sub-title')
        heading_text = heading.get_text().strip() if heading else ""

        # Body paragraphs, separated by blank lines.
        body_paragraphs = inner.find_all('p', class_='articleBody_abstractText')
        body_text = "\n\n".join(node.get_text().strip() for node in body_paragraphs)

        if heading_text:
            return heading_text + "\n\n" + body_text
        return body_text
    except Exception as e:
        logger.error(f"提取ACS abstract时出错: {str(e)}")
        return None

def request_web(year_url: str) -> str:
    """
    Fetch a page with headless Chromium (Playwright) and return its HTML.

    The browser is launched through the proxy in ``proxies_playwright`` with a
    desktop-Chrome-like context (user agent, headers, locale, timezone,
    geolocation). After navigation, a OneTrust cookie banner is accepted when
    its button is visible.

    Side effects: on success a full-page screenshot is written to
    ``page2.png`` (overwritten on every call); if the post-navigation step
    fails, a screenshot named after the URL is written instead.

    Args:
        year_url: URL to request.

    Returns:
        str: The page HTML, or "" when loading or the request fails.
    """
    try:
        # Proxy settings passed to the Chromium launcher.
        # NOTE(review): "xxxx" looks like a placeholder that must be replaced
        # with a real proxy address — confirm.
        proxies_playwright = {
            "server": "xxxx",
        }
        # Browser-like request headers, applied to the page below.
        headers2 = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
            'sec-ch-ua-arch': '"arm"',
            'sec-ch-ua-bitness': '"64"',
            'sec-ch-ua-full-version': '"131.0.6778.265"',
            'sec-ch-ua-full-version-list': '"Google Chrome";v="131.0.6778.265", "Chromium";v="131.0.6778.265", "Not_A Brand";v="24.0.0.0"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-model': '""',
            'sec-ch-ua-platform': '"macOS"',
            'sec-ch-ua-platform-version': '"14.4.0"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        }
        
        logger.info(f"开始请求网页: {year_url}")
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, proxy=proxies_playwright)
            context = browser.new_context(
                # Basic configuration
                viewport={'width': 1920, 'height': 1080},
                accept_downloads=True,
                # User agent
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
                
                # Extra HTTP headers at the context level
                extra_http_headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    'Sec-Fetch-Dest': 'document',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-Site': 'none',
                    'Sec-Fetch-User': '?1'
                },
                
                # Locale, timezone and geolocation settings
                locale='en-US',
                timezone_id='America/New_York',
                geolocation={'latitude': 40.7128, 'longitude': -74.0060},
                permissions=['geolocation']
            )
            page = context.new_page()
            # NOTE(review): headers are set both on the context (above) and on
            # the page (here), with differing values for some names (e.g.
            # Accept-Language, Sec-Fetch-Site) — confirm which one wins.
            page.set_extra_http_headers(headers2)
            
            # Navigate and wait until the network is idle (60 s limit).
            # This call sits outside the inner try: if it raises, the outer
            # except logs the error and returns "", and browser.close() below
            # is not reached.
            logger.info(f"导航到URL: {year_url}")
            page.goto(year_url, wait_until='networkidle', timeout=60000)
            
            # Post-navigation handling: cookie banner, content capture, screenshot.
            try:
                # Accept a cookie banner if one is shown.
                try:
                    cookie_button = page.locator('#onetrust-accept-btn-handler')
                    # NOTE(review): the Playwright docs describe the timeout
                    # argument of is_visible() as ignored — confirm whether a
                    # real wait is intended here.
                    if cookie_button.is_visible(timeout=5000):
                        logger.info("点击Cookie接受按钮")
                        cookie_button.click()
                except Exception as cookie_e:
                    # Best effort: a missing banner or click failure is only
                    # logged at debug level.
                    logger.debug(f"处理Cookie提示时出错或没有Cookie提示: {str(cookie_e)}")
                
                page_content = page.content()
                # Screenshot of the loaded page; the fixed file name means it
                # is overwritten by each call.
                page.screenshot(path="page2.png", full_page=True)
                logger.info(f"成功获取页面内容: {year_url}")
            except Exception as e:
                logger.error(f"加载页面时出错: {str(e)}")
                # Keep a screenshot for diagnosis; the file name is derived
                # from the URL by replacing "://" and "/" with "_".
                page.screenshot(path=f"page_{year_url.replace('://', '_').replace('/', '_')}.png", full_page=True)
                page_content = ""
                
            browser.close()
        return page_content
    except Exception as e:
        logger.error(f"请求网页时出错: {str(e)}")
        return ""

def split_text_into_three_paragraphs(text):
    """
    Split text into three paragraphs of roughly equal length.

    Breaks are only placed at sentence ends (a period followed by whitespace
    or the end of the text). The first break is the sentence end closest to
    one third of the text length, the second the one closest to two thirds.

    Args:
        text: Text to split.

    Returns:
        list: Three non-empty paragraphs, or ``[text]`` when the text has
        fewer than three sentence ends or an error occurs.
    """
    try:
        # Positions just after every sentence-ending period.
        sentence_endings = [m.end() for m in re.finditer(r'\.(?=\s|$)', text)]

        if len(sentence_endings) < 3:
            logger.warning(f"文本中句子数量少于3 ({len(sentence_endings)}个句子)，返回原文本")
            return [text]  # Too few sentences to split.

        # A break is only usable if some text remains after it; this drops a
        # sentence end sitting at the very end of the text. With three or more
        # sentence ends, at least two usable breaks always remain.
        content_end = len(text.rstrip())
        break_candidates = [end for end in sentence_endings if end < content_end]

        # Ideal paragraph length: one third of the total.
        ideal_paragraph_length = len(text) // 3

        # The first break must leave a later candidate for the second break,
        # so the last candidate is excluded. Otherwise the second min() could
        # run on an empty list, or the third paragraph could come out empty.
        first_break_idx = min(
            break_candidates[:-1],
            key=lambda x: abs(x - ideal_paragraph_length)
        )

        # Second break: the later candidate closest to two thirds.
        second_break_idx = min(
            (x for x in break_candidates if x > first_break_idx),
            key=lambda x: abs(x - (2 * ideal_paragraph_length))
        )

        return [
            text[:first_break_idx].strip(),
            text[first_break_idx:second_break_idx].strip(),
            text[second_break_idx:].strip(),
        ]
    except Exception as e:
        logger.error(f"分割文本时出错: {str(e)}")
        return [text]

def read_text_file(file_path):
    """
    Read a UTF-8 text file and return its non-blank lines.

    Args:
        file_path: Path of the text file.

    Returns:
        list: The lines of the file with surrounding whitespace stripped and
        blank lines removed; an empty list if the file cannot be read.
    """
    try:
        kept_lines = []
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                cleaned = raw_line.strip()
                # Skip lines that are empty once stripped.
                if cleaned:
                    kept_lines.append(cleaned)
        return kept_lines
    except Exception as e:
        logger.error(f"读取文件时出错 {file_path}: {str(e)}")
        return []
    
def get_distraction_abstract(url, journal):
    """
    Fetch an article page and extract its abstract for the given journal.

    Nature pages are fetched with ``requests``; Cell, Science and ACS pages
    are rendered through ``request_web`` (Playwright). For Science, the
    rendered HTML is also dumped to ``debug_content.html``.

    Args:
        url: Article URL.
        journal: Journal type; one of 'Nature', 'Cell', 'Science', 'ACS'.

    Returns:
        str: The extracted abstract text, or None when the journal type is
        unsupported, no abstract is found, or an error occurs.
    """
    try:
        logger.info(f"开始获取 {journal} 文章摘要: {url}")
        proxies = {
            'http': 'xxxx',
            'https': 'xxxx',
        }
        
        kv = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
        }
        
        if journal == 'Nature':
            # Without a timeout requests can block forever on a stalled
            # connection; 60 s matches the navigation timeout in request_web.
            response = requests.get(url, headers=kv, proxies=proxies, timeout=60)
            # Fail on HTTP errors so that an error page's meta description is
            # never mistaken for an abstract; the except below logs it.
            response.raise_for_status()
            response.encoding = 'utf-8'
            abstract = extract_abstract_from_Nature_html(response.text)
        elif journal == 'Cell':
            page_content = request_web(url)
            abstract = extract_abstract_from_Cell_html(page_content)
        elif journal == 'Science':
            page_content = request_web(url)
            # Debug dump of the rendered page. An explicit encoding avoids
            # UnicodeEncodeError where the platform default is not UTF-8.
            with open('debug_content.html', 'w', encoding='utf-8') as file:
                file.write(page_content)
            abstract = extract_abstract_from_Science_html(page_content)
        elif journal == 'ACS':
            page_content = request_web(url)
            abstract = extract_abstract_from_ACS_html(page_content)
        else:
            logger.error(f"不支持的期刊类型: {journal}")
            return None
        
        if abstract:
            logger.info(f"成功获取摘要: {url}")
        else:
            logger.warning(f"未能获取摘要: {url}")
            
        return abstract
    except Exception as e:
        logger.error(f"获取摘要时出错 {url}: {str(e)}")
        return None

def crawl_all_abstracts(base_path, journal_type='Nature', output_dir=None):
    """
    Crawl the abstract for every .txt file under ``base_path/Article``.

    Each .txt file is expected to hold an article URL on its first non-blank
    line. The abstract is written to a file with the same name and relative
    location under the output directory. Files whose output already exists
    are skipped, so an interrupted run can be resumed.

    Args:
        base_path: Base directory containing the ``Article`` folder.
        journal_type: Journal type passed to ``get_distraction_abstract``.
        output_dir: Output directory; defaults to ``base_path/Description``.

    Returns:
        dict: Maps each processed file's path (relative to the Article
        folder) to ``{"url": ..., "abstract": ...}`` for every abstract
        fetched in this run. Skipped and failed files are not included. The
        same data is saved as ``all_abstracts.json`` in the output directory.
        An empty dict is returned if the Article folder is missing or an
        unexpected error occurs.
    """
    try:
        logger.info(f"开始爬取 {journal_type} 文章摘要，基础路径: {base_path}")
        
        # Default output directory: the "Description" folder under base_path.
        if output_dir is None:
            output_dir = os.path.join(base_path, "Description")
        
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"输出目录: {output_dir}")
        
        article_path = os.path.join(base_path, "Article")
        if not os.path.exists(article_path):
            logger.error(f"错误: Article文件夹不存在于 {base_path}")
            return {}
        
        results = {}
        success_count = 0
        failure_count = 0
        
        # Each sub-directory of Article is treated as one journal.
        for journal_name in os.listdir(article_path):
            journal_path = os.path.join(article_path, journal_name)
            
            # Skip plain files.
            if not os.path.isdir(journal_path):
                continue
            
            logger.info(f"处理期刊: {journal_name}")
            
            # Mirror the journal directory under the output directory.
            journal_output_dir = os.path.join(output_dir, journal_name)
            os.makedirs(journal_output_dir, exist_ok=True)
            
            for root, _dirs, files in os.walk(journal_path):
                txt_files = [f for f in files if f.endswith('.txt')]
                
                if not txt_files:
                    continue
                    
                # Mirror the sub-directory structure in the output tree.
                rel_dir = os.path.relpath(root, journal_path)
                if rel_dir != '.':
                    current_output_dir = os.path.join(journal_output_dir, rel_dir)
                    os.makedirs(current_output_dir, exist_ok=True)
                else:
                    current_output_dir = journal_output_dir
                
                for txt_file in tqdm(txt_files, desc=f"处理 {os.path.relpath(root, article_path)} 中的文件"):
                    try:
                        file_path = os.path.join(root, txt_file)
                        relative_path = os.path.relpath(file_path, article_path)
                        output_file = os.path.join(current_output_dir, txt_file)
                        # Resume support: never re-fetch an existing output.
                        if os.path.exists(output_file):
                            logger.info(f"已存在文件: {output_file}")
                            continue
                        
                        lines = read_text_file(file_path)
                        
                        if not lines:
                            logger.warning(f"未找到URL: {relative_path}")
                            failure_count += 1
                            continue
                        
                        # Only the first non-blank line is used as the URL.
                        url = lines[0]
                        
                        abstract = get_distraction_abstract(url, journal_type)
                        
                        if abstract:
                            with open(output_file, 'w', encoding='utf-8') as f:
                                f.write(abstract)
                            
                            # Record the result so the returned dict and the
                            # JSON summary actually contain the crawled data.
                            results[relative_path] = {"url": url, "abstract": abstract}
                            success_count += 1
                        else:
                            logger.warning(f"获取abstract失败: {url}")
                            failure_count += 1
                    except Exception as e:
                        # One bad file must not abort the whole crawl.
                        logger.error(f"处理文件时出错 {txt_file}: {str(e)}")
                        failure_count += 1
        
        # Save this run's results to one JSON file for later processing.
        all_results_file = os.path.join(output_dir, "all_abstracts.json")
        with open(all_results_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        
        logger.info(f"处理完成，成功: {success_count}，失败: {failure_count}，结果保存在 {output_dir}")
        return results
    except Exception as e:
        logger.error(f"爬取摘要时出错: {str(e)}")
        return {}

if __name__ == "__main__":
    # Script entry point: crawl abstracts for the articles listed under
    # ./CNS_cover/Nature/Article, using the 'Nature' journal type.
    try:
        logger.info("开始执行脚本")
        crawl_all_abstracts("./CNS_cover/Nature", "Nature")
        logger.info("脚本执行完成")
    except Exception as e:
        # Last-resort handler: log the failure instead of showing a traceback.
        logger.error(f"脚本执行出错: {str(e)}")