import os
import sys
import pandas as pd
import argparse
from tqdm import tqdm
import logging

# Configure logging: every record is sent to both a log file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # The messages logged by this script contain non-ASCII (Chinese) text.
        # Pin UTF-8 so the log file is written correctly regardless of the
        # platform's default locale encoding.
        logging.FileHandler("dataset_construction.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

def _ensure_parent_dir(file_path):
    """Create the parent directory of ``file_path`` if needed and return it."""
    parent = os.path.dirname(file_path)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    return parent


def construct_dataset(base_path, output_path, journal_output_path):
    """
    Build a cover-indexed dataset and write it out as CSV files.

    Every ``<volume>_<issue>.png`` file found under ``Cover/<journal>/``
    produces one record holding the journal, volume, issue and the relative
    paths of the matching story, article and other-articles files (an empty
    string when the corresponding file does not exist).

    Three CSV files are written:
      * ``output_path``: all records;
      * ``empty_records.csv`` next to ``output_path``: records whose cover
        story file is missing;
      * ``journal_output_path``: per-journal count of records that do have a
        cover story.

    Args:
        base_path: Root directory containing the Article, Cover, Story and
            Other_Articles folders.
        output_path: Path of the CSV file receiving the full dataset.
        journal_output_path: Path of the CSV file receiving the per-journal
            statistics.

    Returns:
        A tuple ``(df, empty_df, journal_stats)`` of DataFrames, or ``None``
        when the Cover directory does not exist (an error is logged).
    """
    # Sub-directories of the dataset root
    cover_path = os.path.join(base_path, 'Cover')
    story_path = os.path.join(base_path, 'Story')
    article_path = os.path.join(base_path, 'Article')
    other_articles_path = os.path.join(base_path, 'Other_Articles')

    # Without the Cover directory there is nothing to index
    if not os.path.isdir(cover_path):
        logger.error(f"Cover目录不存在: {cover_path}")
        return None

    # Fixed column order; also keeps the DataFrames well-formed (and the
    # column lookups below valid) when no record is collected at all.
    columns = [
        'Journal',
        'Volume',
        'Issue',
        'cover image path',
        'cover story path',
        'cover article path',
        'articles path',
    ]

    records = []
    empty_records = []

    # Each sub-directory of Cover is a journal; sort for reproducible output
    # because os.listdir returns entries in arbitrary order.
    journals = sorted(
        j for j in os.listdir(cover_path)
        if os.path.isdir(os.path.join(cover_path, j))
    )
    logger.info(f"找到 {len(journals)} 个期刊")

    for journal in tqdm(journals, desc="处理期刊"):
        journal_cover_path = os.path.join(cover_path, journal)

        # All cover images of this journal
        covers = sorted(c for c in os.listdir(journal_cover_path) if c.endswith('.png'))

        for cover in covers:
            # File names look like "<volume>_<issue>.png", e.g. "4_10.png"
            issue_id = cover.split('.')[0]
            try:
                volume, issue = issue_id.split('_')
            except ValueError:
                logger.warning(f"无法解析卷号和期号: {cover}")
                continue

            # Check which companion files exist for this issue
            has_story = os.path.exists(
                os.path.join(story_path, journal, f"{issue_id}.txt"))
            has_article = os.path.exists(
                os.path.join(article_path, journal, f"{issue_id}.txt"))
            has_other_articles = os.path.exists(
                os.path.join(other_articles_path, journal, f"{issue_id}.json"))

            record = {
                'Journal': journal,
                'Volume': volume,
                'Issue': issue,
                'cover image path': f"./Cover/{journal}/{issue_id}.png",
                'cover story path': f"./Story/{journal}/{issue_id}.txt" if has_story else "",
                'cover article path': f"./Article/{journal}/{issue_id}.txt" if has_article else "",
                'articles path': f"./Other_Articles/{journal}/{issue_id}.json" if has_other_articles else "",
            }
            records.append(record)

            # A record counts as incomplete only when its cover story is
            # missing; the other files are optional for this purpose.
            if not has_story:
                empty_records.append(record)

    df = pd.DataFrame(records, columns=columns)
    empty_df = pd.DataFrame(empty_records, columns=columns)

    # Make sure the output directory exists
    output_dir = _ensure_parent_dir(output_path)

    # Save the full dataset
    df.to_csv(output_path, index=False)
    logger.info(f"已保存完整数据集到 {output_path}，共 {len(df)} 条记录")

    # Save the records lacking a cover story next to the full dataset
    empty_output_path = os.path.join(output_dir, 'empty_records.csv')
    empty_df.to_csv(empty_output_path, index=False)
    logger.info(f"已保存空记录数据集到 {empty_output_path}，共 {len(empty_df)} 条记录")

    # Summary statistics
    logger.info(f"总记录数: {len(df)}")
    logger.info(f"完整记录数: {len(df) - len(empty_df)}")
    logger.info(f"不完整记录数: {len(empty_df)}")

    # Records that do have a cover story
    story_col = df['cover story path']
    complete_df = df[~story_col.isna() & (story_col != "")]

    # Number of complete records per journal
    journal_stats = (
        complete_df.groupby('Journal').size().reset_index(name='Complete_Records')
    )

    # The statistics file may live in a different directory than output_path
    _ensure_parent_dir(journal_output_path)
    journal_stats.to_csv(journal_output_path, index=False)

    logger.info("各期刊完整记录数:")
    for _, row in journal_stats.iterrows():
        logger.info(f"  {row['Journal']}: {row['Complete_Records']}")

    return df, empty_df, journal_stats

def main():
    """Parse the command-line arguments, build the dataset and print the journal statistics."""
    parser = argparse.ArgumentParser(description="Construct the dataset based on the cover")
    parser.add_argument('--base_path', type=str, required=True, help="the path of the dataset")
    parser.add_argument('--output', type=str, default="./Data/dataset.csv", help="the path of the output csv file")
    parser.add_argument('--journal_path', type=str, default="./Data/journal_statistics.csv", help="the path of the journal statistics csv file")

    args = parser.parse_args()

    # Build the dataset
    result = construct_dataset(args.base_path, args.output, args.journal_path)

    # construct_dataset returns None (after logging the error) when the Cover
    # directory is missing; unpacking None would raise a TypeError, so exit
    # with a failure status instead.
    if result is None:
        sys.exit(1)

    _, _, journal_stats = result
    print(journal_stats)


if __name__ == "__main__":
    main()