import pandas as pd
import argparse

def find_matching_records(small_csv: str, large_csv: str, output_csv: str):
    """Write the rows of the large CSV whose (journal, id) pair occurs in the small CSV.

    Both files are loaded with pandas and their ``id`` columns are compared as
    strings. The matching rows of the large file (with all of its columns) are
    written to ``output_csv``; key pairs of the small file that have no
    counterpart in the large file are listed on stdout.

    Args:
        small_csv: Path of the CSV that supplies the (journal, id) keys.
        large_csv: Path of the CSV to filter.
        output_csv: Path the matching rows are written to. Missing parent
            directories are created. Nothing is written when no row matches.

    Raises:
        KeyError: If either file lacks a ``journal`` or ``id`` column.
        Exception: Any other error raised while reading, merging or writing
            is printed and then re-raised unchanged.
    """
    import os  # local import: only needed to prepare the output directory

    key_columns = ['journal', 'id']

    try:
        print(f"正在读取小CSV文件: {small_csv}")
        small_df = pd.read_csv(small_csv)
        print(f"小CSV文件中有 {len(small_df)} 条记录")

        print(f"正在读取大CSV文件: {large_csv}")
        large_df = pd.read_csv(large_csv)
        print(f"大CSV文件中有 {len(large_df)} 条记录")

        # Fail early with a readable message instead of an opaque pandas
        # KeyError raised from deep inside the indexing/merge machinery.
        for label, frame in (("小CSV文件", small_df), ("大CSV文件", large_df)):
            missing = [col for col in key_columns if col not in frame.columns]
            if missing:
                raise KeyError(f"{label}缺少必需的列: {missing}")

        # Compare ids as strings so that an integer-typed column in one file
        # can match a text-typed column in the other.
        # NOTE(review): a column that pandas parsed as float (e.g. because of
        # blank cells) stringifies as "1.0" and will not match "1" -- confirm
        # the inputs never mix the two.
        large_df['id'] = large_df['id'].astype(str)

        # De-duplicate the keys: a key repeated in the small file would
        # otherwise repeat every matching large-file row in the output.
        small_keys = small_df[key_columns].astype({'id': str}).drop_duplicates()

        # Inner join keeps only the large-file rows whose key is requested.
        matched_records = pd.merge(
            large_df,
            small_keys,
            on=key_columns,
            how='inner',
        )

        if matched_records.empty:
            print("没有找到匹配的记录！")
            return

        # to_csv does not create directories, so make sure the target exists.
        output_dir = os.path.dirname(output_csv)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        matched_records.to_csv(output_csv, index=False)

        print("\n处理完成！")
        print(f"找到 {len(matched_records)} 条匹配记录")
        print(f"结果已保存到: {output_csv}")

        # Left join with an indicator column: 'left_only' marks the requested
        # keys that never appear in the large file.
        presence = pd.merge(
            small_keys,
            large_df[key_columns].drop_duplicates(),
            on=key_columns,
            how='left',
            indicator=True,
        )
        unmatched = presence[presence['_merge'] == 'left_only']

        if not unmatched.empty:
            print(f"\n警告：有 {len(unmatched)} 条记录在大CSV文件中未找到匹配：")
            for journal, record_id in zip(unmatched['journal'], unmatched['id']):
                print(f"journal: {journal}, id: {record_id}")

    except Exception as e:
        # Report for the console user, then let the caller see the failure.
        print(f"处理过程中出错: {e}")
        raise

if __name__ == "__main__":
    # Command-line entry point. Every path is optional; the defaults are the
    # paths this script was originally hard-wired to, so running it without
    # arguments behaves exactly as before.
    parser = argparse.ArgumentParser(
        description="从大CSV文件中找出与小CSV文件中journal和id匹配的记录"
    )
    parser.add_argument(
        "--small-csv",
        default="./CNS_cover/Data/Understanding/MAC_2025/image2text_given/full_dataset.csv",
        help="小CSV文件路径",
    )
    parser.add_argument(
        "--large-csv",
        default="./CNS_cover/Data/Understanding/Total_2026/image2text_given/full_dataset.csv",
        help="大CSV文件路径",
    )
    parser.add_argument(
        "--output-csv",
        default="./CNS_cover/Data/Understanding/MAC_2026/image2text_given/full_dataset.csv",
        help="输出CSV文件路径",
    )
    args = parser.parse_args()
    find_matching_records(args.small_csv, args.large_csv, args.output_csv)