"""
Kaggle 竞赛语料库自动抓取工具
从 Kaggle API 获取竞赛描述、讨论和 Notebook 解决方案
"""

import json
import os
from pathlib import Path
from typing import List, Dict, Optional
import time
from datetime import datetime

# The kaggle client is treated as an optional dependency: record whether the
# import worked so KaggleCorpusFetcher.__init__ and main() can report the
# problem themselves instead of this module failing at import time.
try:
    from kaggle.api.kaggle_api_extended import KaggleApi
    KAGGLE_AVAILABLE = True
except ImportError:
    KAGGLE_AVAILABLE = False
    print("警告: kaggle 包未安装。请运行: pip install kaggle")

# NOTE(review): this is not the stdlib `logging` module; presumably a
# project-local helper exposing init/info/warning/error/success, which are the
# calls made in this file -- confirm.
import logger

# Executed at import time, so merely importing this module sets up the logger.
logger.init("kaggle_corpus_fetcher")


class KaggleCorpusFetcher:
    """从 Kaggle API 自动获取竞赛语料库"""
    
    def __init__(self, output_path: str = "knowledgeBase/case_library.jsonl"):
        """Create an authenticated Kaggle client and prepare the output location.

        Args:
            output_path: Path of the JSONL file that fetched competitions are
                written to. Missing parent directories are created here.

        Raises:
            ImportError: If the ``kaggle`` package could not be imported.
        """
        if not KAGGLE_AVAILABLE:
            raise ImportError("需要安装 kaggle 包: pip install kaggle")

        # Authenticate first; the output directory is only touched afterwards.
        client = KaggleApi()
        self.api = client
        client.authenticate()

        destination = Path(output_path)
        self.output_path = destination
        destination.parent.mkdir(parents=True, exist_ok=True)
    
    def fetch_competition_details(self, competition_slug: str) -> Optional[Dict]:
        """
        获取单个竞赛的详细信息
        
        Args:
            competition_slug: 竞赛标识符（如 'titanic'）
        
        Returns:
            包含竞赛信息的字典
        """
        try:
            # 1. 获取竞赛基本信息（使用正确的 API 方法）
            competitions = self.api.competitions_list(search=competition_slug)
            comp = None
            for c in competitions:
                # ref 是完整URL，需要提取最后的 slug
                ref_slug = c.ref.split('/')[-1] if '/' in c.ref else c.ref
                if ref_slug == competition_slug:
                    comp = c
                    break
            
            if comp is None:
                logger.error(f"未找到竞赛: {competition_slug}")
                return None
            
            # 2. 获取竞赛描述
            description = comp.description if hasattr(comp, 'description') else ""
            
            # 3. 从 Notebooks 中提取解决方案
            solutions = self._extract_notebook_solutions(competition_slug)
            
            competition_data = {
                "title": comp.title if hasattr(comp, 'title') else competition_slug,
                "description": description[:1000] if description else f"{comp.title} 竞赛",
                "solutions": solutions,
                "tags": self._extract_tags(comp),
            }
            
            logger.info(f"成功获取竞赛 {competition_slug} 的详细信息")
            return competition_data
            
        except Exception as e:
            logger.error(f"获取竞赛 {competition_slug} 失败: {e}")
            return None
    
    def _extract_notebook_solutions(self, competition_slug: str, max_notebooks: int = 3) -> List[str]:
        """
        从竞赛的获奖 Notebooks 中提取解决方案摘要
        
        Args:
            competition_slug: 竞赛标识符
            max_notebooks: 最多提取的 Notebook 数量
        
        Returns:
            解决方案文本列表
        """
        solutions = []
        
        try:
            # 获取竞赛的热门 Kernels/Notebooks
            kernels = self.api.kernels_list(
                competition=competition_slug,
                sort_by='voteCount',
                page_size=max_notebooks * 2  # 多获取一些，因为有些可能无法下载
            )
            
            for kernel in kernels[:max_notebooks]:
                try:
                    # 提取 Notebook 摘要
                    solution_parts = []
                    solution_parts.append(f"**{kernel.title}** (得票: {kernel.totalVotes})")
                    
                    # 添加作者信息
                    if hasattr(kernel, 'author'):
                        solution_parts.append(f"作者: {kernel.author}")
                    
                    # 添加语言信息
                    if hasattr(kernel, 'language'):
                        solution_parts.append(f"语言: {kernel.language}")
                    
                    # 添加描述信息
                    if hasattr(kernel, 'description') and kernel.description:
                        desc = kernel.description.strip()
                        if desc and len(desc) > 20:  # 只保留有意义的描述
                            solution_parts.append(f"描述: {desc[:400]}")
                    
                    solutions.append("\n".join(solution_parts))
                    
                    # 避免频繁请求
                    time.sleep(0.3)
                    
                except Exception as e:
                    logger.warning(f"提取 Notebook {kernel.ref} 失败: {e}")
                    continue
            
            if not solutions:
                # 如果没有 Notebooks，尝试使用竞赛的 leaderboard 信息
                solutions.append(f"该竞赛暂无可用的 Notebook 解决方案，建议访问 Kaggle 网站查看讨论区和 Leaderboard")
                
        except Exception as e:
            logger.warning(f"获取 Notebooks 列表失败: {e}")
            solutions.append("暂无可用的解决方案描述")
        
        return solutions
    
    def _extract_tags(self, comp) -> List[str]:
        """从竞赛对象中提取标签"""
        tags = []
        
        # 添加竞赛类型标签
        if hasattr(comp, 'categories'):
            # 转换为字符串
            for cat in comp.categories:
                tags.append(str(cat))
        
        # 添加数据类型标签
        if hasattr(comp, 'tags'):
            # 转换为字符串
            for tag in comp.tags:
                if hasattr(tag, 'name'):
                    tags.append(tag.name)
                else:
                    tags.append(str(tag))
        
        return tags[:5]  # 最多5个标签
    
    
    def fetch_multiple_competitions(
        self,
        competition_slugs: List[str],
        append: bool = False
    ) -> int:
        """Fetch several competitions and write them to the JSONL output file.

        Each successfully fetched competition becomes one JSON line in
        ``self.output_path``. In overwrite mode the file is truncated as soon
        as it is opened, i.e. before the first fetch.

        Args:
            competition_slugs: Competition identifiers. A single string is
                treated as one slug rather than iterated per character.
            append: Append to the existing file (True) or overwrite it (False).

        Returns:
            Number of competitions fetched and written successfully.
        """
        if isinstance(competition_slugs, str):
            competition_slugs = [competition_slugs]
        else:
            # Materialise so any iterable works and len() is available below.
            competition_slugs = list(competition_slugs)

        mode = 'a' if append else 'w'
        success_count = 0
        last_index = len(competition_slugs) - 1

        with open(self.output_path, mode, encoding='utf-8') as f:
            for index, slug in enumerate(competition_slugs):
                logger.info(f"正在获取竞赛: {slug}")

                comp_data = self.fetch_competition_details(slug)
                if comp_data:
                    f.write(json.dumps(comp_data, ensure_ascii=False) + '\n')
                    # Flush per record so an interrupted run keeps its progress.
                    f.flush()
                    success_count += 1
                    logger.success(f"✓ {slug} 已保存")
                else:
                    logger.error(f"✗ {slug} 获取失败")

                # Throttle between competitions to stay clear of API rate
                # limits; pausing after the final one would only waste time.
                if index < last_index:
                    time.sleep(1)

        logger.success(f"成功获取 {success_count}/{len(competition_slugs)} 个竞赛的语料库")
        return success_count
    
    def fetch_popular_competitions(self, category: str = "featured", max_count: int = 10, append: bool = False) -> int:
        """
        自动获取热门竞赛
        
        Args:
            category: 竞赛类别（'featured', 'research', 'recruitment', 'gettingStarted', 'masters', 'playground'）
            max_count: 最多获取的竞赛数量
        
        Returns:
            成功获取的竞赛数量
        """
        try:
            # 获取竞赛列表
            # 注意: Kaggle API 通常只返回正在进行的竞赛
            competitions = self.api.competitions_list(
                category=category,
                sort_by='latestDeadline',
                page=1
            )
            
            # 选取前 N 个，提取 slug
            selected_slugs = []
            for comp in competitions[:max_count]:
                slug = comp.ref.split('/')[-1] if '/' in comp.ref else comp.ref
                selected_slugs.append(slug)
            
            logger.info(f"将获取 {len(selected_slugs)} 个 {category} 类别的竞赛")
            
            return self.fetch_multiple_competitions(selected_slugs, append=append)
            
        except Exception as e:
            logger.error(f"获取热门竞赛列表失败: {e}")
            return 0


def main():
    """Command-line tool: fetch a fixed list of classic Kaggle competitions.

    Prints the planned competitions and the output path, asks for
    confirmation, then overwrites the output file with whatever could be
    fetched. Returns early if the kaggle package is unavailable or the user
    does not answer 'y'.
    """
    banner = "=" * 60
    print(banner)
    print("Kaggle 竞赛语料库自动抓取工具")
    print(banner)

    if not KAGGLE_AVAILABLE:
        # Explain how to install the client and where the API token belongs.
        for hint in (
            "\n错误: 未安装 kaggle 包",
            "请运行: pip install kaggle",
            "并在 ~/.kaggle/kaggle.json 放置你的 API Token",
        ):
            print(hint)
        return

    fetcher = KaggleCorpusFetcher()

    # Curated classic competitions (the original author notes these as
    # currently available).
    classic_competitions = [
        "titanic",                    # binary classification - Titanic survival (always open)
        "house-prices-advanced-regression-techniques",  # regression (always open)
        "digit-recognizer",           # multi-class - MNIST handwritten digits (always open)
        "nlp-getting-started",        # NLP - disaster tweet classification (always open)
        "spaceship-titanic",          # binary classification - Spaceship Titanic (beginner friendly)
    ]

    print("\n将从以下竞赛中提取语料库:")
    for position, slug in enumerate(classic_competitions, start=1):
        print(f"  {position}. {slug}")

    print(f"\n输出路径: {fetcher.output_path.absolute()}")

    # Anything other than an explicit 'y' cancels the run.
    answer = input("\n是否继续? [y/N]: ").strip().lower()
    if answer != 'y':
        print("已取消")
        return

    print("\n开始获取竞赛语料库...")
    success_count = fetcher.fetch_multiple_competitions(classic_competitions, append=False)

    print(f"\n✓ 完成! 成功获取 {success_count} 个竞赛的语料库")
    print(f"✓ 语料库已保存至: {fetcher.output_path.absolute()}")
    print("\n提示: 现在可以运行 test_kaggle_retrieval.py 测试检索功能")

if __name__ == "__main__":
    main()
    # Disabled alternative to the interactive flow above: append up to six
    # "featured" competitions to the existing output file.
    # fetcher = KaggleCorpusFetcher()
    # success_count = fetcher.fetch_popular_competitions(category="featured", max_count=6, append=True)

