#!/usr/bin/env python3
"""
Workflow Tools for Kaggle Competition Management
Specialized tools for reading, sorting, and extracting detailed competition information.
"""

import json
import sys
import os
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from dataclasses import dataclass


class WorkflowToolsError(Exception):
    """Root of the exception hierarchy used by WorkflowTools.

    Carries no extra state; it exists so callers can catch failures coming
    from this module with a single ``except WorkflowToolsError`` clause.
    """


@dataclass
class CompetitionInfo:
    """Structured, attribute-based view of one competition record.

    The first nine fields are required and positional; the remaining ones
    are optional metadata with neutral defaults. ``features`` and
    ``usage_scenarios`` accept ``None`` (the default) and are normalised to
    a fresh empty list in ``__post_init__``, so instances never share a
    list and readers can always iterate those attributes.
    """
    # Required fields
    title: str
    url: str
    name: str
    subtitle: str
    size: str
    tags: List[str]
    usability_rating: float
    detailed_description: str
    license_info: Dict[str, Any]

    # Additional metadata
    download_count: int = 0
    created_date: str = ""
    last_updated: str = ""
    owner: str = ""
    # None is a sentinel meaning "not provided"; see __post_init__.
    features: Optional[List[str]] = None
    usage_scenarios: Optional[List[str]] = None

    def __post_init__(self):
        """Swap the ``None`` sentinels for per-instance empty lists."""
        if self.features is None:
            self.features = []
        if self.usage_scenarios is None:
            self.usage_scenarios = []


class WorkflowTools:
    """Specialized tools for Kaggle competition workflow management."""
    
    def __init__(self, competitions_file: str = "datasets_info.json", work_directory: Optional[str] = None):
        """Initialize with competitions data file."""
        self.competitions_file = Path(competitions_file)
        self.work_directory = Path(work_directory) if work_directory else Path.cwd()
        self.competitions_data = self._load_competitions()
        self._sorted_cache = {}
    
    def _parse_size(self, size_str: str) -> float:
        """Convert a human-readable size such as ``"12.5 MB"`` to kilobytes.

        Recognised units are KB, MB and GB (matched case-insensitively,
        1 MB = 1024 KB). This is a best-effort parser: anything that is not
        a string, has no recognised unit, or whose numeric part cannot be
        converted yields ``0.0`` rather than raising.

        Args:
            size_str: Size text to parse.

        Returns:
            The size in KB, or ``0.0`` when it cannot be parsed.
        """
        if not isinstance(size_str, str):
            return 0.0

        # Probe order (MB, KB, GB) is kept from the original implementation.
        unit_factors = (
            ("MB", 1024.0),
            ("KB", 1.0),
            ("GB", 1024.0 * 1024.0),
        )

        text = size_str.strip().upper()
        for unit, factor in unit_factors:
            if unit in text:
                try:
                    return float(text.replace(unit, "").strip()) * factor
                except ValueError:
                    # Unit present but the remainder is not a number.
                    return 0.0
        return 0.0
    
    def _load_competitions(self) -> List[Dict]:
        """Load competitions data from JSON file."""
        try:
            with open(self.competitions_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                if not isinstance(data, list):
                    raise WorkflowToolsError("Competitions file must contain a list of competitions")
                
                # Parse size strings to floats for all competitions at loading time
                for comp in data:
                    size_str = comp.get("size", "0 KB")
                    comp["size_kb"] = self._parse_size(size_str)
                
                return data
        except FileNotFoundError:
            raise WorkflowToolsError(f"Competitions file not found: {self.competitions_file}")
        except json.JSONDecodeError as e:
            raise WorkflowToolsError(f"Invalid JSON in competitions file: {e}")
    
    def download_competition_data(self, competition_name: str, target_directory: Optional[str] = None) -> Dict[str, Any]:
        """
        Download a Kaggle dataset into a directory.

        The ``kaggle datasets download ... --unzip`` CLI is tried first. If
        the CLI exits with a non-zero status, or the ``kaggle`` executable
        cannot be found at all, the download is retried through the
        optional ``kagglehub`` package. A CLI timeout is reported as a
        failure without attempting the fallback.

        Args:
            competition_name: Name of the Kaggle competition/dataset
            target_directory: Target directory for download (optional);
                ``self.work_directory`` is used when omitted. The directory
                is created if it does not exist.

        Returns:
            Dict with download status and details. On success it contains
            ``success``, ``competition``, ``download_directory``, ``method``
            (``"kaggle_cli"`` or ``"kagglehub"``) and ``message``; on failure
            it contains ``success`` (False) and ``error``. This method does
            not raise; every exception is converted into a failure dict.
        """
        try:
            download_dir = Path(target_directory) if target_directory else self.work_directory
            download_dir.mkdir(parents=True, exist_ok=True)

            # First try kaggle CLI
            cmd = ["kaggle", "datasets", "download", competition_name, "-p", str(download_dir), "--unzip"]

            try:
                # Execute download with timeout
                result = subprocess.run(
                    cmd,
                    cwd=download_dir,
                    capture_output=True,
                    text=True,
                    timeout=3600
                )
            except FileNotFoundError:
                # subprocess raises this when the executable itself is
                # missing; treat it like any other CLI failure so the
                # kagglehub fallback still gets a chance.
                print("Kaggle CLI executable not found! Trying kagglehub...")
                return self._download_with_kagglehub(
                    competition_name, download_dir, "kaggle CLI executable not found"
                )

            if result.returncode == 0:
                print(f"Dataset downloaded and extracted successfully: {competition_name}")
                return {
                    "success": True,
                    "competition": competition_name,
                    "download_directory": str(download_dir),
                    "method": "kaggle_cli",
                    "message": "Dataset downloaded and extracted successfully"
                }

            print("Kaggle CLI download failed! Trying kagglehub...")
            print(f"Return code: {result.returncode}")
            print(f"STDOUT: {result.stdout}")
            print(f"STDERR: {result.stderr}")

            cli_error = (
                result.stderr.strip()
                if result.stderr
                else f"Command failed with return code {result.returncode}"
            )
            return self._download_with_kagglehub(competition_name, download_dir, cli_error)

        except subprocess.TimeoutExpired:
            error_msg = "Download timeout (3600s exceeded)"
            print(f"Error: {error_msg}")
            return {"success": False, "error": error_msg}
        except Exception as e:
            error_msg = f"Download error: {str(e)}"
            print(f"Error: {error_msg}")
            return {"success": False, "error": error_msg}

    def _download_with_kagglehub(self, competition_name: str, download_dir: Path, cli_error: str) -> Dict[str, Any]:
        """Fallback download via kagglehub; copies the result into ``download_dir``.

        Args:
            competition_name: Dataset handle passed to kagglehub.
            download_dir: Existing directory that should receive the files.
            cli_error: Description of why the CLI attempt failed, included
                in the combined error message if kagglehub fails as well.

        Returns:
            A success or failure dict in the same shape as
            ``download_competition_data``.
        """
        try:
            import shutil
            import kagglehub

            print(f"Using kagglehub to download: {competition_name}")

            # kagglehub's ``path`` argument selects a single file *inside*
            # the dataset, not an output directory, so download to the
            # kagglehub cache and copy the result to the requested location.
            cached_path = Path(kagglehub.dataset_download(competition_name))
            if cached_path.is_dir():
                # dirs_exist_ok requires Python 3.8+.
                shutil.copytree(cached_path, download_dir, dirs_exist_ok=True)
            else:
                shutil.copy2(cached_path, download_dir)

            print(f"Dataset downloaded successfully with kagglehub: {competition_name}")
            print(f"Downloaded to: {download_dir}")

            return {
                "success": True,
                "competition": competition_name,
                "download_directory": str(download_dir),
                "method": "kagglehub",
                "message": "Dataset downloaded successfully with kagglehub"
            }

        except ImportError:
            error_msg = "kagglehub not available and kaggle CLI failed"
            print(f"Error: {error_msg}")
            return {
                "success": False,
                "error": f"Download failed: {error_msg}. Install kagglehub: pip install kagglehub"
            }
        except Exception as kagglehub_error:
            error_msg = (
                f"Both kaggle CLI and kagglehub failed. CLI error: {cli_error}. "
                f"Kagglehub error: {str(kagglehub_error)}"
            )
            print(f"Error: {error_msg}")
            return {
                "success": False,
                "error": error_msg
            }
    
    def get_competition_names(self) -> Dict[str, Any]:
        """
        Summarise every loaded competition and compute a few aggregates.

        Returns:
            On success, a dict with ``total_competitions``, a
            ``competitions`` list of ``{name, title, usability_rating, size}``
            summaries (missing keys replaced by placeholder defaults), and
            ``statistics`` holding the summed ``size_kb``, the mean usability
            rounded to 3 decimals and the ``[min, max]`` usability range.
            Only int/float ratings contribute to the usability figures.
            On any exception, ``{"success": False, "error": ...}``.
        """
        try:
            summaries = []
            numeric_ratings = []
            size_total_kb = 0

            for entry in self.competitions_data:
                rating = entry.get("usability_rating", 0.0)
                summaries.append({
                    "name": entry.get("name", "Unknown"),
                    "title": entry.get("title", "No title"),
                    "usability_rating": rating,
                    "size": entry.get("size", "0 KB"),
                })

                # size_kb is precomputed when the data file is loaded.
                size_total_kb = size_total_kb + entry.get("size_kb", 0.0)

                if isinstance(rating, (int, float)):
                    numeric_ratings.append(rating)

            if numeric_ratings:
                mean_rating = sum(numeric_ratings) / len(numeric_ratings)
                rating_bounds = [min(numeric_ratings), max(numeric_ratings)]
            else:
                mean_rating = 0
                rating_bounds = [0, 0]

            statistics = {
                "total_size_kb": size_total_kb,
                "average_usability": round(mean_rating, 3),
                "usability_range": rating_bounds,
            }
            return {
                "success": True,
                "total_competitions": len(summaries),
                "competitions": summaries,
                "statistics": statistics,
            }

        except Exception as e:
            return {"success": False, "error": f"Error extracting competition names: {str(e)}"}
    
    def sort_competitions_by_usability(self, ascending: bool = False) -> Dict[str, Any]:
        """
        Sort competitions by usability rating.

        Competitions whose ``usability_rating`` is not an int/float are
        excluded from the result and counted in ``statistics.filtered_out``.
        Results are memoised per sort direction in ``self._sorted_cache``;
        the cached dict itself is returned on later calls, so callers should
        treat it as read-only.

        Args:
            ascending: Sort in ascending order if True, descending if False

        Returns:
            Dict with sorted competitions and metadata. ``best_usability`` is
            always the highest rating and ``worst_usability`` the lowest,
            regardless of sort direction (both 0 when nothing is rated).
            On any exception, ``{"success": False, "error": ...}``.
        """
        try:
            cache_key = f"usability_{ascending}"
            if cache_key in self._sorted_cache:
                return self._sorted_cache[cache_key]

            # Filter competitions with valid usability ratings
            rated = [
                comp for comp in self.competitions_data
                if isinstance(comp.get("usability_rating"), (int, float))
            ]

            ordered = sorted(
                rated,
                key=lambda comp: comp.get("usability_rating", 0),
                reverse=not ascending
            )

            # The highest rating sits at the front only when sorting
            # descending; pick the correct end for each direction.
            if ordered:
                top, bottom = (ordered[-1], ordered[0]) if ascending else (ordered[0], ordered[-1])
                best = top.get("usability_rating")
                worst = bottom.get("usability_rating")
            else:
                best = worst = 0

            result = {
                "success": True,
                "sort_order": "ascending" if ascending else "descending",
                "total_competitions": len(ordered),
                "competitions": ordered,
                "statistics": {
                    "best_usability": best,
                    "worst_usability": worst,
                    "filtered_out": len(self.competitions_data) - len(rated)
                }
            }

            # Cache the result
            self._sorted_cache[cache_key] = result
            return result

        except Exception as e:
            return {"success": False, "error": f"Error sorting competitions: {str(e)}"}
    
    def get_top_n_competitions(self, n: int = 10, sort_by: str = "usability",
                               ascending: bool = False) -> Dict[str, Any]:
        """
        Get top N competitions based on specified criteria.

        For ``sort_by="usability"`` the ordering comes from
        ``sort_competitions_by_usability`` (which drops entries without a
        numeric rating). For ``"download_count"`` and ``"size"`` every
        competition is kept; a missing or non-numeric value is ranked as 0
        so that one malformed record cannot make the whole query fail.

        Args:
            n: Number of competitions to return (must be > 0)
            sort_by: Sort criteria (usability, download_count, size)
            ascending: Sort order

        Returns:
            Dict with top N competitions plus ``requested_count``,
            ``returned_count``, ``sort_by`` and ``sort_order``; or
            ``{"success": False, "error": ...}`` for invalid arguments or
            unexpected errors.
        """
        try:
            if n <= 0:
                return {"success": False, "error": "N must be a positive integer"}

            # Maps the public sort name to the record key it reads.
            plain_sort_fields = {"download_count": "download_count", "size": "size_kb"}

            if sort_by == "usability":
                sorted_result = self.sort_competitions_by_usability(ascending)
                if not sorted_result["success"]:
                    return sorted_result
                competitions = sorted_result["competitions"]
            elif sort_by in plain_sort_fields:
                field_name = plain_sort_fields[sort_by]

                def sort_key(comp):
                    # Non-numeric values (None, strings, ...) cannot be
                    # compared with numbers; rank them as 0 instead.
                    value = comp.get(field_name, 0)
                    return value if isinstance(value, (int, float)) else 0

                competitions = sorted(
                    self.competitions_data,
                    key=sort_key,
                    reverse=not ascending
                )
            else:
                return {"success": False, "error": f"Invalid sort criteria: {sort_by}"}

            # Slicing already clamps to the list length.
            top_n = competitions[:n]

            return {
                "success": True,
                "requested_count": n,
                "returned_count": len(top_n),
                "sort_by": sort_by,
                "sort_order": "ascending" if ascending else "descending",
                "competitions": top_n
            }

        except Exception as e:
            return {"success": False, "error": f"Error getting top N competitions: {str(e)}"}
    
    def get_competition_by_index(self, index: int, sort_by: str = "usability",
                                 ascending: bool = False) -> Dict[str, Any]:
        """
        Return the competition found at a given position of a sorted listing.

        The listing is obtained from ``get_top_n_competitions`` with
        ``n = index + 1``; the entry at ``index`` is then converted with
        ``extract_competition_info``.

        Args:
            index: Zero-based position in the sorted listing
            sort_by: Criterion used to build the listing
            ascending: Sort direction

        Returns:
            On success, a dict with ``index``, ``sort_by``, ``sort_order``
            and ``competition`` (the extracted info object). Otherwise a
            ``{"success": False, "error": ...}`` dict: for a negative index,
            an index past the end, a failed listing, or any exception.
        """
        try:
            if index < 0:
                return {"success": False, "error": "Index must be non-negative"}

            # Asking for index + 1 entries is enough to reach position `index`.
            listing = self.get_top_n_competitions(
                n=index + 1, sort_by=sort_by, ascending=ascending
            )
            if not listing["success"]:
                return listing

            candidates = listing["competitions"]
            available = len(candidates)
            if available <= index:
                return {
                    "success": False,
                    "error": f"Index {index} out of range (max: {available - 1})"
                }

            direction = "ascending" if ascending else "descending"
            return {
                "success": True,
                "index": index,
                "sort_by": sort_by,
                "sort_order": direction,
                "competition": self.extract_competition_info(candidates[index])
            }

        except Exception as e:
            return {"success": False, "error": f"Error getting competition by index: {str(e)}"}
    
    def download_competition_by_rank(self, rank: int, sort_by: str = "usability", 
                                   ascending: bool = False, target_directory: Optional[str] = None) -> Dict[str, Any]:
        """
        Download competition dataset by its rank in sorted list.
        
        Args:
            rank: Zero-based rank/index of competition to download
            sort_by: Sort criteria (usability, download_count, size)
            ascending: Sort order (False for descending, True for ascending)
            target_directory: Target directory for download (optional)
            
        Returns:
            Dict with download status and competition details
        """
        try:
            # Get competition at specified rank
            competition_result = self.get_competition_by_index(rank, sort_by, ascending)
            
            if not competition_result["success"]:
                return {
                    "success": False,
                    "error": f"Failed to get competition at rank {rank}: {competition_result['error']}"
                }
            
            competition_info = competition_result["competition"]
            competition_name = competition_info.name
            
            # Prepare download directory
            if target_directory:
                download_dir = Path(target_directory)
            else:
                # Create a subdirectory named after the competition
                os.makedirs(target_directory, exist_ok=True)
                download_dir = Path(target_directory)
            
            # Download the competition data
            download_result = self.download_competition_data(competition_name, str(download_dir))
            
            # Combine results
            return {
                "success": download_result["success"],
                "rank": rank,
                "sort_by": sort_by,
                "sort_order": "ascending" if ascending else "descending",
                "competition_info": {
                    "name": competition_info.name,
                    "title": competition_info.title,
                    "usability_rating": competition_info.usability_rating,
                    "size": competition_info.size,
                    "url": competition_info.url
                },
                "download_result": download_result,
                "download_directory": download_result.get("download_directory", str(download_dir)) if download_result["success"] else None,
                "error": download_result.get("error") if not download_result["success"] else None
            }
            
        except Exception as e:
            return {"success": False, "error": f"Error downloading competition by rank: {str(e)}"}
    
    def extract_competition_info(self, competition: Dict[str, Any]) -> CompetitionInfo:
        """
        Build a CompetitionInfo from a raw competition dict.

        Every field is read with ``dict.get`` and a placeholder default, so
        missing keys never raise.

        Args:
            competition: Raw competition record

        Returns:
            A CompetitionInfo populated from the record. If construction
            raises, a reduced object is returned instead whose ``subtitle``
            and ``detailed_description`` carry the error text.
        """
        # (field name, value used when the key is absent)
        field_defaults = (
            ("title", "No title"),
            ("url", ""),
            ("name", "Unknown"),
            ("subtitle", "No subtitle"),
            ("size", "Unknown size"),
            ("tags", []),
            ("usability_rating", 0.0),
            ("detailed_description", "No description available"),
            ("license_info", {}),
            ("download_count", 0),
            ("created_date", ""),
            ("last_updated", ""),
            ("owner", "Unknown"),
            ("features", []),
            ("usage_scenarios", []),
        )
        try:
            values = {
                key: competition.get(key, fallback)
                for key, fallback in field_defaults
            }
            return CompetitionInfo(**values)
        except Exception as e:
            # Return minimal info if extraction fails
            reason = str(e)
            return CompetitionInfo(
                title=str(competition.get("title", "Error extracting title")),
                url=str(competition.get("url", "")),
                name=str(competition.get("name", "Error")),
                subtitle=f"Error extracting info: {reason}",
                size="Unknown",
                tags=[],
                usability_rating=0.0,
                detailed_description=f"Error extracting description: {reason}",
                license_info={}
            )
    
    def get_detailed_competition_info(self, competition_identifier: Union[str, int],
                                      sort_by: str = "usability", ascending: bool = False) -> Dict[str, Any]:
        """
        Get detailed information for a specific competition.

        An int identifier is treated as a zero-based index into the sorted
        listing (see ``get_competition_by_index``). A str identifier is
        matched case-insensitively against competition names: an exact
        match wins; otherwise the first competition whose name contains the
        identifier is used. Records whose ``name`` is not a string are
        skipped during the search.

        Args:
            competition_identifier: Competition name or index
            sort_by: Sort criteria if using index
            ascending: Sort order if using index

        Returns:
            On success, a dict with ``method`` (``"index"`` or
            ``"name_search"``), ``identifier`` and ``competition_info`` (the
            extracted info as a plain dict). Otherwise
            ``{"success": False, "error": ...}``.
        """
        try:
            if isinstance(competition_identifier, int):
                # Get by index
                result = self.get_competition_by_index(competition_identifier, sort_by, ascending)
                if not result["success"]:
                    return result
                return {
                    "success": True,
                    "method": "index",
                    "identifier": competition_identifier,
                    "competition_info": result["competition"].__dict__
                }

            if isinstance(competition_identifier, str):
                needle = competition_identifier.lower()
                exact_match = None
                first_partial = None

                for comp in self.competitions_data:
                    name = comp.get("name", "")
                    if not isinstance(name, str):
                        continue
                    lowered = name.lower()
                    if lowered == needle:
                        exact_match = comp
                        break
                    # Remember only the earliest substring hit, but keep
                    # scanning in case an exact match appears later.
                    if first_partial is None and needle in lowered:
                        first_partial = comp

                chosen = exact_match if exact_match is not None else first_partial
                if chosen is None:
                    return {
                        "success": False,
                        "error": f"Competition not found: {competition_identifier}"
                    }

                info = self.extract_competition_info(chosen)
                return {
                    "success": True,
                    "method": "name_search",
                    "identifier": competition_identifier,
                    "competition_info": info.__dict__
                }

            return {
                "success": False,
                "error": "Identifier must be string (name) or int (index)"
            }

        except Exception as e:
            return {"success": False, "error": f"Error getting detailed info: {str(e)}"}
    
    def search_competitions_by_tag(self, tag_keyword: str, ascending: bool = False,
                                   exact_match: bool = False) -> Dict[str, Any]:
        """
        Search competitions by tag keyword and sort by usability rating.

        Matching is case-insensitive. Competitions whose ``tags`` value is
        not a list are skipped, as are non-string tags. Each returned
        competition is a shallow copy of the stored record with an added
        ``matched_tags`` list; the stored records are not modified.

        Args:
            tag_keyword: Keyword to search in tags (e.g., "vision")
            ascending: Sort order for usability rating (False for descending, True for ascending)
            exact_match: If True, tag must match exactly; if False, tag must contain the keyword

        Returns:
            Dict with matching competitions sorted by usability rating.
            Matches without a numeric rating are appended after the sorted
            ones. ``statistics`` holds best/worst/average usability (None
            when no match is rated) and ``unique_matched_tags`` in sorted
            order. On any exception, ``{"success": False, "error": ...}``.
        """
        try:
            keyword = tag_keyword.lower()

            def tag_hits(tag_lower: str) -> bool:
                # Equality for exact mode, substring containment otherwise.
                return tag_lower == keyword if exact_match else keyword in tag_lower

            matched_competitions = []
            for comp in self.competitions_data:
                tags = comp.get("tags", [])
                if not isinstance(tags, list):
                    continue

                matched_tags = [
                    tag for tag in tags
                    if isinstance(tag, str) and tag_hits(tag.lower())
                ]
                if matched_tags:
                    # Copy so the match annotation never leaks into the
                    # shared competition records.
                    annotated = comp.copy()
                    annotated["matched_tags"] = matched_tags
                    matched_competitions.append(annotated)

            # Split matches by whether they can take part in the sort.
            rated, unrated = [], []
            for comp in matched_competitions:
                has_rating = isinstance(comp.get("usability_rating"), (int, float))
                (rated if has_rating else unrated).append(comp)

            rated_sorted = sorted(
                rated,
                key=lambda comp: comp.get("usability_rating", 0),
                reverse=not ascending
            )
            usability_ratings = [comp.get("usability_rating", 0) for comp in rated]

            if usability_ratings:
                best = max(usability_ratings)
                worst = min(usability_ratings)
                average = round(sum(usability_ratings) / len(usability_ratings), 3)
            else:
                best = worst = average = None

            # Sorted so the output is stable across runs (set iteration
            # order for strings varies with hash randomisation).
            unique_tags = sorted({
                tag
                for comp in matched_competitions
                for tag in comp.get("matched_tags", [])
            })

            return {
                "success": True,
                "tag_keyword": tag_keyword,
                "exact_match": exact_match,
                "sort_order": "ascending" if ascending else "descending",
                "total_matches": len(matched_competitions),
                "competitions_with_usability": len(rated),
                "competitions_without_usability": len(unrated),
                "competitions": rated_sorted + unrated,
                "statistics": {
                    "best_usability": best,
                    "worst_usability": worst,
                    "average_usability": average,
                    "unique_matched_tags": unique_tags
                }
            }

        except Exception as e:
            return {"success": False, "error": f"Error searching competitions by tag: {str(e)}"}
    
    def search_competitions(self, query: str, limit: int = 10,
                            search_fields: List[str] = None) -> Dict[str, Any]:
        """
        Search competitions by query string.

        The query is matched case-insensitively as a substring. A hit in a
        string field scores 10; a hit in any string item of a list field
        scores 5 (counted once per field). Results are ordered by total
        score, highest first; ties keep their original relative order.

        Args:
            query: Search query
            limit: Maximum results to return; negative values are treated
                as 0
            search_fields: Fields to search in (default: name, title, tags,
                subtitle)

        Returns:
            Dict with ``query``, ``total_matches``, ``returned_count``,
            ``results`` (the matching competition dicts) and a parallel
            ``match_details`` list; or ``{"success": False, "error": ...}``
            on any exception.
        """
        try:
            if search_fields is None:
                search_fields = ["name", "title", "tags", "subtitle"]

            # A negative limit would otherwise slice from the end of the
            # list and report a negative returned_count.
            limit = max(limit, 0)

            query_lower = query.lower()
            matches = []

            for comp in self.competitions_data:
                match_score = 0
                match_details = []

                for field in search_fields:
                    field_value = comp.get(field, "")

                    if isinstance(field_value, str):
                        if query_lower in field_value.lower():
                            match_score += 10
                            match_details.append(f"Found in {field}")
                    elif isinstance(field_value, list):
                        if any(
                            isinstance(item, str) and query_lower in item.lower()
                            for item in field_value
                        ):
                            match_score += 5
                            match_details.append(f"Found in {field}")

                if match_score > 0:
                    matches.append({
                        "competition": comp,
                        "match_score": match_score,
                        "match_details": match_details
                    })

            # Sort by match score
            matches.sort(key=lambda m: m["match_score"], reverse=True)
            selected = matches[:limit]

            return {
                "success": True,
                "query": query,
                "total_matches": len(matches),
                "returned_count": len(selected),
                "results": [m["competition"] for m in selected],
                "match_details": [m["match_details"] for m in selected]
            }

        except Exception as e:
            return {"success": False, "error": f"Search error: {str(e)}"}
    
    def get_statistics(self) -> Dict[str, Any]:
        """Aggregate usability, download, size and tag figures over all competitions.

        Returns:
            On success, a dict with ``total_competitions``,
            ``usability_stats`` (count/average/min/max over int/float
            ratings), ``download_stats`` (count/total/average/max over
            int/float download counts), ``size_stats`` (total/average/largest
            ``size_kb``) and ``top_tags`` (up to ten ``(tag, count)`` pairs,
            most frequent first). Empty inputs yield 0 for each figure.
            On any exception, ``{"success": False, "error": ...}``.
        """
        try:
            records = self.competitions_data

            def numeric_values(key):
                # Keep only values that are genuinely int/float for this key.
                return [
                    record.get(key, 0)
                    for record in records
                    if isinstance(record.get(key), (int, float))
                ]

            ratings = numeric_values("usability_rating")
            downloads = numeric_values("download_count")
            sizes_kb = [record.get("size_kb", 0.0) for record in records]

            # Tag frequency, keyed in first-seen order.
            tag_frequency = {}
            for record in records:
                for tag in record.get("tags", []):
                    if tag in tag_frequency:
                        tag_frequency[tag] += 1
                    else:
                        tag_frequency[tag] = 1

            ranked_tags = sorted(
                tag_frequency.items(), key=lambda pair: pair[1], reverse=True
            )

            usability_stats = {
                "count": len(ratings),
                "average": round(sum(ratings) / len(ratings), 3) if ratings else 0,
                "min": min(ratings) if ratings else 0,
                "max": max(ratings) if ratings else 0
            }
            download_stats = {
                "count": len(downloads),
                "total": sum(downloads),
                "average": round(sum(downloads) / len(downloads), 2) if downloads else 0,
                "max": max(downloads) if downloads else 0
            }
            size_stats = {
                "total_kb": round(sum(sizes_kb), 2),
                "average_kb": round(sum(sizes_kb) / len(sizes_kb), 2) if sizes_kb else 0,
                "largest_kb": max(sizes_kb) if sizes_kb else 0
            }

            return {
                "success": True,
                "total_competitions": len(records),
                "usability_stats": usability_stats,
                "download_stats": download_stats,
                "size_stats": size_stats,
                "top_tags": ranked_tags[:10]
            }

        except Exception as e:
            return {"success": False, "error": f"Statistics error: {str(e)}"}


# def main():
#     """Command line interface for WorkflowTools."""
#     if len(sys.argv) < 2:
#         print("Usage: python workflow_tools.py <action> [args...]")
#         print("Actions: names, sort, top, get, search, stats")
#         return
    
#     tools = WorkflowTools()
#     action = sys.argv[1]
    
#     if action == "names":
#         result = tools.get_competition_names()
#         print(json.dumps(result, indent=2))
    
#     elif action == "sort":
#         ascending = len(sys.argv) > 2 and sys.argv[2].lower() == "asc"
#         result = tools.sort_competitions_by_usability(ascending)
#         print(json.dumps({k: v for k, v in result.items() if k != "competitions"}, indent=2))
    
#     elif action == "top" and len(sys.argv) >= 3:
#         n = int(sys.argv[2])
#         sort_by = sys.argv[3] if len(sys.argv) > 3 else "usability"
#         result = tools.get_top_n_competitions(n, sort_by)
#         print(json.dumps({k: v for k, v in result.items() if k != "competitions"}, indent=2))
    
#     elif action == "get" and len(sys.argv) >= 3:
#         try:
#             identifier = int(sys.argv[2])
#         except ValueError:
#             identifier = sys.argv[2]
#         result = tools.get_detailed_competition_info(identifier)
#         print(json.dumps(result, indent=2))
    
#     elif action == "search" and len(sys.argv) >= 3:
#         query = sys.argv[2]
#         limit = int(sys.argv[3]) if len(sys.argv) > 3 else 10
#         result = tools.search_competitions(query, limit)
#         print(json.dumps({k: v for k, v in result.items() if k != "results"}, indent=2))
    
#     elif action == "stats":
#         result = tools.get_statistics()
#         print(json.dumps(result, indent=2))
    
#     else:
#         print("Invalid action or insufficient arguments")


# if __name__ == "__main__":
#     main()