import os
import sys
import shutil
from pathlib import Path

def create_data_structure():
    """Build the per-dataset folder skeleton under ./MLE-Dojo/data/prepared.

    Reads ./mle-list.txt (resolved against the current working directory),
    which holds one ``A/B`` entry per line. For every usable entry, a single
    leading dash is stripped from ``B`` and the following is set up:

    * ``./MLE-Dojo/data/prepared/<B>/data/raw`` is created.
    * ``info/description.txt``, ``utils/metric.py`` and ``utils/prepare.py``
      are copied from ``./MLE-Dojo/mledojo/competitions/<B>/`` into
      ``./MLE-Dojo/data/prepared/<B>/data``; a missing source file only
      produces a warning.

    Blank lines are ignored. Lines that are not exactly ``A/B``, or whose
    ``B`` is empty, ``.`` or ``..`` after dash stripping, are skipped with a
    warning instead of creating directories at (or outside) the prepared root.

    Raises:
        FileNotFoundError: if ./mle-list.txt does not exist.
    """
    prepared_root = './MLE-Dojo/data/prepared'
    competitions_root = './MLE-Dojo/mledojo/competitions'

    # (sub-folder of the competition directory, file name) pairs that are
    # copied into each dataset's data/ directory.
    files_to_copy = (
        ('info', 'description.txt'),
        ('utils', 'metric.py'),
        ('utils', 'prepare.py'),
    )

    with open('./mle-list.txt', 'r') as f:
        lines = f.readlines()

    # Make sure the prepared root exists even when the list has no usable entry.
    os.makedirs(prepared_root, exist_ok=True)

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Entries must be exactly "A/B".
        parts = line.split('/')
        if len(parts) != 2:
            print(f"Warning: skipping malformed entry (expected A/B): {line}")
            continue

        B = parts[1]

        # Remove a single leading dash from B if present.
        if B.startswith('-'):
            B = B[1:]

        # os.path.join(root, '') is the root itself, and '.'/'..' point at the
        # root or its parent, so these names would scatter data/raw folders in
        # the wrong place rather than create a dataset directory.
        if B in ('', '.', '..'):
            print(f"Warning: skipping entry with unusable dataset name: {line}")
            continue

        data_dir = os.path.join(prepared_root, B, 'data')
        # Creating data/raw also creates <B>/ and <B>/data/ on the way.
        os.makedirs(os.path.join(data_dir, 'raw'), exist_ok=True)

        source_dir = os.path.join(competitions_root, B)
        for subfolder, filename in files_to_copy:
            src = os.path.join(source_dir, subfolder, filename)
            if os.path.exists(src):
                shutil.copy2(src, data_dir)
            else:
                print(f"Warning: {filename} not found for {B}")

        print(f"Created structure for {B}")


def download_datasets():
    """Download every dataset listed in mle-list.txt with the Kaggle CLI.

    Each non-blank line of ./mle-list.txt is treated as an ``A/B`` task. The
    full ``A/B`` string is passed to ``kaggle datasets download``; ``B`` (minus
    one leading dash) names the folder under ./MLE-Dojo/data/prepared whose
    ``data/raw`` sub-directory receives the unzipped files. The dataset folder
    itself must already exist, otherwise the task is counted as failed.

    Nothing is returned and nothing is raised for per-task problems: each
    outcome is printed, and a summary with success/failure counts is printed
    at the end. The function returns early (after a message) when the task
    list or the prepared base directory is missing.
    """
    import subprocess
    from pathlib import Path

    # Both locations are resolved against the current working directory.
    list_file = Path("./mle-list.txt").resolve()
    prepared_root = Path("./MLE-Dojo/data/prepared").resolve()

    if not list_file.exists():
        print(f"Task list file not found: {list_file}")
        return

    if not prepared_root.exists():
        print(f"Data raw base directory not found: {prepared_root}")
        return

    # Keep only the non-blank entries, stripped of surrounding whitespace.
    with open(list_file, 'r') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        tasks = [entry for entry in stripped_lines if entry]

    ok_count = 0
    fail_count = 0

    for task in tasks:
        try:
            # Everything after the first '/' is the folder part (B).
            _, slash, folder_name = task.partition('/')
            if not slash:
                print(f"Invalid task format (expected A/B): {task}")
                fail_count += 1
                continue

            # A single leading dash is not part of the folder name on disk.
            if folder_name.startswith('-'):
                folder_name = folder_name[1:]

            target_root = prepared_root / folder_name
            download_dir = (target_root / 'data' / 'raw').resolve()
            dest_folder = target_root.resolve()

            # The dataset folder must have been set up beforehand.
            if not dest_folder.exists():
                print(f"Warning: Destination folder not found: {dest_folder}")
                fail_count += 1
                continue

            download_dir.mkdir(parents=True, exist_ok=True)

            print(f"Downloading {task} to {download_dir}")

            kaggle_cmd = [
                "kaggle", "datasets", "download", task,
                "-p", str(download_dir),
                "--unzip",
            ]

            # One hour cap per dataset; output is captured so stderr can be
            # reported on failure.
            completed = subprocess.run(
                kaggle_cmd,
                cwd=str(download_dir),
                capture_output=True,
                text=True,
                timeout=3600,
            )

            if completed.returncode != 0:
                print(f"Failed to download {task}: {completed.stderr}")
                fail_count += 1
            else:
                print(f"Successfully downloaded {task}")
                ok_count += 1

        except subprocess.TimeoutExpired:
            print(f"Timeout downloading {task}")
            fail_count += 1
        except Exception as exc:
            print(f"Error downloading {task}: {str(exc)}")
            fail_count += 1

    print("\nDownload summary:")
    print(f"  Total tasks: {len(tasks)}")
    print(f"  Successful downloads: {ok_count}")
    print(f"  Failed downloads: {fail_count}")



def _load_prepare_module(module_name, prepare_py_path):
    """Import a prepare.py file from an explicit path under a unique name.

    Returns the executed module, or None when no import spec (or loader) can
    be created for the file. The module is registered in ``sys.modules``
    under ``module_name`` before it is executed, following the importlib
    recipe for importing a source file directly; the caller is responsible
    for removing that entry afterwards.
    """
    import importlib.util

    spec = importlib.util.spec_from_file_location(module_name, prepare_py_path)
    if spec is None or spec.loader is None:
        return None

    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module


def run_prepare_functions(base_dir="./MLE-Dojo/data/prepared"):
    """Run ``prepare(raw, public, private)`` for every dataset under base_dir.

    For each sub-directory ``<name>`` of ``base_dir`` (processed in sorted
    order) the function looks for ``<name>/data/prepare.py`` and
    ``<name>/data/raw``. When both exist it imports prepare.py, removes any
    existing ``public/`` and ``private/`` directories next to it, calls
    ``prepare`` with the absolute raw/public/private paths (as
    ``pathlib.Path`` objects), and finally copies ``data/description.txt``
    into ``public/`` when that file exists.

    Datasets lacking prepare.py, a ``raw/`` folder, or a ``prepare``
    attribute are skipped. The ``raw/`` check happens before anything is
    deleted, so previously generated ``public/``/``private/`` outputs are
    never removed for a dataset that cannot be re-prepared.

    While a prepare.py runs, its directory is the current working directory
    and is on ``sys.path``; both are restored afterwards, and the temporary
    module entry is removed from ``sys.modules``.

    Args:
        base_dir: Directory holding one sub-directory per dataset.

    Returns:
        None. Progress, errors (with traceback) and a summary are printed.
    """
    import traceback

    base_dir = os.path.abspath(base_dir)

    if not os.path.exists(base_dir):
        print(f"Base directory not found: {base_dir}")
        return

    # Sorted so that runs (and their logs) are reproducible across platforms.
    folder_names = sorted(
        name for name in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, name))
    )

    successful_runs = 0
    failed_runs = 0
    skipped_runs = 0

    for folder_name in folder_names:
        dataset_dir = os.path.join(base_dir, folder_name, "data")
        prepare_py_path = os.path.join(dataset_dir, "prepare.py")

        print(f"\nProcessing: {folder_name}")
        print(f"  Looking for prepare.py at: {prepare_py_path}")

        if not os.path.exists(prepare_py_path):
            print("  No prepare.py found, skipping...")
            skipped_runs += 1
            continue

        # dataset_dir is absolute, so these do not depend on the working dir.
        raw_path = (Path(dataset_dir) / "raw").resolve()
        public_path = (Path(dataset_dir) / "public").resolve()
        private_path = (Path(dataset_dir) / "private").resolve()

        # Validate the input before touching any existing output: without
        # raw/ there is nothing to regenerate public/ and private/ from.
        if not raw_path.exists():
            print("  No raw/ folder found, skipping...")
            skipped_runs += 1
            continue

        # Unique module name per dataset to avoid import cache collisions.
        module_name = f"prepare_module_{folder_name.replace('-', '_')}"

        original_cwd = os.getcwd()
        original_syspath = sys.path.copy()

        try:
            # Let prepare.py import sibling files and use relative paths.
            if dataset_dir not in sys.path:
                sys.path.insert(0, dataset_dir)
            os.chdir(dataset_dir)

            # Drop any stale module left under the same name.
            sys.modules.pop(module_name, None)

            prepare_module = _load_prepare_module(module_name, prepare_py_path)
            if prepare_module is None:
                print(f"  Failed to create module spec for {folder_name}")
                failed_runs += 1
                continue

            if not hasattr(prepare_module, 'prepare'):
                print("  No prepare function found in prepare.py")
                skipped_runs += 1
                continue

            # Start from a clean slate so stale outputs cannot survive a rerun.
            if public_path.exists():
                shutil.rmtree(public_path)
                print("  Deleted existing public/ directory")

            if private_path.exists():
                shutil.rmtree(private_path)
                print("  Deleted existing private/ directory")

            print("  Running prepare function...")
            print(f"    Raw path: {raw_path}")
            print(f"    Public path: {public_path}")
            print(f"    Private path: {private_path}")

            prepare_module.prepare(raw_path, public_path, private_path)

            # Copy description.txt next to the public data when available.
            description_src = os.path.join(dataset_dir, "description.txt")
            if os.path.exists(description_src):
                description_dst = os.path.join(public_path, "description.txt")
                shutil.copy2(description_src, description_dst)
                print("  Copied description.txt to public/")
            else:
                print("  No description.txt found in dataset directory")

            print(f"  ✓ Successfully completed prepare for: {folder_name}")
            successful_runs += 1

        except Exception as e:
            # A failing dataset must not abort the remaining ones.
            print(f"  ✗ Error running prepare for {folder_name}: {e}")
            traceback.print_exc()
            failed_runs += 1

        finally:
            os.chdir(original_cwd)
            # Restore in place so existing references to sys.path stay valid.
            sys.path[:] = original_syspath
            sys.modules.pop(module_name, None)

    print(f"\n{'='*50}")
    print("Summary:")
    print(f"  Total folders processed: {len(folder_names)}")
    print(f"  Successful prepare runs: {successful_runs}")
    print(f"  Failed prepare runs: {failed_runs}")
    print(f"  Skipped folders: {skipped_runs}")
    print(f"{'='*50}")


if __name__ == "__main__":
    # The steps run strictly in this order: the folder skeleton is created
    # first, the download step then fills each data/raw folder (it refuses to
    # download into a dataset folder that does not exist), and the prepare
    # step finally consumes data/raw.
    for pipeline_step in (
        create_data_structure,
        download_datasets,
        run_prepare_functions,
    ):
        pipeline_step()
