"""
Generate lists of blob_ids to fetch for each bucket.
Creates blob_list_bucket_{bucket}.txt files for use with s5cmd.
"""

from datasets import load_from_disk
from pathlib import Path
from tqdm import tqdm

# Configuration
# Directory that holds the per-bucket datasets; collect_blob_ids looks for
# "stack-v2-smol-ids-bucket-NN" sub-directories here.
CACHE_DIR = Path.home() / '.cache' / 'huggingface' / 'datasets'
# Bucket numbers for which a blob list file is generated.
BUCKETS_TO_PROCESS = [7,8]
# Cap on the number of repos read from each bucket (the first N are used).
MAX_REPOS_PER_BUCKET = None  # None = process all repos

# For debugging: uncomment to limit each bucket to a small number of repos
# MAX_REPOS_PER_BUCKET = 100


def collect_blob_ids(bucket_id: int):
    """
    Gather the blob_ids of the Python files recorded in one bucket.

    The bucket's dataset is loaded from CACHE_DIR and, when
    MAX_REPOS_PER_BUCKET is set, truncated to its first repos. A file entry
    is kept when its language is 'Python' and it is flagged neither as
    vendor nor as generated.

    Returns the set of unique blob_ids, or None (after printing an error)
    when the dataset directory does not exist.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Bucket {bucket_id}")
    print(f"{rule}")

    # Nothing to do when the dataset directory is missing
    dataset_path = CACHE_DIR / f"stack-v2-smol-ids-bucket-{bucket_id:02d}"
    if not dataset_path.exists():
        print(f"  ERROR: Dataset not found at {dataset_path}")
        return None

    ds = load_from_disk(str(dataset_path))
    total_repos = len(ds)

    # Full run vs. debug run limited to the first N repos
    if MAX_REPOS_PER_BUCKET is None:
        repos_to_process = total_repos
        print(f"  Total repos: {repos_to_process:,}")
    else:
        repos_to_process = min(MAX_REPOS_PER_BUCKET, total_repos)
        print(f"  DEBUG MODE: Processing {repos_to_process:,} of {total_repos:,} repos")
        ds = ds.select(range(repos_to_process))

    def is_wanted(entry):
        # Python source that is neither vendored nor generated
        if entry.get('language') != 'Python':
            return False
        return not (entry.get('is_vendor', False) or entry.get('is_generated', False))

    print("  Collecting blob_ids...")
    blob_ids = set()
    for repo in tqdm(ds, desc=f"  Bucket {bucket_id}", unit="repo"):
        blob_ids.update(
            entry['blob_id'] for entry in repo['files'] if is_wanted(entry)
        )

    print(f"  Unique blob_ids: {len(blob_ids):,}")
    return blob_ids


def _write_blob_list(output_path, blob_ids):
    """Write blob_ids to output_path, one per line, in sorted order."""
    # Explicit encoding keeps the output independent of the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{blob_id}\n" for blob_id in sorted(blob_ids))


def _print_next_steps(bucket_id):
    """Print an example s5cmd workflow for a bucket whose list was written."""
    blob_list = f"blob_list_bucket_{bucket_id}.txt"
    blob_dir = f"blobs_bucket_{bucket_id}"
    commands = f"s5cmd_commands_bucket_{bucket_id}.txt"

    print(f"Use s5cmd to fetch blobs. Example for bucket {bucket_id}:")
    print(f"  mkdir -p {blob_dir}")
    print(f"  cat {blob_list} | while read blob; do")
    # Each line of an `s5cmd run` file must be a full command, hence `cp`.
    print(f'    echo "cp s3://softwareheritage/content/$blob {blob_dir}/$blob"')
    print(f"  done > {commands}")
    print(f"  s5cmd --no-sign-request run {commands}")


def main():
    """
    Generate one blob list file per configured bucket and print a summary.

    For every bucket in BUCKETS_TO_PROCESS the unique blob_ids are collected
    and written, sorted and one per line, to blob_list_bucket_{bucket}.txt in
    the current working directory. Buckets for which collect_blob_ids returns
    None are skipped. Afterwards a per-bucket count table and example s5cmd
    commands (for the first bucket that was actually written) are printed.
    """
    rule = "=" * 60
    dashes = "-" * 60

    print(rule)
    print("Generate Blob ID Lists for S3 Fetching")
    print(rule)
    print(f"Buckets: {BUCKETS_TO_PROCESS}")
    if MAX_REPOS_PER_BUCKET is not None:
        print(f"DEBUG MODE: Max {MAX_REPOS_PER_BUCKET:,} repos per bucket")
    else:
        print("Processing ALL repos in each bucket")

    all_stats = []

    # Process each bucket
    for bucket_id in BUCKETS_TO_PROCESS:
        blob_ids = collect_blob_ids(bucket_id)

        # None means the bucket could not be collected; skip it.
        if blob_ids is None:
            continue

        output_path = Path(f"blob_list_bucket_{bucket_id}.txt")
        _write_blob_list(output_path, blob_ids)
        print(f"  Saved to: {output_path}")

        all_stats.append({
            'bucket_id': bucket_id,
            'blob_count': len(blob_ids)
        })

    # Summary
    print(f"\n{rule}")
    print("Summary:")
    print(rule)
    print(f"{'Bucket':<8} {'Unique Blobs':>15}")
    print(dashes)

    for stat in all_stats:
        print(f"{stat['bucket_id']:<8} {stat['blob_count']:>15,}")
    total_blobs = sum(stat['blob_count'] for stat in all_stats)

    print(dashes)
    print(f"{'Total':<8} {total_blobs:>15,}")

    print(f"\n{rule}")
    print("Next steps:")
    print(rule)
    if all_stats:
        # Use a bucket whose list file exists so the example can be run as-is.
        _print_next_steps(all_stats[0]['bucket_id'])
    else:
        print("No blob lists were generated; nothing to fetch.")

# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
