"""
BMD-HS Dataset Preparation Script

This module provides utilities for processing the BMD-HS
(BUET Multi-disease Heart Sound) dataset.

Dataset Information:
--------------------
Repository: https://github.com/sani002/BMD-HS-Dataset
Description: Heart sound recordings from patients with various valvular heart diseases

The dataset contains PCG (Phonocardiogram) recordings with the following labels:
- AS: Aortic Stenosis
- AR: Aortic Regurgitation
- MR: Mitral Regurgitation
- MS: Mitral Stenosis
- N: Normal

Usage:
------
Step 1: Download the dataset manually from GitHub
    Visit: https://github.com/sani002/BMD-HS-Dataset
    Download all .wav files and place them directly in: <download_dir>/

Step 2: Process and organize files using file_link_table.csv
    python buet.py --download_dir /path/to/data/bmdhs --processed_dir /path/to/data/bmdhs_processed

Complete Example:
-----------------
    # Set your data directory
    DATA_DIR="/path/to/your/data"

    # 1. Manually download .wav files from GitHub:
    #    https://github.com/sani002/BMD-HS-Dataset
    #    Place all .wav files directly in: ${DATA_DIR}/bmdhs/

    # 2. Process and organize files
    python src/prep/dataset/buet.py \\
        --download_dir ${DATA_DIR}/bmdhs \\
        --processed_dir ${DATA_DIR}/bmdhs_processed

Output Structure:
-----------------
After processing, your data directory will look like:

    bmdhs/
    ├── MD_001_sup_Mit.wav
    ├── MD_001_sup_Tri.wav
    └── ...

    bmdhs_processed/
    ├── 00001.wav
    ├── 00002.wav
    ├── ...
    └── metadata.csv

The metadata.csv file contains:
- Filename mapping (original -> renamed)
- Patient information (ID, age, gender, etc.)
- Disease labels (AS, AR, MR, MS, N)
- Train/validation/test splits (split_0 through split_4 for 5-fold CV)

Notes:
------
- Dataset must be downloaded manually from GitHub
- Total dataset size: Several GB
- Processing time: A few minutes depending on system performance
"""

import argparse
import shlex
import subprocess
import sys
from pathlib import Path


def run_process(download_dir: str, processed_dir: str, skip_existing: bool = True, verify: bool = True) -> int:
    """
    Run process_bmdhs.py in a child Python interpreter.

    The processing script is looked up in the same directory as this file
    and is executed with the interpreter that is running this script
    (sys.executable). The command is echoed to stdout before it is run.

    Args:
        download_dir: Directory containing downloaded raw files
            (path-like objects are accepted and converted with str())
        processed_dir: Directory where processed files will be saved
            (path-like objects are accepted and converted with str())
        skip_existing: When True, pass --skip_existing to the processing
            script (skip if destination file already exists)
        verify: When True, pass --verify to the processing script
            (verify file integrity after copying)

    Returns:
        Exit code of the processing script; non-zero means it failed.
    """
    # process_bmdhs.py is expected to live next to this script
    script_dir = Path(__file__).parent.resolve()
    process_script = script_dir / "process_bmdhs.py"

    cmd = [
        sys.executable,
        str(process_script),
        # str() keeps both the echo below and the argv list valid when a
        # pathlib.Path is passed instead of a plain string
        "--download_dir", str(download_dir),
        "--processed_dir", str(processed_dir),
        # csv_file will use default value from process_bmdhs.py
    ]
    if skip_existing:
        cmd.append("--skip_existing")
    if verify:
        cmd.append("--verify")

    # Quote each argument for display only, so that paths containing spaces
    # stay unambiguous; the subprocess still receives the raw list (no shell).
    echoed = " ".join(shlex.quote(part) for part in cmd)
    print(f"Running: {echoed}")

    # check=False: failure is reported through the return code, not an exception
    result = subprocess.run(cmd, check=False)
    return result.returncode


def _print_download_help(headline: str, location_label: str, download_dir: Path) -> None:
    """Print an error banner followed by the manual download instructions."""
    rule = "=" * 60
    print(rule)
    print(f"ERROR: {headline}")
    print(rule)
    print(f"\n{location_label}: {download_dir}")
    print("\nPlease download the BMD-HS dataset manually:")
    print("1. Visit: https://github.com/sani002/BMD-HS-Dataset")
    print("2. Download all .wav files")
    print(f"3. Place them directly in: {download_dir}/")
    print("\nThen run this script again.")
    print(rule)


def main() -> int:
    """
    Main function for BMD-HS dataset preparation.

    Parses --download_dir and --processed_dir from the command line, checks
    that the download directory is a directory containing at least one .wav
    file directly inside it, and then delegates to run_process().

    Returns:
        0 on success, 1 when the download directory is missing or contains
        no .wav files, otherwise the non-zero exit code returned by
        run_process().
    """
    parser = argparse.ArgumentParser(
        description="BMD-HS Dataset Processing Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        "--download_dir",
        type=str,
        required=True,
        help="Directory containing downloaded .wav files (files should be directly in this directory)"
    )
    parser.add_argument(
        "--processed_dir",
        type=str,
        required=True,
        help="Directory where processed files will be saved"
    )

    args = parser.parse_args()

    # is_dir() rather than exists(): a regular file at this path is not a
    # usable download directory and should get the same guidance as a
    # missing one.
    download_dir = Path(args.download_dir)
    if not download_dir.is_dir():
        _print_download_help("Download directory not found", "Expected location", download_dir)
        return 1

    # Only files placed directly in the directory are considered (no recursion)
    wav_files = list(download_dir.glob("*.wav"))
    if not wav_files:
        _print_download_help(
            "No .wav files found in download directory", "Directory checked", download_dir
        )
        return 1

    rule = "=" * 60
    print(rule)
    print("BMD-HS Dataset Processing")
    print(rule)
    print(f"Download directory: {download_dir}")
    print(f"Processed directory: {args.processed_dir}")
    print(f"Found {len(wav_files)} .wav files")
    print(rule)
    print()

    # Process files
    ret = run_process(args.download_dir, args.processed_dir, skip_existing=True, verify=True)
    if ret != 0:
        print("\nProcessing failed!")
        return ret

    print("\n" + rule)
    print("BMD-HS Dataset processing completed successfully!")
    print(rule)
    print(f"\nProcessed data location: {args.processed_dir}")
    # Join with pathlib so a trailing separator in --processed_dir does not
    # produce a doubled slash in the reported path.
    print(f"Metadata file: {Path(args.processed_dir) / 'metadata.csv'}")

    return 0


# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
