#!/usr/bin/env python3
"""
Audio sampling script for CREMA-D dataset
Samples audio files based on emotion labels from CSV file
"""

import argparse
import pandas as pd
import os
import shutil
import random
from pathlib import Path


def _non_negative_int(value):
    """Convert a command-line string to an int that is zero or greater.

    Used as an argparse ``type=`` callable so that bad values are reported
    as a normal usage error instead of failing later during sampling.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not an integer or is
            negative.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"expected an integer, got {value!r}"
        ) from None
    if number < 0:
        raise argparse.ArgumentTypeError(
            f"must be zero or greater, got {number}"
        )
    return number


def parse_arguments():
    """Parse command line arguments.

    Returns:
        argparse.Namespace with the attributes ``emotion``, ``csv_file``,
        ``wav_root``, ``output_dir`` (all required strings) and
        ``sample_num`` (non-negative int, default 1000).

    Invalid or missing arguments make argparse print a usage message and
    exit the process with status 2.
    """
    parser = argparse.ArgumentParser(
        description="Sample audio files by emotion from CREMA-D dataset"
    )
    parser.add_argument(
        "--emotion",
        type=str,
        required=True,
        choices=["fearful", "happy", "sad", "angry", "disgust", "neutral"],
        help="Target emotion to sample",
    )
    parser.add_argument(
        "--csv_file",
        type=str,
        required=True,
        help="Path to the CSV file containing emotion labels",
    )
    parser.add_argument(
        "--wav_root",
        type=str,
        required=True,
        help="Root directory containing wav files",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Output directory for sampled files",
    )
    parser.add_argument(
        "--sample_num",
        # A negative count would otherwise only fail deep inside the
        # sampling step; reject it here with a clear usage error.
        type=_non_negative_int,
        default=1000,
        help="Number of samples to extract (default: 1000)",
    )

    return parser.parse_args()


def get_emotion_mapping():
    """Build the lookup table from emotion names to their CSV codes.

    Returns:
        A new dict on every call, keyed by the emotion name accepted on the
        command line and valued by the single-letter code used in the CSV.
    """
    name_code_pairs = (
        ("fearful", "F"),
        ("happy", "H"),
        ("sad", "S"),
        ("angry", "A"),
        ("disgust", "D"),
        ("neutral", "N"),
    )
    return dict(name_code_pairs)


def main():
    """Sample clips of one emotion, copy them, and write a metadata CSV.

    Steps:
      1. Read ``--csv_file`` and keep the rows whose ``respEmo`` column
         equals the code for ``--emotion``.
      2. Reduce the matches to unique ``clipName`` values and draw up to
         ``--sample_num`` of them with a fixed seed, so repeated runs over
         the same CSV select the same clips.
      3. Copy ``<wav_root>/<clipName>.wav`` for each selected clip into
         ``<output_dir>/<emotion>/``.
      4. Write ``<output_dir>/<emotion>.csv`` with one row per file that
         was actually copied.

    Problems with the input (unreadable CSV, missing columns, no matching
    rows, negative sample count) are printed and the function returns
    without writing anything. Missing or uncopyable wav files are reported
    and skipped.
    """
    args = parse_arguments()

    target_emotion_code = get_emotion_mapping()[args.emotion]

    print(f"Sampling {args.emotion} (code: {target_emotion_code}) audio files...")

    # DataFrame.sample() raises on a negative n; report it cleanly instead.
    if args.sample_num < 0:
        print(f"Error: --sample_num must be non-negative, got {args.sample_num}")
        return

    # Read CSV file
    print(f"Reading CSV file: {args.csv_file}")
    try:
        df = pd.read_csv(args.csv_file)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    # Fail with a readable message rather than a KeyError traceback.
    missing_columns = [
        column for column in ("respEmo", "clipName") if column not in df.columns
    ]
    if missing_columns:
        print(
            "Error: CSV file is missing required column(s): "
            + ", ".join(missing_columns)
        )
        return

    # Filter by emotion
    print(f"Filtering by emotion code: {target_emotion_code}")
    emotion_df = df[df["respEmo"] == target_emotion_code]

    # The CSV may hold several rows for the same clip. Sampling rows
    # directly would then copy the same wav more than once and list it
    # repeatedly in the metadata, so sample over unique clip names instead.
    # Rows without a clip name cannot be mapped to a file and are dropped.
    emotion_df = emotion_df.dropna(subset=["clipName"]).drop_duplicates(
        subset="clipName"
    )

    if emotion_df.empty:
        print(f"No samples found for emotion: {args.emotion}")
        return

    print(f"Found {len(emotion_df)} samples for emotion: {args.emotion}")

    # Sample specified number of records (capped at what is available)
    sample_size = min(args.sample_num, len(emotion_df))
    sampled_df = emotion_df.sample(n=sample_size, random_state=42)

    print(f"Sampling {sample_size} files...")

    # Create output directory
    output_emotion_dir = os.path.join(args.output_dir, args.emotion)
    os.makedirs(output_emotion_dir, exist_ok=True)

    # Column order of the metadata CSV; fixed up front so the header is
    # written even when no file could be copied.
    output_columns = [
        "dataset_name",
        "wav_filename",
        "emotion_label",
        "gender",
        "speaker_id",
    ]
    output_data = []

    # Process each sampled clip
    for clip_name in sampled_df["clipName"]:
        wav_filename = f"{clip_name}.wav"
        source_path = os.path.join(args.wav_root, wav_filename)
        target_path = os.path.join(output_emotion_dir, wav_filename)

        if not os.path.exists(source_path):
            print(f"Warning: Source file not found: {source_path}")
            continue

        try:
            shutil.copy2(source_path, target_path)
        except OSError as e:
            # Covers permission/disk errors and shutil.Error (an OSError
            # subclass); skip this clip and keep going with the rest.
            print(f"Error copying {source_path}: {e}")
            continue

        # Only successfully copied files are listed in the metadata.
        # gender and speaker_id are not derived here and are written as
        # the placeholder "unknown".
        output_data.append(
            {
                "dataset_name": "CREMA-D",
                "wav_filename": wav_filename,
                "emotion_label": args.emotion,
                "gender": "unknown",
                "speaker_id": "unknown",
            }
        )

    # Save output CSV
    output_csv_path = os.path.join(args.output_dir, f"{args.emotion}.csv")
    output_df = pd.DataFrame(output_data, columns=output_columns)
    output_df.to_csv(output_csv_path, index=False)

    print("Sampling completed!")
    print(f"Copied {len(output_data)} audio files to {output_emotion_dir}")
    print(f"Saved metadata to {output_csv_path}")


# Run the sampler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
