from pathlib import Path
import shutil
import csv
import random
from collections import Counter

# Seed the global `random` module once, at import time, so that any random
# choice made after this module is loaded is reproducible from run to run.
RANDOM_SEED = 20240123
random.seed(RANDOM_SEED)

# The class names accepted as labels. Classes found in the metadata and the
# labels written to the output CSVs are checked against this list.
LABELS = [
    'air_conditioner', 'car_horn', 'children_playing', 'dog_bark', 'drilling',
    'engine_idling', 'gun_shot', 'jackhammer', 'siren', 'street_music'
]

# Fold number held out as the test set; clips from every other fold are used
# for training.
FOLD_FOR_TEST = 10


def _read_metadata(meta_path: Path):
    """Load the metadata CSV and return its rows as a list of dicts.

    Each row keeps every column of the file. ``fold`` and ``classID`` are
    converted to ``int``; ``slice_file_name`` and ``class`` are stripped of
    surrounding whitespace. All other columns stay as the raw strings read
    from the file.

    Raises:
        AssertionError: if the file does not exist, a required column is
            missing from the header, or the file has no data rows.
        ValueError: if ``fold`` or ``classID`` is not an integer literal.
    """
    assert meta_path.is_file(), f"Metadata file not found: {meta_path}"
    required = {'slice_file_name', 'fsID', 'start', 'end', 'salience', 'fold', 'classID', 'class'}
    records = []
    with meta_path.open('r', newline='', encoding='utf-8') as handle:
        reader = csv.DictReader(handle)
        header = reader.fieldnames
        # `fieldnames` is None for a completely empty file; treat that as "no columns".
        assert required.issubset(header or []), (
            f"Unexpected metadata columns. Found: {header}")
        for record in reader:
            record['fold'] = int(record['fold'])
            record['classID'] = int(record['classID'])
            for text_key in ('slice_file_name', 'class'):
                record[text_key] = record[text_key].strip()
            records.append(record)
    assert records, 'Empty metadata file.'
    return records


def _build_file_index(raw: Path, rows):
    """Map every clip's source path to its ``(fold, class)`` pair.

    The source path of a row is ``raw / "fold<fold>" / <slice_file_name>``.
    If two rows resolve to the same path, the later row wins.

    Raises:
        FileNotFoundError: if a clip listed in ``rows`` is not on disk.
        AssertionError: if any class name is not one of ``LABELS``.
    """
    index = {}
    for row in rows:
        fold_no, label = row['fold'], row['class']
        audio_path = raw / f"fold{fold_no}" / row['slice_file_name']
        if not audio_path.is_file():
            raise FileNotFoundError(f"Audio file missing: {audio_path}")
        index[audio_path] = (fold_no, label)

    # Sanity check: every class seen in the metadata must be a known label.
    seen_classes = {label for _fold, label in index.values()}
    unexpected = seen_classes - set(LABELS)
    assert not unexpected, f"Unexpected class names found: {sorted(unexpected)}"
    return index


def _make_clean_dir(path: Path):
    """Ensure ``path`` is an existing, empty directory.

    Whatever currently sits at ``path`` is removed first:

    - a directory is deleted recursively;
    - a regular file or a symlink (including a dangling one) is unlinked;
      for a symlink only the link itself is removed, never its target.

    Missing parent directories are created.
    """
    # shutil.rmtree only works on real directories: it raises for regular
    # files and refuses symlinks. A dangling symlink also reports
    # exists() == False yet still blocks mkdir. Handle those cases explicitly.
    if path.is_symlink() or path.is_file():
        path.unlink()
    elif path.exists():
        shutil.rmtree(path)
    path.mkdir(parents=True, exist_ok=True)


def _generate_ids(sorted_items):
    """Assign sequential anonymized ids to items by their position.

    ``sorted_items`` is an iterable of ``(src_path, (fold, cls))`` pairs. The
    item at position ``i`` receives the id ``ID`` followed by ``i`` zero-padded
    to six digits (``ID000000``, ``ID000001``, ...). The ids are deterministic
    only if the caller passes the items in a deterministic order.

    Returns:
        dict mapping each ``src_path`` to its anonymized id.
    """
    return {
        src_path: f"ID{position:06d}"
        for position, (src_path, (_fold, _cls)) in enumerate(sorted_items)
    }


def _write_csv(path: Path, header, rows):
    """Write ``header`` followed by every row of ``rows`` to ``path`` as UTF-8 CSV.

    An existing file at ``path`` is overwritten. The file is opened with
    ``newline=''`` so the csv writer alone controls line endings.
    """
    with path.open('w', newline='', encoding='utf-8') as out:
        table = csv.writer(out)
        table.writerow(header)
        table.writerows(rows)


def _copy_anonymized(items, id_map, dest_dir: Path):
    """Copy each clip into ``dest_dir`` as ``<anon_id>.wav``; return ``[id, label]`` rows sorted by id."""
    labelled = []
    for src_path, (_fold, cls) in items:
        anon_id = id_map[src_path]
        shutil.copy2(src_path, dest_dir / f"{anon_id}.wav")
        labelled.append([anon_id, cls])
    labelled.sort(key=lambda r: r[0])
    return labelled


def prepare(raw: Path, public: Path, private: Path):
    """
    Complete preparation process for UrbanSound8K competition data.

    The raw metadata and audio files are validated first. Only once the
    inputs and the train/test split are known to be usable are the output
    directories wiped and rebuilt, so a broken raw directory does not destroy
    outputs from a previous run.

    Inputs
    - raw: absolute Path to the raw/ directory containing the fold<N>/
      directories and UrbanSound8K.csv
    - public: absolute Path to the public/ output directory (wiped and recreated)
    - private: absolute Path to the private/ output directory (wiped and recreated)

    Outputs in public/
    - train.csv (id, label), test.csv (id), sample_submission.csv (id, label)
    - train_audio/, test_audio/ holding the clips renamed to <id>.wav
    - description.txt (copied from the directory containing this file, if present)

    Outputs in private/
    - test_answer.csv (id, label)

    Raises
    - AssertionError if a path is not absolute, the metadata is invalid, the
      split is unusable, or any post-check on the written outputs fails
    - FileNotFoundError if a clip listed in the metadata is missing on disk
    """
    assert raw.is_absolute() and public.is_absolute() and private.is_absolute(), "Please pass absolute Paths for raw, public, and private."

    # Read and validate the raw inputs before touching any output directory.
    meta_path = raw / 'UrbanSound8K.csv'
    rows = _read_metadata(meta_path)
    index = _build_file_index(raw, rows)

    # Deterministic sort by (fold, filename) to ensure reproducible IDs
    sorted_items = sorted(index.items(), key=lambda kv: (kv[1][0], kv[0].name))
    id_map = _generate_ids(sorted_items)

    # Split using the fixed designated fold
    train_items = [(sp, meta) for sp, meta in sorted_items if meta[0] != FOLD_FOR_TEST]
    test_items = [(sp, meta) for sp, meta in sorted_items if meta[0] == FOLD_FOR_TEST]

    # Asserts on split
    assert len(train_items) > 0 and len(test_items) > 0, 'Train/test split must both be non-empty.'
    train_classes = {m[1] for _, m in train_items}
    test_classes = {m[1] for _, m in test_items}
    assert test_classes.issubset(train_classes), 'All classes in test must appear in training at least once.'

    # Inputs are usable: it is now safe to wipe and rebuild the outputs.
    _make_clean_dir(public)
    _make_clean_dir(private)

    # Create audio directories inside public
    train_dir = public / 'train_audio'
    test_dir = public / 'test_audio'
    train_dir.mkdir(parents=True, exist_ok=True)
    test_dir.mkdir(parents=True, exist_ok=True)

    # Copy files under anonymized names; rows come back sorted by id so the
    # CSVs are written in a deterministic order.
    train_rows = _copy_anonymized(train_items, id_map, train_dir)
    test_ans_rows = _copy_anonymized(test_items, id_map, test_dir)
    # test.csv carries the same ids as the answers, without the labels.
    test_rows = [[anon_id] for anon_id, _cls in test_ans_rows]

    # Write CSVs
    _write_csv(public / 'train.csv', ['id', 'label'], train_rows)
    _write_csv(public / 'test.csv', ['id'], test_rows)
    _write_csv(private / 'test_answer.csv', ['id', 'label'], test_ans_rows)

    # Sample submission: a fixed, valid placeholder label (the first entry of
    # LABELS) for every test id, so the file involves no randomness.
    sample_rows = [[row[0], LABELS[0]] for row in test_rows]
    _write_csv(public / 'sample_submission.csv', ['id', 'label'], sample_rows)

    # Copy description.txt into public if it exists alongside this file
    repo_root = Path(__file__).resolve().parent
    desc_src = repo_root / 'description.txt'
    if desc_src.is_file():
        shutil.copy2(desc_src, public / 'description.txt')

    # Post-asserts on outputs
    # 1) No overlap between train and test ids
    train_ids = {r[0] for r in train_rows}
    test_ids = {r[0] for r in test_rows}
    assert train_ids.isdisjoint(test_ids), 'Train and test IDs must be disjoint.'

    # 2) Audio files exist and match csv rows
    for anon_id in train_ids:
        fp = train_dir / f"{anon_id}.wav"
        assert fp.is_file(), f"Missing train audio file: {fp}"
    for anon_id in test_ids:
        fp = test_dir / f"{anon_id}.wav"
        assert fp.is_file(), f"Missing test audio file: {fp}"

    # 3) Labels are within allowed set and not empty
    for _, lbl in train_rows:
        assert lbl in LABELS, f"Invalid train label: {lbl}"
    for _, lbl in test_ans_rows:
        assert lbl in LABELS, f"Invalid test label: {lbl}"

    # 4) Ensure the training set covers every class in LABELS
    assert len(train_classes) == len(LABELS), (
        f"Training set must cover all classes. Got {len(train_classes)} vs {len(LABELS)}")

    # 5) Ensure no leakage through filenames (anonymized pattern)
    for d in (train_dir, test_dir):
        for fn in d.iterdir():
            name = fn.name
            assert name.startswith('ID') and name.endswith('.wav') and len(name) == len('ID000000.wav'), (
                f"Unexpected filename format (possible leakage): {name}")

    # 6) Correspondence between test.csv and private/test_answer.csv
    assert {r[0] for r in test_ans_rows} == test_ids, 'Mismatch between test.csv and test_answer.csv ids.'

    # 7) Ensure there are no duplicate ids in any csv
    assert len(train_rows) == len(train_ids), 'Duplicate ids in train.csv'
    assert len(test_rows) == len(test_ids), 'Duplicate ids in test.csv'

    # 8) Ensure public contains required files
    for p in ['train.csv', 'test.csv', 'sample_submission.csv', 'train_audio', 'test_audio']:
        assert (public / p).exists(), f"Missing required public artifact: {public / p}"

    # 9) Ensure private contains test_answer.csv (existence only; other
    #    entries in private/ are not checked here)
    assert (private / 'test_answer.csv').exists(), "private/test_answer.csv must exist"
