# data_loader.py
# Utilities to load text sessions, audio files, and create PyTorch DataLoader

import csv
import json
import os
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Union

import torch
from torch.utils.data import Dataset, DataLoader

# torchaudio is an optional dependency. Any failure while importing it is
# swallowed and the name is bound to None, so later code can check
# `torchaudio is not None` before using it.
try:
    import torchaudio
except Exception:
    torchaudio = None
    # torchaudio optional; fallback loaders will be used if not installed

def load_text_sessions(path: str) -> List[Dict]:
    """
    Load conversational sessions from a file.

    The format is chosen from the file extension (case-insensitive):

    * ``.jsonl`` - one JSON value per line; blank lines are skipped.
    * ``.json``  - a JSON list is returned as-is, any other top-level value
      is wrapped in a single-element list.
    * ``.csv``   - one dict per row, keyed by the header row (values are
      strings, as produced by ``csv.DictReader``).
    * ``.txt``   - one session per line: ``{"id": <line index>, "text": <stripped line>}``.

    Files are decoded as UTF-8; a leading byte-order mark is ignored.

    Args:
        path: Location of the sessions file.

    Returns:
        The list of sessions (e.g. {'id':..., 'utterances': [...], 'meta': {...}}).

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the extension is not one of the supported types
            (malformed JSON also surfaces as ``json.JSONDecodeError``).
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"{path} not found")

    # Compare lower-cased so that e.g. "DATA.JSON" is recognised too.
    suffix = p.suffix.lower()
    # "utf-8-sig" reads plain UTF-8 and also drops a BOM if one is present;
    # otherwise the BOM breaks json parsing and pollutes the first CSV header.
    encoding = "utf-8-sig"

    if suffix == ".jsonl":
        with p.open("r", encoding=encoding) as f:
            # Skip empty / whitespace-only lines instead of failing on them.
            return [json.loads(line) for line in f if line.strip()]

    if suffix == ".json":
        with p.open("r", encoding=encoding) as f:
            data = json.load(f)
        # A non-list top level is treated as a single session.
        return data if isinstance(data, list) else [data]

    if suffix == ".csv":
        # newline="" lets the csv module handle line endings itself, which is
        # required for quoted fields that contain newlines.
        with p.open("r", encoding=encoding, newline="") as f:
            return [dict(row) for row in csv.DictReader(f)]

    if suffix == ".txt":
        with p.open("r", encoding=encoding) as f:
            # Blank lines are kept so that ids keep matching line positions.
            return [{"id": i, "text": line.strip()} for i, line in enumerate(f)]

    raise ValueError("Unsupported file type for load_text_sessions: " + p.suffix)

def load_audio(path: str, sr: Optional[int] = None) -> torch.Tensor:
    """
    Load an audio file as a mono float32 waveform tensor of shape (1, samples).

    torchaudio is used when it was importable at module load; otherwise
    librosa is tried as a fallback.

    Args:
        path: Audio file to read.
        sr: Target sample rate in Hz. When omitted (or falsy) 16000 is used.

    Returns:
        The waveform, down-mixed to one channel and resampled to the target rate.

    Raises:
        RuntimeError: If torchaudio is unavailable and librosa either cannot
            be imported or fails to read the file. The two cases carry
            different messages and chain the original exception.
    """
    target_sr = sr or 16000
    if torchaudio is not None:
        waveform, orig_sr = torchaudio.load(path)
        # Average the channels to get mono while keeping the channel dimension.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        if orig_sr != target_sr:
            waveform = torchaudio.functional.resample(waveform, orig_sr, target_sr)
        return waveform.float()

    # Fallback: librosa. Import and load are guarded separately so a missing
    # or unreadable file is not misreported as "no backend available".
    try:
        import librosa
    except Exception as e:
        raise RuntimeError("No audio backend available (torchaudio or librosa).") from e

    try:
        # librosa down-mixes and resamples itself via mono=True / sr=...
        samples, _ = librosa.load(path, sr=target_sr, mono=True)
    except Exception as e:
        raise RuntimeError(f"Failed to load audio file {path!r} with librosa.") from e

    # Add the leading channel dimension to match the torchaudio branch.
    return torch.from_numpy(samples.astype("float32")).unsqueeze(0)

class ConversationDataset(Dataset):
    """
    Minimal Dataset that yields dicts:
    {'id': ..., 'text': ..., 'audio_path': Optional[str], 'meta': dict}

    The user can pass preloaded sessions (any iterable of dicts) or a path to a
    sessions file, which is then read with load_text_sessions.
    """
    def __init__(self, sessions: Union[Iterable[Dict], str, os.PathLike]):
        """
        Args:
            sessions: Iterable of session dicts, or a str / path-like pointing
                to a sessions file. An iterable is materialized into a list,
                so generators are accepted.
        """
        super().__init__()
        # A string is itself iterable; without this check a path would be
        # split into single characters instead of being loaded.
        if isinstance(sessions, (str, os.PathLike)):
            sessions = load_text_sessions(os.fspath(sessions))
        self.sessions = list(sessions)

    def __len__(self):
        """Return the number of sessions."""
        return len(self.sessions)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """
        Return the session at ``idx`` with standardized keys.

        'id' falls back to the item's non-negative position, 'text' falls back
        to the 'utterances' value and then to "", 'audio_path' to None and
        'meta' to an empty dict.

        Raises:
            IndexError: If ``idx`` is out of range.
        """
        # Look up with the caller's index first so out-of-range values raise.
        item = self.sessions[idx]
        # Use the absolute position as fallback id so ds[-1] and ds[len - 1]
        # report the same id for the same item.
        position = idx if idx >= 0 else idx + len(self.sessions)
        # standardize keys
        return {
            "id": item.get("id", position),
            "text": item.get("text") or item.get("utterances") or "",
            "audio_path": item.get("audio_path"),
            "meta": item.get("meta", {}),
        }

def _identity_collate(batch: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return the batch unchanged, i.e. as the list of per-item dicts."""
    return batch


def create_dataloader(
    sessions: Iterable[Dict],
    batch_size: int = 8,
    shuffle: bool = False,
    num_workers: int = 0,
    collate_fn: Optional[Callable[[List[Dict[str, Any]]], Any]] = None,
):
    """
    Create a PyTorch DataLoader from sessions.

    Args:
        sessions: Session dicts, wrapped in a ConversationDataset.
        batch_size: Number of items per batch.
        shuffle: Whether the DataLoader shuffles the items.
        num_workers: Number of DataLoader worker processes.
        collate_fn: Optional function that turns a list of items into a batch.
            When omitted, collation is minimal: each batch is the plain list
            of item dicts.

    Returns:
        The configured DataLoader.
    """
    ds = ConversationDataset(sessions)
    # The default is a module-level function rather than a lambda: a lambda
    # cannot be pickled, which breaks num_workers > 0 on platforms where
    # worker processes are spawned.
    if collate_fn is None:
        collate_fn = _identity_collate
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )
