#!/usr/bin/env python3
"""
Emotion Manifest Preprocessing Script
Maps raw emotion strings to standardized natural language labels (11 classes).

Input: manifest.jsonl.gz (with raw emotion strings in supervision.custom.emotion)
Output: normalized_manifest.jsonl.gz (with standardized emotion labels)
"""

import json
import gzip
import logging
import argparse
import os
from typing import Dict, List


def read_jsonl_gz(file_path: str):
    """
    Lazily yield the JSON objects stored in a gzip-compressed JSONL file.

    The file is decoded as UTF-8 with undecodable bytes ignored. Lines that
    are empty after stripping whitespace are skipped. Lines that fail to
    parse as JSON are reported on stdout (with their 1-based line number and
    the first 100 characters) and skipped.

    Args:
        file_path: Path to the .jsonl.gz file to read.

    Yields:
        One decoded JSON value per valid, non-empty line.
    """
    with gzip.open(file_path, mode='rt', encoding='utf-8', errors='ignore') as handle:
        line_num = 0
        for raw_line in handle:
            line_num += 1
            text = raw_line.strip()
            if not text:
                # Blank lines still advance the line counter.
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError as e:
                print(f"JSON decode error at line {line_num}: {e}")
                print(f"Line content: {text[:100]}...")
            else:
                yield record


def write_jsonl_gz(data_list: List[Dict], file_path: str):
    """
    Serialize records to a gzip-compressed JSONL file, one object per line.

    Non-ASCII characters are written as-is (UTF-8) rather than escaped.

    Args:
        data_list: Records to serialize, in output order.
        file_path: Destination path; an existing file is overwritten.
    """
    with gzip.open(file_path, mode='wt', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in data_list
        )


def normalize_emotion_manifest(input_path: str, output_path: str):
    """
    Normalize the emotion label of every supervision in a manifest file.

    Reads records from ``input_path`` with ``read_jsonl_gz``, rewrites each
    ``supervisions[*].custom.emotion`` value to one of the 11 standardized
    labels, and writes all successfully processed records to ``output_path``
    with ``write_jsonl_gz``. Lookup is case-insensitive and ignores
    surrounding whitespace. Values with no mapping (including non-string
    values) are reported, collected, and mapped to ``'other'``.

    Records without supervisions, and supervisions without a
    ``custom.emotion`` field, are passed through unchanged. A record whose
    processing raises is counted as an error and omitted from the output.

    Args:
        input_path: Path to the input manifest file (.jsonl.gz)
        output_path: Path to the output manifest file (.jsonl.gz)

    Returns:
        True if the manifest was processed and written; False if reading or
        writing failed (the error is printed rather than raised).
    """
    # Define emotion mapping rules
    # Add key if needed, but DO NOT create new value!
    emotion_mapping = {
        # Angry
        'ang': 'angry',
        'angry': 'angry',
        'anger': 'angry',

        # Disgust
        'dis': 'disgust',
        'disgust': 'disgust',

        # Fear
        'fea': 'fear',
        'fear': 'fear',
        'fearful': 'fear',

        # Happy
        'hap': 'happy',
        'happy': 'happy',
        'joy': 'happy',

        # Sad
        'sad': 'sad',
        'sadness': 'sad',

        # Neutral
        'neu': 'neutral',
        'neutral': 'neutral',

        # Calm
        'calm': 'calm',

        # Surprised
        'surprise': 'surprised',
        'surprised': 'surprised',
        'pleasantsurprised': 'surprised',
        'pleasantsurprise': 'surprised',
        'sur': 'surprised',

        # Excited
        'excited': 'excited',
        'exc': 'excited',

        # Frustrated
        'frustrated': 'frustrated',
        'fru': 'frustrated',

        # Other / Unknown
        'other': 'other',
        'unknown': 'other',
        'oth': 'other',
    }

    # Label used for anything the mapping does not recognise. It must be one
    # of the standardized values above (and therefore a key of mapping_stats).
    fallback_label = 'other'

    print(f"Processing manifest: {input_path}")
    print(f"Output manifest: {output_path}")
    # Sorted so the banner is deterministic across runs.
    print(f"Available emotion mappings: {sorted(set(emotion_mapping.values()))}")

    processed_count = 0
    error_count = 0
    mapping_stats = {
        'angry': 0,
        'disgust': 0,
        'fear': 0,
        'happy': 0,
        'neutral': 0,
        'sad': 0,
        'frustrated': 0,
        'excited': 0,
        'surprised': 0,
        'calm': 0,
        'other': 0,
    }
    unknown_emotions = set()

    processed_data = []

    try:
        # line_num counts the records yielded by read_jsonl_gz; it can differ
        # from the physical line number when blank or invalid lines were skipped.
        for line_num, cut_data in enumerate(read_jsonl_gz(input_path), 1):
            try:
                if 'supervisions' in cut_data:
                    for supervision in cut_data['supervisions']:
                        # check supervision.custom.emotion field
                        custom = (
                            supervision.get('custom')
                            if isinstance(supervision, dict)
                            else None
                        )
                        if not isinstance(custom, dict) or 'emotion' not in custom:
                            continue

                        original_emotion = custom['emotion']

                        # mapping (only strings can be looked up; anything
                        # else is treated as an unknown emotion)
                        if isinstance(original_emotion, str):
                            normalized = emotion_mapping.get(
                                original_emotion.strip().lower()
                            )
                        else:
                            normalized = None

                        if normalized is None:
                            print(f"Warning: Unknown emotion '{original_emotion}' at line {line_num}")
                            # str() keeps the set hashable and sortable even
                            # when the raw value is not a string.
                            unknown_emotions.add(str(original_emotion))
                            normalized = fallback_label

                        custom['emotion'] = normalized
                        mapping_stats[normalized] += 1

                processed_data.append(cut_data)
                processed_count += 1

                if processed_count % 1000 == 0:
                    print(f"Processed {processed_count} records...")

            except Exception as e:
                # Best effort: report the bad record and keep going. The
                # record is not written to the output.
                print(f"Error processing line {line_num}: {e}")
                error_count += 1

        write_jsonl_gz(processed_data, output_path)

        # Print stats
        print("\nProcessing completed successfully!")
        print(f"Total processed: {processed_count}")
        print(f"Errors: {error_count}")
        print("Mapping statistics:")
        for label, count in mapping_stats.items():
            if count > 0:
                print(f"  {label}: {count}")

        if unknown_emotions:
            print(f"\nUnknown emotions found: {sorted(unknown_emotions)}")
            print(f"These were mapped to '{fallback_label}'")

        return True

    except Exception as e:
        print(f"Failed to process file: {e}")
        return False


def main():
    """
    Command-line entry point: parse arguments and normalize one manifest.

    Expects two positional arguments, the input and output manifest paths.

    Returns:
        0 when normalization reports success; 1 when the input file does not
        exist, normalization reports failure, or an unexpected error occurs.
    """
    cli = argparse.ArgumentParser(description="Normalize emotion labels in manifest files")
    positional_args = (
        ("input", "Input manifest file (.jsonl.gz)"),
        ("output", "Output manifest file (.jsonl.gz)"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)

    options = cli.parse_args()
    source_path = options.input
    target_path = options.output

    # Fail fast with a clear message before attempting any processing.
    if not os.path.exists(source_path):
        print(f"Error: Input file does not exist: {source_path}")
        return 1

    try:
        if normalize_emotion_manifest(source_path, target_path):
            print("\nEmotion manifest preprocessing completed successfully!")
            return 0

        print("\nEmotion manifest preprocessing failed!")
        return 1

    except Exception as e:
        print(f"Unexpected error: {e}")
        return 1


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is raised directly because the bare exit() helper is injected by the
    # `site` module for interactive use and is absent under `python -S`.
    raise SystemExit(main())