import csv
import os

# Rosters of known Friends characters, split by gender. Entries keep their
# display spelling; get_gender() compares speakers against normalized
# (stripped, lowercased) copies of these lists. Some entries are first names
# only, others carry a surname or a descriptive suffix.
female_characters = [
    "Rachel Green", "Monica Geller", "Phoebe Buffay", "Carol Willick",
    "Susan Bunch", "Janice Litman", "Emily Waltham", "Judy Geller",
    "Erica Bing", "Ursula Buffay", "Amy", "Jill Green", "Kathy", "Julie",
    "Charlie Wheeler", "Alice Buffay", "Estelle Leonard", "Nora Tyler Bing",
    "Bonnie", "Cailin", "Caroline Duffy", "Dina", "Joanna", "Denise", "Leah",
]
male_characters = [
    "Ross Geller", "Chandler Bing", "Joey Tribbiani", "Mike Hannigan",
    "Ben Geller", "Jack Geller", "Frank Buffay Jr.", "Richard Burke",
    "Gunther", "Pete Becker", "Tag Jones", "Paul Stevens", "Joshua Burgin",
    "Barry Farber", "David the physicist", "Bob", "Eddie Menuek", "Marcel",
    "Fun Bobby", "Dr. Oberman", "Paolo", "Mark", "Danny", "Duncan", "Gary",
    "Steve",
]


# Canonical form used when comparing character names.
def normalize_name(name):
    """Return *name* with surrounding whitespace removed and letters lowercased."""
    trimmed = name.strip()
    return trimmed.lower()


# Normalized copies of both rosters (same order as the originals), built once
# at import time so lookups do not re-normalize every known name.
female_characters_norm = list(map(normalize_name, female_characters))
male_characters_norm = list(map(normalize_name, male_characters))


def get_gender(speaker_name):
    """Classify a speaker as "female", "male" or "unknown".

    The speaker is normalized with normalize_name() and compared against the
    normalized character rosters. A roster entry matches when one name
    contains the other as a run of whole words, so "Rachel" matches
    "rachel green" and "Ross Geller Jr" would match "ross geller", but a
    mid-word fragment does not ("Eric" no longer matches "erica bing").

    The female roster is consulted first, so a speaker that matches entries
    in both rosters (for example a bare shared surname) is reported as
    "female".

    Args:
        speaker_name: Raw speaker label; may be None or blank.

    Returns:
        "female", "male", or "unknown" when nothing matches or the label is
        None/blank.
    """
    if speaker_name is None:
        return "unknown"

    # Collapse whitespace runs so the space-padding trick below sees exactly
    # one space between words.
    speaker = " ".join(normalize_name(speaker_name).split())
    if not speaker:
        # The empty string is a substring of every string, so a blank label
        # would otherwise match the first roster entry and come back "female".
        return "unknown"
    padded_speaker = f" {speaker} "

    def matches_whole_words(known_name):
        # Padding both sides with spaces turns a substring test into a
        # whole-word containment test.
        padded_known = f" {' '.join(known_name.split())} "
        return padded_known in padded_speaker or padded_speaker in padded_known

    if any(matches_whole_words(known) for known in female_characters_norm):
        return "female"
    if any(matches_whole_words(known) for known in male_characters_norm):
        return "male"
    return "unknown"


# Positions of the fields read from each input row.
_SPEAKER_COL = 2
_EMOTION_COL = 3
_DIALOGUE_ID_COL = 5
_UTTERANCE_ID_COL = 6


def generate_meld_csv(input_files=None, output_file="meld.csv"):
    """Merge the per-split emotion CSVs into one summary CSV.

    Each input file must start with a header row. For every data row whose
    field count equals the header's, one output row is written:
    ``meld, dia<dialogue_id>_utt<utterance_id>.wav, <emotion>, <gender>,
    <speaker in title case>``. Blank lines are ignored; rows with a different
    field count are skipped and their number is reported per file.

    Progress and errors are reported with print(). If any input file is
    missing, nothing is written. If an error occurs while processing, the
    output file may be left partially written.

    Args:
        input_files: Iterable of input CSV paths. Defaults to the train, dev
            and test ``*_sent_emo_clean.csv`` files in the working directory.
        output_file: Path of the CSV to create (overwritten if it exists).

    Returns:
        None.
    """
    if input_files is None:
        input_files = [
            "train_sent_emo_clean.csv",
            "dev_sent_emo_clean.csv",
            "test_sent_emo_clean.csv",
        ]
    # Materialize once: the paths are iterated twice (existence check, then
    # processing), which would exhaust a generator.
    input_files = list(input_files)
    header = ["dataset_name", "wav_filename", "emotion_label", "gender", "speaker_id"]
    min_columns = _UTTERANCE_ID_COL + 1

    # Verify every input up front so a missing file never truncates an
    # existing output file.
    for input_file in input_files:
        if not os.path.exists(input_file):
            print(f"Error: {input_file} not found")
            return

    try:
        with open(output_file, "w", newline="", encoding="utf-8") as outfile:
            writer = csv.writer(outfile)
            writer.writerow(header)
            for input_file in input_files:
                print(f"Processing {input_file}...")
                skipped = 0
                # newline="" lets the csv module handle line endings itself,
                # including newlines embedded in quoted fields.
                with open(input_file, "r", newline="", encoding="utf-8") as infile:
                    reader = csv.reader(infile)
                    input_header = next(reader, None)
                    if input_header is None:
                        raise ValueError(f"{input_file} is empty (no header row)")
                    if len(input_header) < min_columns:
                        raise ValueError(
                            f"{input_file} has {len(input_header)} columns, "
                            f"expected at least {min_columns}"
                        )
                    for row in reader:
                        if not row:
                            # csv.reader yields [] for blank lines.
                            continue
                        if len(row) != len(input_header):
                            # Skip rows with mismatched field count
                            skipped += 1
                            continue
                        dialogue_id = row[_DIALOGUE_ID_COL]
                        utterance_id = row[_UTTERANCE_ID_COL]
                        emotion = row[_EMOTION_COL]
                        speaker = row[_SPEAKER_COL]
                        wav_filename = f"dia{dialogue_id}_utt{utterance_id}.wav"
                        gender = get_gender(speaker)
                        # Convert speaker name to title case for output.
                        # NOTE(review): str.title() also capitalizes the letter
                        # after an apostrophe ("joey's" -> "Joey'S"); left as-is
                        # to keep speaker_id values stable.
                        speaker_title = speaker.title()
                        writer.writerow(
                            ["meld", wav_filename, emotion, gender, speaker_title]
                        )
                if skipped:
                    print(
                        f"Warning: skipped {skipped} malformed row(s) in {input_file}"
                    )
        print(f"Successfully generated {output_file}")
    except (OSError, ValueError, csv.Error) as e:
        # ValueError also covers UnicodeDecodeError from badly encoded input.
        print(f"Error processing files: {e}")


# Script entry point: build the combined CSV only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    generate_meld_csv()
