import pandas as pd
import numpy as np
import os
import re

# Maps a model directory name as it appears in the simulation result path to
# the directory name used in the conversation output path. Path components
# that are not keys here are left unchanged by split_data_by_humans.
model_mapping = {
    'gpt-4o-mini-2024-07-18': 'gpt-4o-mini-2024-07-18',
    'ft:gpt-4o-mini-2024-07-18:camer::BOqtcdMB': 'ft:gpt-4o-mini-2024-07-18:camer:topic-split-all:BOqtcdMB',
    'ft:gpt-4o-mini-2024-07-18:camer::BOvZjzvU': 'ft:gpt-4o-mini-2024-07-18:camer:group-split-all:BOvZjzvU',
    'ft:gpt-4o-mini-2024-07-18:camer:round-split-valid:BRTJQLtG': 'ft:gpt-4o-mini-2024-07-18:camer:round-split-valid:BRTJQLtG',
    'ft:gpt-4o-mini-2024-07-18:camer::BOvS862Y': 'ft:gpt-4o-mini-2024-07-18:camer:round-split-all:BOvS862Y',
    'Llama-3.1-8B-Instruct': 'Llama-3.1-8B-Instruct',
    'llama_3.1_8b_QLoRA_May2': 'ft:Llama-3.1-8B-Instruct:round-split-valid-5epochs',
    'llama_3.1_8b_QLoRA_left': 'ft:Llama-3.1-8B-Instruct:round-split-valid-1epoch',
}

# Maps the value of the `field` column on exit_survey rows to the wording of
# the corresponding survey question. split_data_by_humans writes this wording
# into `input_prompt` for exit_survey rows; fields missing here map to NaN.
exit_survey_questions = {
    'age': 'What is your age?',
    'gender': 'What is your gender?',
    'residency': 'What is your country/region of residency?',
    'origin': 'What is your country/region of origin?',
    'education': 'What is the highest level of education you have completed?',
    'ethnicity': 'What is your race/ethnicity? (PLEASE SELECT ALL THAT APPLY)',
    'income': 'What is your household annual income?',
    'politicalIdentity': 'Generally speaking, do you usually think of yourself as a(n)...?',
    'politicalViews': 'In general, would you describe your political views as...?',
    'maritalStatus': 'Are you currently...?',
    'childrenSchool': 'If you have children, where do they go to school? (PLEASE SELECT ALL THAT APPLY)',
    'residence': 'Would you describe the place where you live as urban, suburban or rural?',
    'bibleBelief': 'Next, which of these statements comes closest to describing your feelings about the Bible?',
    'evangelical': 'Would you describe yourself as a "born-again" or evangelical Christian, or not?',
    'religion': 'What is your religious preference – are you Protestant, Roman Catholic, Jewish, Muslim, another religion or no religion?',
    'occupation': 'Which of the following best describes the kind of work you do?'
}


def extract_version(filename):
    """Return the version tag encoded in a simulation file name.

    A name ending in ``simulation-v<digits>.csv`` yields ``'v<digits>'``.
    Any other name yields the fallback tag ``'v0'``.
    """
    found = re.search(r'simulation-v(\d+)\.csv$', filename)
    if found is None:
        # No explicit version suffix: treat the file as the initial version.
        return 'v0'
    return 'v' + found.group(1)

def get_invalid_players(user_data, required_tweets=3):
    """Return the players whose event log is incomplete.

    Players are identified by the first five characters of their id. The
    candidate set is every non-missing ``sender_id`` in ``user_data``. A
    candidate is invalid when at least one of the following holds:

    * fewer than ``required_tweets`` rows with ``event_type == 'tweet'`` were
      sent by the player (this includes players with no tweet rows at all);
    * no ``exit_survey`` row carries the player's id in ``worker_id``;
    * no ``message_sent`` row carries the player's id in ``sender_id``.

    Args:
        user_data: DataFrame with ``event_type``, ``sender_id`` and
            ``worker_id`` columns; ids are expected to be strings.
        required_tweets: Minimum number of tweet rows a valid player must
            have. Defaults to 3.

    Returns:
        Sorted list of five-character player id prefixes.
    """
    def short_ids(ids):
        # Missing ids are dropped first; slicing a NaN float would raise.
        return ids.dropna().apply(lambda x: x[:5])

    events = user_data['event_type']
    all_players = short_ids(user_data['sender_id']).unique()

    tweet_counts = short_ids(user_data.loc[events == 'tweet', 'sender_id']).value_counts()
    surveyed = set(short_ids(user_data.loc[events == 'exit_survey', 'worker_id']))
    messaged = set(short_ids(user_data.loc[events == 'message_sent', 'sender_id']))

    invalid = {
        player
        for player in all_players
        # .get(..., 0) makes a player with no tweets count as 0 rather than
        # being absent from the tally and silently passing the check.
        if tweet_counts.get(player, 0) < required_tweets
        or player not in surveyed
        or player not in messaged
    }
    return sorted(invalid)


def _drop_repeated_events(df, event_type, key_columns):
    """Keep only the first ``event_type`` row for each ``key_columns`` combination.

    Rows of other event types are left untouched. Rows with a missing value in
    any key column are never treated as repeats.
    """
    is_event = df['event_type'] == event_type
    has_full_key = df[key_columns].notna().all(axis=1)
    # duplicated() runs on the event's own rows only, so rows of other event
    # types that share a key cannot count as the "first" occurrence.
    repeated = df.loc[is_event & has_full_key].duplicated(subset=key_columns, keep='first')
    return df.drop(index=repeated[repeated].index)


def _copy_prompts_to_received(df):
    """Copy each sent message's ``input_prompt`` onto received rows with the same text.

    Modifies ``df`` in place and returns it. When several ``message_sent`` rows
    share the same text, the prompt of the last one is used.
    """
    sent = df[df['event_type'] == 'message_sent']
    if sent.empty:
        return df
    prompt_by_text = sent.drop_duplicates(subset='text', keep='last').set_index('text')['input_prompt']
    # NOTE(review): rows are paired on message text alone, so identical texts
    # from different senders or rounds share one prompt. Confirm this is intended.
    matched = (df['event_type'] == 'message_recieved') & df['text'].isin(prompt_by_text.index)
    df.loc[matched, 'input_prompt'] = df.loc[matched, 'text'].map(prompt_by_text)
    return df


def _resolve_output_dir(input_file_path, simulation_root, conversation_root):
    """Mirror the input file's directory under ``conversation_root``, renaming model folders."""
    relative_dir = os.path.relpath(os.path.dirname(input_file_path), simulation_root)
    # Only the relative part is remapped, so an absolute conversation_root
    # keeps its leading separator when the path is joined back together.
    renamed = [model_mapping.get(part, part) for part in relative_dir.split(os.sep)]
    return os.path.join(conversation_root, *renamed)


def split_data_by_humans(input_file_path,
                         simulation_root='../../result/simulation',
                         conversation_root='../../result/conversation'):
    """Split one simulation CSV into one conversation CSV per participant.

    Steps:
      1. Drop repeated tweets (same round and sender) and repeated exit-survey
         answers (same worker and field), keeping the first of each.
      2. Drop message rows whose text is missing or blank.
      3. Copy the ``input_prompt`` of sent messages onto the matching received
         rows, and fill ``input_prompt`` of exit-survey rows with the question
         wording from ``exit_survey_questions``.
      4. For every non-missing ``worker_id``, write the rows involving that
         participant to ``agent_<id>_<version>.csv`` (with an
         ``_invalid_player`` suffix when ``get_invalid_players`` flags the
         participant). Existing output files are left untouched.

    Args:
        input_file_path: Path of the simulation CSV to split.
        simulation_root: Directory the input path is made relative to.
        conversation_root: Directory under which the mirrored output directory
            is created.
    """
    df = pd.read_csv(input_file_path)

    df = _drop_repeated_events(df, 'tweet', ['chat_round_order', 'sender_id'])
    df = _drop_repeated_events(df, 'exit_survey', ['worker_id', 'field'])

    # Remove rows with empty text, but only for message events.
    is_message = df['event_type'].isin(['message_sent', 'message_recieved'])
    has_text = df['text'].notna() & (df['text'].str.strip() != '')
    # .copy() detaches the result so the assignments below act on an
    # independent frame instead of a slice of the one that was read.
    df = df[~is_message | has_text].copy()

    df = _copy_prompts_to_received(df)

    # Exit-survey rows get the question wording as their prompt.
    is_survey = df['event_type'] == 'exit_survey'
    df.loc[is_survey, 'input_prompt'] = df.loc[is_survey, 'field'].map(exit_survey_questions)

    dir_path = _resolve_output_dir(input_file_path, simulation_root, conversation_root)
    version = extract_version(os.path.basename(input_file_path))
    os.makedirs(dir_path, exist_ok=True)

    # Missing worker ids are skipped; they would only yield an empty agent_nan file.
    selected_humans = df['worker_id'].dropna().unique()
    invalid_players = set(get_invalid_players(df))

    columns_to_keep = ['event_type', 'input_prompt', 'chat_round_order', 'sender_id',
                       'recipient_id', 'sliderValue', 'field', 'text', 'llm_text', 'agreement_level']

    for human_id in selected_humans:
        output_name = f'agent_{human_id}_{version}'
        # get_invalid_players reports ids truncated to five characters, so the
        # lookup uses the same truncation.
        if str(human_id)[:5] in invalid_players:
            output_name += '_invalid_player'
        output_file = os.path.join(dir_path, f'{output_name}.csv')
        # Skip before doing any filtering work if the file is already there.
        if os.path.exists(output_file):
            continue

        involves_human = (
            (df['worker_id'] == human_id)
            | (df['recipient_id'] == human_id)
            | (df['sender_id'] == human_id)
        )
        human_data = df[involves_human].copy()

        # Relabel tweet events from this participant's point of view.
        is_tweet = human_data['event_type'] == 'tweet'
        sent_tweet = is_tweet & (human_data['sender_id'] == human_id)
        received_tweet = is_tweet & (human_data['recipient_id'] == human_id)
        human_data.loc[sent_tweet, 'event_type'] = 'tweet_sent'
        human_data.loc[received_tweet, 'event_type'] = 'tweet_received'

        # Drop message_sent rows written by someone else, and message_recieved
        # rows whose sender is this participant.
        sent_by_other = (human_data['event_type'] == 'message_sent') & (human_data['sender_id'] != human_id)
        own_message_received = (human_data['event_type'] == 'message_recieved') & (human_data['sender_id'] == human_id)
        human_data = human_data[~(sent_by_other | own_message_received)]

        human_data = human_data[columns_to_keep]
        human_data = human_data.rename(columns={'text': 'text_human', 'llm_text': 'text_llm', 'agreement_level': 'llm_agreement_level'})

        human_data.to_csv(output_file, index=False)
        print(f"Created file for human {human_id}: {output_file}")

def process_directory(root_dir):
    """Run split_data_by_humans on every simulation CSV found under root_dir.

    A file qualifies when its name starts with ``simulation-`` and ends with
    ``.csv``. A failure on one file is printed and does not stop the
    remaining files from being processed.
    """
    for current_dir, _subdirs, names in os.walk(root_dir):
        for name in names:
            if not name.endswith('.csv'):
                continue
            if not name.startswith('simulation-'):
                continue
            input_file = os.path.join(current_dir, name)
            try:
                split_data_by_humans(input_file)
            except Exception as e:
                # Best-effort batch run: report the failure and carry on.
                print(f"Error processing {input_file}: {str(e)}")

if __name__ == "__main__":
    # Script entry point: process every simulation CSV below the root
    # directory. The path is relative to the current working directory;
    # replace it with your own root directory if needed.
    root_directory = "../../result/simulation/"
    process_directory(root_directory)
    # Debugging helpers, left disabled:
    # count the entries in the conversation output directory
    # print(len(os.listdir('../../result/conversation')))
    # process a single simulation file instead of the whole tree
    # split_data_by_humans('../../result/simulation/20250407_193242_Everything_that_happens_can_eventually_be_explained_by_science_01JR8SQVP8RJTRVRQS15N6WBVB/gpt-4o-mini-2024-07-18/simulation-v1.csv')