import pandas as pd
import numpy as np

# Assume the data has been read into a DataFrame, named df
df = pd.read_csv('merged_result.csv')

# Create a function to process the data
def process_data(df):
    # Ensure column names are correct
    required_columns = ['STUDY_PRTCPT_ID', 'date_day', 'study_week', 'STEP_COUNT', 'SLEEP_COUNT', 'MOOD', 'NOTIFICATION_WAS_SENT']
    if not set(required_columns).issubset(df.columns):
        raise ValueError(f"Missing necessary columns: {set(required_columns) - set(df.columns)}")

    # Convert the 'date_day' column to datetime format
    df['date_day'] = pd.to_datetime(df['date_day'])

    # Group by STUDY_PRTCPT_ID and study_week
    grouped = df.groupby(['STUDY_PRTCPT_ID', 'study_week'])

    # Calculate the mean of STEP_COUNT and censor_step
    result = grouped['STEP_COUNT'].agg(
        avg_step=('mean'),
        censor_step=(lambda x: 1 if x.isna().any() else 0)
    ).reset_index()

    # Calculate the mean of SLEEP_COUNT and censor_sleep
    sleep = grouped['SLEEP_COUNT'].agg(
        avg_sleep=('mean'),
        censor_sleep=(lambda x: 1 if x.isna().any() else 0)
    ).reset_index()

    # Calculate the sum of MOOD and censor_mood
    mood = grouped['MOOD'].agg(
        avg_mood=('sum'),
        censor_mood=(lambda x: 0 if x.isna().any() else 1)
    ).reset_index()

    # Merge the results
    result = result.merge(sleep, on=['STUDY_PRTCPT_ID', 'study_week'])
    result = result.merge(mood, on=['STUDY_PRTCPT_ID', 'study_week'])

    # Get the first 'NOTIFICATION_WAS_SENT' value for each group based on the earliest 'date_day'
    action = grouped.apply(lambda x: x.sort_values('date_day').iloc[0]['NOTIFICATION_WAS_SENT']).reset_index(name='action')

    # Merge the action into the result
    result = result.merge(action, on=['STUDY_PRTCPT_ID', 'study_week'])

    # Remove records with NA values in key columns
    result = result.dropna(subset=['avg_step', 'avg_sleep', 'avg_mood'])

    # Count whether all data for each STUDY_PRTCPT_ID is complete
    complete_data = result.groupby('STUDY_PRTCPT_ID').apply(
        lambda x: 1 if (
            x['avg_step'].notna().all() and 
            x['avg_sleep'].notna().all() and 
            x['avg_mood'].notna().all()
        ) else 0
    ).reset_index(name='data_complete')

    # Count the number of participants with complete data
    complete_count = complete_data['data_complete'].sum()
    print(f"Number of participants with complete data: {complete_count}")

    return result, complete_count

# Call the function to process the data
processed_data, complete_count = process_data(df)

# Save the result to a new CSV file
processed_data.to_csv('processed_data.csv', index=False)
