#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 20 16:47:39 2025

"""

import pandas as pd
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
import numpy as np
from sklearn.decomposition import PCA

# Register the sentence-embedding model globally on llama-index Settings so
# every later `Settings.embed_model.get_text_embedding(...)` call uses it.
# The embedding matrices allocated further down are sized for 384 columns,
# so the model chosen here must produce vectors of that length.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Parquet shards of the OpenAssistant/oasst1 dataset; only 'train' is read here.
splits = {
    'train': 'data/train-00000-of-00001-b42a775f407cee45.parquet',
    'validation': 'data/validation-00000-of-00001-134b8fd0c89408b6.parquet',
}
df = pd.read_parquet(f"hf://datasets/OpenAssistant/oasst1/{splits['train']}")

# Very few (<1%) observations have missing labels, which is not interesting:
# discard them (missingness is meant to be simulated instead) and renumber the
# rows 0..n-1 so that label-based and positional indexing coincide.
df_cleaned = df.dropna(subset=['labels']).reset_index(drop=True)


# First dataset ("cluster"): every message is embedded on its own text only.
embed_text = Settings.embed_model.get_text_embedding
embeddings_cluster = np.zeros((len(df_cleaned), 384))  # one 384-wide row per message
for row, message_text in enumerate(df_cleaned['text']):
    embeddings_cluster[row, :] = embed_text(message_text)

# Compress the 384 embedding coordinates to 64 principal components.
# `pca` stays at module level because it is fitted again later in the script.
pca = PCA(n_components=64)
reduced_embeddings_cluster = pca.fit_transform(embeddings_cluster)

# Outcome matrix: one column per annotated label, in this fixed order.
# A cell keeps the sentinel -1 when the message carries no such label.
label_names = ('humor', 'toxicity', 'quality', 'creativity')
outcomes = np.full((len(df_cleaned), len(label_names)), -1.0)
for row, labels in enumerate(df_cleaned['labels']):
    # Each `labels` entry is indexed by 'name' and 'value', which are read as
    # parallel sequences (label name -> label score).
    names = np.asarray(labels['name'], dtype=object)
    values = np.asarray(labels['value'], dtype=float)
    for col, label in enumerate(label_names):
        matches = values[names == label]
        if matches.size > 0:
            # Store a true scalar: assigning a length-1 array to a single cell
            # relies on a deprecated NumPy conversion, and would raise outright
            # if a label name ever appeared more than once.
            outcomes[row, col] = matches[0]

# Keep only the messages that have all four labels (no -1 sentinel left).
ind = np.all(outcomes >= 0, axis=1)
outcomes = outcomes[ind]
temp = df_cleaned[ind]

# Per-message metadata of the retained rows, shaped as (n, 1) columns so they
# can be stacked next to the embedding matrix.
lang = np.asarray(temp['lang']).reshape(-1, 1)
role = np.asarray(temp['role']).reshape(-1, 1)
cid = np.asarray(temp['message_tree_id']).reshape(-1, 1)

# Column layout: W1..W64 are the PCA components, W65 is the role, X is the
# language, followed by the four outcomes and the message-tree id (cid).
colnames = [f'W{k}' for k in range(1, 65)] + [
    'W65', 'X', 'humor', 'toxicity', 'quality', 'creativity', 'cid',
]
cluster_matrix = np.hstack(
    (reduced_embeddings_cluster[ind], role, lang, outcomes, cid)
)
data_cluster = pd.DataFrame(cluster_matrix, columns=colnames)
data_cluster.to_csv("data_cluster.csv")


# Second dataset ("sequential"): every message is embedded together with the
# text of all its ancestors, concatenated from the oldest ancestor down to the
# message itself (no separator is inserted between the texts).

texts = df_cleaned['text'].tolist()
parent_ids = df_cleaned['parent_id'].tolist()

# message_id -> row position, so each parent is found in O(1) instead of by a
# backward linear scan. Assumes message ids are unique -- TODO confirm.
position_of = {
    message_id: pos
    for pos, message_id in enumerate(df_cleaned['message_id'].tolist())
    if pd.notna(message_id)
}

embeddings_sequential = np.zeros((len(df_cleaned), 384))
for i, own_text in enumerate(texts):
    # Walk up the reply chain, collecting texts from the message to its root.
    thread = [own_text]
    parent_pos = position_of.get(parent_ids[i])
    while parent_pos is not None:
        thread.append(texts[parent_pos])
        # A missing parent (root message, or a parent row that is no longer in
        # df_cleaned) yields None and ends the chain. The previous backward
        # scan ran past the start of the frame and raised IndexError here.
        parent_pos = position_of.get(parent_ids[parent_pos])
    cumu_text = "".join(reversed(thread))
    embeddings_sequential[i, :] = Settings.embed_model.get_text_embedding(cumu_text)


# Reduce the dimensionality from 384 to 64 using PCA. This refits the existing
# `pca` object on the sequential embeddings, replacing its previous fit.
reduced_embeddings_sequential = pca.fit_transform(embeddings_sequential)

# Column layout: S1..S64 are the PCA components, S65 is the role, X is the
# language, followed by the four outcomes and the message-tree id (cid).
colnames = [f'S{i+1}' for i in range(64)]
colnames.extend(['S65','X', 'humor','toxicity','quality','creativity','cid'])
data_sequential = pd.DataFrame(np.hstack((reduced_embeddings_sequential[ind], role, lang, outcomes, cid)), columns=colnames)
data_sequential.to_csv("data_sequential.csv")


