# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from scipy.special import expit

# Load datasets (each CSV is read exactly once)
pitt_labeled = pd.read_csv('pitt_dat_cleaned_with_transcripts.csv')
hopkins_unlabeled = pd.read_csv('hopkins_synthetic_sample_n1000.csv')

# Columns removed below:
#   Hopkins: File, Date.of.Audio..taped.Language.Sample, X1Date.of.Testing
#   Pitt:    id, idate, dx1, dx2, dx3
# errors='ignore' makes each drop a no-op for any listed column that is
# missing (e.g. because its name differs slightly in the CSV).

# Drop specific columns from Hopkins data
hopkins_unlabeled_cleaned = hopkins_unlabeled.drop(columns=[
    'File',
    'Date.of.Audio..taped.Language.Sample',
    'X1Date.of.Testing',
], errors='ignore')

# Drop specific columns from Pitt data
pitt_labeled_cleaned = pitt_labeled.drop(columns=[
    'id',
    'idate',
    'dx1',
    'dx2',
    'dx3',
], errors='ignore')

"""### NLP features"""

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
from nltk import pos_tag

import shutil
import nltk

# Delete any existing NLTK data directories before downloading
# (presumably to force a clean re-download -- confirm this is still wanted).
for nltk_data_dir in ('/root/nltk_data', '/usr/local/share/nltk_data'):
    shutil.rmtree(nltk_data_dir, ignore_errors=True)

# Resources used by the feature extraction below.
# The POS tagger model is published under two names: older NLTK releases
# load 'averaged_perceptron_tagger', while NLTK >= 3.9 loads
# 'averaged_perceptron_tagger_eng'. Fetch both so pos_tag() works on either.
for nltk_resource in (
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

# Row labels of the cleaned Pitt data whose 'Transcripts' value is missing
nan_indices = pitt_labeled_cleaned.index[
    pitt_labeled_cleaned['Transcripts'].isna().to_numpy()
]
# Report the affected row labels as a plain list
print(
    "Indices of individuals with NaN transcripts:",
    nan_indices.tolist(),
    sep="\n",
)

"""## Add NLP features to the dataframe"""

# Define function to extract NLP features from a single transcript
def extract_nlp_features(chat_transcript):
    """Extract lexical and part-of-speech features from one transcript.

    Only the text that follows a ``*PAR:<TAB>`` marker on a line is analysed;
    each such match is treated as one utterance. An utterance is lower-cased,
    then stripped of ``[...]`` and ``<...>`` spans, of ``+<`` and ``&-``
    markers, and of everything from the first ``%``, backslash or ``@`` to the
    end of the line. Finally every character other than ``a-z`` and
    whitespace is removed and the remainder is split on whitespace.
    Utterances that end up with no words are ignored.

    Args:
        chat_transcript: Transcript text, or a missing value (``NaN``/``None``).

    Returns:
        A ``pd.Series`` with these entries, in this order:

        * ``total_words`` / ``unique_words`` -- token and distinct-token counts.
        * ``type_token_ratio`` -- ``unique_words / total_words``.
        * ``mean_utterance_length`` -- mean number of words per kept utterance.
        * ``noun_ratio`` / ``pronoun_ratio`` / ``verb_ratio`` -- share of
          tokens whose NLTK tag starts with ``NN`` / ``PRP`` / ``VB``.
        * ``filler_word_ratio`` -- share of tokens in {uh, um, ah, er}.
        * ``most_common_words`` -- the 10 most frequent ``(word, count)`` pairs.

        Every entry is ``NaN`` when ``chat_transcript`` is missing. Ratios,
        the mean and ``most_common_words`` are ``NaN`` when no words were
        found. The three part-of-speech ratios are ``NaN`` (and a
        ``RuntimeWarning`` is issued) when tagging raises ``LookupError``,
        e.g. because the NLTK tagger resource is not installed, so that a
        tagging failure is never reported as a genuine ratio of zero.
    """
    import warnings

    feature_names = (
        'total_words',
        'unique_words',
        'type_token_ratio',
        'mean_utterance_length',
        'noun_ratio',
        'pronoun_ratio',
        'verb_ratio',
        'filler_word_ratio',
        'most_common_words',
    )
    if pd.isna(chat_transcript):
        return pd.Series(dict.fromkeys(feature_names, np.nan))

    fillers = {'uh', 'um', 'ah', 'er'}
    all_words = []
    utterance_lengths = []
    noun_count = pronoun_count = verb_count = filler_count = 0
    pos_available = True

    for utterance in re.findall(r'\*PAR:\t(.+)', chat_transcript):
        text = re.sub(r'\[.*?\]|<.*?>|\+<|&-|[%\\@].*', '', utterance.lower())
        # After this substitution only a-z and whitespace remain, so every
        # token produced by split() is purely alphabetic.
        words = re.sub(r'[^a-z\s]', '', text).split()
        if not words:
            continue

        all_words.extend(words)
        utterance_lengths.append(len(words))
        filler_count += sum(1 for word in words if word in fillers)

        if not pos_available:
            continue
        try:
            # Tag one utterance at a time so the tagger sees sentence context.
            tagged = pos_tag(words)
        except LookupError as err:
            # Partial counts would bias the ratios, so give up on POS
            # features for this transcript and say so.
            pos_available = False
            warnings.warn(
                "POS tagging failed (%s); noun/pronoun/verb ratios are "
                "reported as NaN" % type(err).__name__,
                RuntimeWarning,
                stacklevel=2,
            )
            continue
        for _, tag in tagged:
            if tag.startswith('NN'):
                noun_count += 1
            elif tag.startswith('PRP'):
                pronoun_count += 1
            elif tag.startswith('VB'):
                verb_count += 1

    total = len(all_words)

    def per_word(count, available=True):
        # NaN marks "could not be computed": no words, or no usable tags.
        return count / total if total and available else np.nan

    return pd.Series({
        'total_words': total,
        'unique_words': len(set(all_words)),
        'type_token_ratio': per_word(len(set(all_words))),
        'mean_utterance_length': np.mean(utterance_lengths) if utterance_lengths else np.nan,
        'noun_ratio': per_word(noun_count, pos_available),
        'pronoun_ratio': per_word(pronoun_count, pos_available),
        'verb_ratio': per_word(verb_count, pos_available),
        'filler_word_ratio': per_word(filler_count),
        'most_common_words': Counter(all_words).most_common(10) if total else np.nan,
    })

# Apply the feature extraction to every Pitt transcript; each transcript
# yields one row of features, so the result is a DataFrame aligned on index.
nlp_features_df = pitt_labeled_cleaned['Transcripts'].apply(extract_nlp_features)

# Concatenate NLP features to the cleaned Pitt DataFrame (column-wise)
pitt_labeled_with_nlp = pd.concat([pitt_labeled_cleaned, nlp_features_df], axis=1)


## Do the same for the Hopkins data.
# Use the cleaned frame, exactly as for Pitt, so the columns dropped earlier
# (File and the two date columns) do not reappear in the output.
nlp_features_df_2 = hopkins_unlabeled_cleaned['Transcripts'].apply(extract_nlp_features)
# Concatenate NLP features to the cleaned Hopkins DataFrame (column-wise)
hopkins_unlabeled_with_nlp = pd.concat([hopkins_unlabeled_cleaned, nlp_features_df_2], axis=1)


# Show the first rows of both resulting DataFrames
print(pitt_labeled_with_nlp.head())
print(hopkins_unlabeled_with_nlp.head())

"""## Transform type token ratio and filled_word_ratio"""

# Map a numeric type-token ratio onto a lexical-diversity label
def categorize_ttr(ttr):
    """Return the lexical-diversity band for a type-token ratio.

    Bands (lower bound inclusive):
        ttr >= 0.6        -> 'High lexical diversity'
        0.5 <= ttr < 0.6  -> 'Moderate lexical diversity'
        0.4 <= ttr < 0.5  -> 'Low-moderate lexical diversity'
        ttr < 0.4         -> 'Low lexical diversity'

    A missing value is passed through as ``np.nan``.
    """
    if pd.isna(ttr):
        return np.nan

    # Ordered from the highest threshold down; the first one reached wins.
    bands = (
        (0.6, 'High lexical diversity'),
        (0.5, 'Moderate lexical diversity'),
        (0.4, 'Low-moderate lexical diversity'),
    )
    for lower_bound, label in bands:
        if ttr >= lower_bound:
            return label
    return 'Low lexical diversity'

# Add a 'ttr_category' column to both DataFrames (Pitt first, then Hopkins)
for _frame in (pitt_labeled_with_nlp, hopkins_unlabeled_with_nlp):
    _frame['ttr_category'] = _frame['type_token_ratio'].apply(categorize_ttr)

# Show the raw ratio next to its category for the first rows of each frame
for _frame in (pitt_labeled_with_nlp, hopkins_unlabeled_with_nlp):
    print(_frame[['type_token_ratio', 'ttr_category']].head())

# Map a numeric filler-word ratio onto a severity label
def categorize_filler_ratio(ratio):
    """Return the category for a filler-word ratio.

    Categories (lower bound inclusive):
        ratio < 0.03          -> 'Normal'
        0.03 <= ratio < 0.05  -> 'Mildly elevated'
        ratio >= 0.05         -> 'Elevated'

    A missing value is passed through as ``np.nan``.
    """
    if pd.isna(ratio):
        return np.nan
    if ratio < 0.03:
        return 'Normal'
    return 'Mildly elevated' if ratio < 0.05 else 'Elevated'

# Add a 'filler_ratio_category' column to both DataFrames (Pitt first, then Hopkins)
for _frame in (pitt_labeled_with_nlp, hopkins_unlabeled_with_nlp):
    _frame['filler_ratio_category'] = _frame['filler_word_ratio'].apply(categorize_filler_ratio)

# Show the raw ratio next to its category for the first rows of each frame
for _frame in (pitt_labeled_with_nlp, hopkins_unlabeled_with_nlp):
    print(_frame[['filler_word_ratio', 'filler_ratio_category']].head())

# Write both enriched DataFrames to CSV without the index column
for _frame, _csv_path in (
    (pitt_labeled_with_nlp, 'pitt_labeled_with_nlp.csv'),
    (hopkins_unlabeled_with_nlp, 'hopkins_unlabeled_with_nlp.csv'),
):
    _frame.to_csv(_csv_path, index=False)
