from conversation import DataConversation, ConversationSet
from typing import List
import argparse


def read_data(path: str) -> ConversationSet:
    """Load a conversation set from a JSON file.

    Args:
        path: Location of the JSON file, passed unchanged to
            ``ConversationSet.fromJSON``.

    Returns:
        Whatever ``ConversationSet.fromJSON`` produces for ``path``.
    """
    return ConversationSet.fromJSON(path)


def get_multi_pii_convs(conv_set: ConversationSet) -> List[DataConversation]:
    """Return conversations that report more than one unique PII entry.

    Only conversations whose ``contains_pii()`` is truthy are considered.
    They are ordered by ``get_pii_score()`` from highest to lowest (ties keep
    their order from ``conv_set.conversations``), and then narrowed down to
    those whose ``get_unique_pii()`` has a length greater than one.

    Args:
        conv_set: Object exposing the conversations to inspect through its
            ``conversations`` attribute.

    Returns:
        A new list with the selected conversations, highest score first.
    """
    ranked = sorted(
        (candidate for candidate in conv_set.conversations if candidate.contains_pii()),
        key=lambda candidate: candidate.get_pii_score(),
        reverse=True,
    )

    return [candidate for candidate in ranked if len(candidate.get_unique_pii()) > 1]


def get_cv_convs(conv_set: ConversationSet) -> List[DataConversation]:
    """Return conversations with an e-mail address where a human mentions a CV.

    A conversation is selected when all of the following hold:

    * ``contains_pii()`` is truthy,
    * ``"EMAIL_ADDRESS"`` is in ``get_unique_pii()``,
    * at least one message with ``sender == "human"`` contains ``" cv"`` or
      ``" resume"`` (case-insensitive, leading space included).

    Each matching conversation appears exactly once in the result, no matter
    how many of its messages match.

    Args:
        conv_set: Object exposing the conversations to inspect through its
            ``conversations`` attribute.

    Returns:
        The selected conversations ordered by ``get_pii_score()`` from highest
        to lowest; ties keep their order from ``conv_set.conversations``.
    """

    def mentions_cv(msg) -> bool:
        # Only human-authored messages count; the text of other senders is
        # never inspected.
        if msg.sender != "human":
            return False
        text = msg.text.lower()
        # The leading space is intentional: the keyword must follow a space,
        # so a message that *starts* with "cv"/"resume" does not match.
        return " cv" in text or " resume" in text

    pii_convs = [conv for conv in conv_set.conversations if conv.contains_pii()]
    pii_convs.sort(key=lambda conv: conv.get_pii_score(), reverse=True)

    # any() stops at the first matching message, so a conversation with
    # several matching messages is still added only once.
    return [
        conv
        for conv in pii_convs
        if "EMAIL_ADDRESS" in conv.get_unique_pii()
        and any(mentions_cv(msg) for msg in conv.messages)
    ]


def build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for this script.

    Both positionals are declared with ``nargs="?"`` so that they may be
    omitted and their ``default`` values are used; without it argparse treats
    a positional as required and never applies ``default``.

    Returns:
        A parser producing ``input_file`` and ``output_file`` attributes.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Reads a JSON file of conversations and exports those that contain "
            "an e-mail address and in which a human mentions a CV or resume"
        )
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        type=str,
        help="Path to the input file",
        default="data/sharegpt/sg_90k_en.json",
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        type=str,
        help="Path to the output file",
        # Default is the path this script has always written to, so running
        # it without arguments keeps producing the same file.
        default="data/sharegpt/sg_email_cv_pii.json",
    )
    return parser


def main(argv=None) -> None:
    """Filter the input conversations and write the CV-related ones as JSON.

    Args:
        argv: Command-line arguments without the program name. ``None`` makes
            argparse read them from ``sys.argv``.
    """
    args = build_arg_parser().parse_args(argv)

    conv_set = read_data(args.input_file)

    # Keep conversations flagged by contains_pii(), highest get_pii_score() first.
    pii_convs = [conv for conv in conv_set.conversations if conv.contains_pii()]
    pii_convs.sort(key=lambda conv: conv.get_pii_score(), reverse=True)

    cv_convs = []
    for conv in pii_convs:
        if "EMAIL_ADDRESS" not in conv.get_unique_pii():
            continue
        for msg in conv.messages:
            if msg.sender != "human":
                continue
            text = msg.text.lower()
            if " cv" in text or " resume" in text:
                cv_convs.append(conv)
                # One match is enough; avoid adding the conversation twice.
                break

    conv_json_export = ConversationSet(cv_convs).toJSON()
    with open(args.output_file, "w") as f:
        f.write(conv_json_export)


if __name__ == "__main__":
    main()
