import os
import json
import re
from pathlib import Path
from typing import List
from presidio_analyzer import AnalyzerEngine
from conversation import (
    ConversationSet,
    DataConversation,
    Message,
    PiiResult,
    PiiResults,
)

# Entity types requested from the analyzer for every transcript.
relevant_entities = [
    "EMAIL_ADDRESS",
    "PHONE_NUMBER",
    "LOCATION",
    "NRP",
]

# Single module-level Presidio engine, reused for all analyze() calls.
analyzer = AnalyzerEngine()

if __name__ == "__main__":
    # Script entry point: build one DataConversation per subject, attach every
    # transcription as a message with its detected PII spans, and export the
    # resulting ConversationSet as JSON to data/meta_ccv/ccv_en.json.
    subject_path = Path("data/casual_conversations/CasualConversations.json")
    transcriptions_path = Path(
        "data/casual_conversations/CasualConversations_transcriptions.json"
    )

    # Each input is opened in its own context manager so that a failure to
    # open the second file cannot leak the first handle, and both handles are
    # closed as soon as their JSON has been parsed.
    with subject_path.open("r", encoding="UTF-8") as subject_file_in:
        subject_json = json.load(subject_file_in)
    with transcriptions_path.open("r", encoding="UTF-8") as transcriptions_file_in:
        transcriptions_json = json.load(transcriptions_file_in)

    # One conversation per subject, in the iteration order of the subject file.
    conversation_list = []
    for key, val in subject_json.items():
        new_conv = DataConversation(
            id="ccv_" + str(key),
            origin="casual_conversations",
            language="en",
            messages=[],
        )
        new_conv.labels = val["label"]
        conversation_list.append(new_conv)

    # Matches the shortest span that starts with "(", "[" or "<" and ends with
    # ")", "]" or ">" (opener and closer need not be the same kind). Written as
    # a raw string so the backslashes reach the regex engine unchanged, and
    # compiled once instead of on every loop iteration.
    bracketed_span = re.compile(r"[\(\[\<].*?[\)\]\>]")

    for i, transcript in enumerate(transcriptions_json):
        print(i)  # progress indicator
        idx = transcript["subject_id"]
        cleaned_text = bracketed_span.sub("", transcript["transcription"])

        # pii_results=0.0 is only a placeholder; the actual results are
        # attached through set_pii_result() below.
        curr_msg = Message(sender="human", text=cleaned_text, pii_results=0.0)

        an_res = analyzer.analyze(
            text=cleaned_text, entities=relevant_entities, language="en"
        )

        # Convert each analyzer hit into a PiiResult, keeping the matched
        # substring alongside its offsets.
        pii_results = [
            PiiResult(
                entity=r.entity_type,
                start=r.start,
                end=r.end,
                orig=cleaned_text[r.start : r.end],
                new_value="<ANONYMIZED>",
            )
            for r in an_res
        ]
        curr_msg.set_pii_result(PiiResults(pii_results))

        # NOTE(review): this positional lookup assumes subject ids are the
        # integers 1..N in the same order as the subject file; an id of 0 or
        # below would silently wrap to the end of the list -- confirm against
        # the dataset.
        conversation_list[int(idx) - 1].messages.append(curr_msg)

    conv_set = ConversationSet(conversation_list)
    conv_json_export = conv_set.toJSON()

    # exist_ok avoids the check-then-create race; the explicit encoding keeps
    # the output consistent with how the inputs were read.
    output_dir = Path("data/meta_ccv")
    output_dir.mkdir(parents=True, exist_ok=True)
    with (output_dir / "ccv_en.json").open("w", encoding="UTF-8") as f:
        f.write(conv_json_export)
