import json
import os
from typing import List

from langdetect import detect, detect_langs
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

from conversation import Message, DataConversation, ConversationSet, PiiResults, PiiResult


# ShareGPT dump shards that the script section of this file reads, in order.
# "data/test_data.json" looks like a smaller alternative input — TODO confirm
# before listing it here.
paths = [
    "data/sharegpt/sg_90k_part1.json",
    "data/sharegpt/sg_90k_part2.json",
]

# Entity types handed to the Presidio analyzer as its `entities` filter; only
# findings of these types are reported.
relevant_entities = [
    "EMAIL_ADDRESS",
    "PHONE_NUMBER",
    "LOCATION",
    "NRP",
]

def is_lang(text, lang):
    """Return True if ``lang`` is among the languages detected for ``text``.

    Every candidate returned by ``detect_langs`` counts, not only the most
    probable one.

    Args:
        text: The text to classify.
        lang: Language code to look for, compared against each candidate's
            ``lang`` attribute (e.g. ``"en"``).

    Returns:
        True when ``lang`` is one of the detected candidates, or when
        detection itself fails; False otherwise.
    """
    try:
        candidates = detect_langs(text)
    except Exception:
        # Deliberate best-effort: text that cannot be classified is given the
        # benefit of the doubt instead of being dropped. Catching Exception
        # rather than using a bare ``except`` lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be interrupted.
        return True
    return any(candidate.lang == lang for candidate in candidates)


        
def _first_message_is_english(first_text):
    """Pre-filter a conversation by its opening message.

    The whole message must be detected as English. Messages longer than 500
    characters must additionally be detected as English in both their last
    and their first 10% of characters.
    """
    if not is_lang(first_text, "en"):
        return False
    if len(first_text) <= 500:
        return True
    edge = int(0.1 * len(first_text))
    return is_lang(first_text[-edge:], "en") and is_lang(first_text[:edge], "en")


def _detect_pii(analyzer, text):
    """Run the analyzer on ``text`` and wrap its findings in a ``PiiResults``.

    Only the entity types in ``relevant_entities`` are requested. Each finding
    records the matched substring as ``orig`` and a fixed placeholder as
    ``new_value``; the text itself is not modified.
    """
    findings = analyzer.analyze(text=text, entities=relevant_entities, language="en")
    return PiiResults(
        [
            PiiResult(
                entity=finding.entity_type,
                start=finding.start,
                end=finding.end,
                orig=text[finding.start:finding.end],
                new_value="<ANONYMIZED>",
            )
            for finding in findings
        ]
    )


def _build_messages(analyzer, conv):
    """Convert raw conversation entries into ``Message`` objects.

    Messages sent by ``"human"`` get a PII result attached; all other senders
    are stored without one.

    Returns:
        The list of messages, or None as soon as any entry is not detected as
        English (the whole conversation is then to be discarded).
    """
    messages: List[Message] = []
    for entry in conv:
        text = entry["value"]
        if not is_lang(text, "en"):
            return None

        msg = Message(sender=entry["from"], text=text)
        if entry["from"] == "human":
            msg.set_pii_result(_detect_pii(analyzer, text))
        messages.append(msg)
    return messages


def _load_english_conversations(analyzer, path):
    """Read one dump file and return its all-English conversations.

    Rows with an empty ``conversations`` list are skipped, as are rows that
    fail the opening-message pre-filter or contain any non-English message.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        rows = json.load(f)

    kept = []
    for row in rows:
        conv = row["conversations"]
        if not conv:
            # Nothing to inspect or keep; indexing conv[0] would raise.
            continue
        if not _first_message_is_english(conv[0]["value"]):
            continue

        messages = _build_messages(analyzer, conv)
        if messages is None:
            continue
        kept.append(
            DataConversation(id=row["id"], origin="sharegpt", language="en", messages=messages)
        )
    return kept


def main():
    """Filter the ShareGPT dumps to English and write them out with PII spans.

    Reads every file in ``paths``, keeps conversations whose messages are all
    detected as English, annotates human messages with the analyzer's
    findings, and writes the serialized set to
    ``data/sharegpt/sg_90k_en.json``.
    """
    analyzer = AnalyzerEngine()

    # Smoke test: print what the analyzer reports for a hand-written sample.
    text_to_anonymize = " I am 50 years old - His name is Mr. Jones and his phone number is 212-555-5555 +41782390333 - my mail is jones@rutgers.ch and I am of mexican descent and muslim and christian and from spain. I am from Puerto Rico. I like to buy apples and oranges."
    analyzer_results = analyzer.analyze(text=text_to_anonymize, entities=relevant_entities, language='en')
    print(analyzer_results)

    relevant_convs = []
    for path in paths:
        relevant_convs.extend(_load_english_conversations(analyzer, path))

    # Write the filtered, PII-annotated conversations to a JSON file. The
    # message texts are stored as-is; detected spans are only recorded.
    conv_set = ConversationSet(relevant_convs)
    serialized = conv_set.toJSON()

    os.makedirs("data/sharegpt", exist_ok=True)
    with open("data/sharegpt/sg_90k_en.json", "w", encoding="utf-8") as f:
        f.write(serialized)


if __name__ == "__main__":
    main()


    