import sys
from memory_profiler import profile
import os, psutil

import spacy
import configparser
from scrc.preprocessors.abstract_preprocessor import AbstractPreprocessor
from root import ROOT_DIR
from scrc.utils.log_utils import get_logger

# import scrc.utils.monkey_patch  # prevent memory leak with pandas

# IMPORTANT: make sure you download these models first, e.g. with: python -m spacy download de_core_news_lg
#  (likewise for fr_core_news_lg and it_core_news_lg)
import de_core_news_lg, fr_core_news_lg, it_core_news_lg
from scrc.utils.main_utils import get_config

from scrc.utils.slack_util import post_message_to_slack


# TODO find out how to deal with deadlocks
#  A: prevent the deadlock
#  B: restart the program when a deadlock is detected
#  C: print a thread dump for diagnosis


# TODO XYZ: adapt this component so that it only computes the number of tokens once the new DB is available
class NlpPipelineRunner(AbstractPreprocessor):
    """
    Runs the spacy pipeline over the texts of every language, one spider at a time.

    For each language the matching spacy model and a BERT tokenizer are loaded once and
    passed to ``run_nlp_pipe`` (provided by the base class) together with a per-spider
    filter. Before processing, the columns ``num_tokens_spacy`` and ``num_tokens_bert``
    are added via ``add_column``. Doing the heavy computation in advance means that later
    analyses do not have to repeat it.

    NOTE(review): where exactly the results are stored is decided by base-class helpers
    (``get_engine``, ``add_column``, ``run_nlp_pipe``) that are not visible here.

    Here is a very good resource to reduce memory consumption: https://pythonspeed.com/memory/
    """

    def __init__(self, config: dict):
        """
        Set up the logger, the language-to-model mapping and the list of disabled pipes.

        :param config: project configuration, forwarded unchanged to the base class
        """
        super().__init__(config)
        self.logger = get_logger(__name__)

        # spacy model package to load for each supported language code
        self.models = {
            'de': 'de_core_news_lg',
            'fr': 'fr_core_news_lg',
            'it': 'it_core_news_lg'
        }
        # tag, pos and lemma are enough for now
        self.disable_pipes = ['senter', 'ner', 'attribute_ruler', 'textcat']
        # both are (re)assigned by load_language_models() for the language being processed
        self.active_spacy_model = None
        self.active_bert_tokenizer = None

    @staticmethod
    def load_spacy_model(model_name, disable_pipes):
        """
        Load a spacy model by name with the given pipeline components disabled.

        :param model_name:    name of the installed spacy model package
        :param disable_pipes: names of the pipeline components to disable
        :return:              the loaded spacy language object
        """
        return spacy.load(model_name, disable=disable_pipes)

    def run_pipeline(self) -> None:
        """
        Process all remaining spiders of every language in ``self.languages``.

        Spiders that were already handled are tracked in a per-language progress file,
        so an interrupted run continues where it stopped.
        """
        self.logger.info("Started running spacy pipeline on the texts")

        for lang in self.languages:
            self.logger.info(f"Started processing language {lang}")
            lang_dir = self.create_dir(self.spacy_subdir, lang)  # output dir

            processed_file_path = self.progress_dir / f"{lang}_spiders_spacied.txt"
            spider_list, message = self.compute_remaining_spiders(processed_file_path)
            self.logger.info(message)

            # loading the models is expensive, so only do it when there is work left
            if spider_list:
                self.load_language_models(lang, lang_dir)

            engine = self.get_engine(self.db_scrc)
            # add new columns for num_tokens
            self.add_column(engine, lang, col_name='num_tokens_spacy', data_type='bigint')
            self.add_column(engine, lang, col_name='num_tokens_bert', data_type='bigint')

            for spider in spider_list:
                # Notes from the original author:
                # according to the docs you should aim for a partition size of 100MB
                # 1 court decision takes approximately between 10KB and 100KB of RAM when loaded into memory
                # The spacy doc takes about 25x the size of a court decision
                self.run_nlp_pipeline(engine, spider, lang, lang_dir)
                # record progress only after the spider went through without an exception
                self.mark_as_processed(processed_file_path, spider)

            self.logger.info(f"Finished processing language {lang}")

        self.logger.info("Finished running spacy pipeline on the texts")

    def load_language_models(self, lang, lang_dir) -> None:
        """
        Load the spacy model and the BERT tokenizer for ``lang`` into the ``active_*`` attributes.

        :param lang:     language code, must be a key of ``self.models``
        :param lang_dir: directory passed to ``load_vocab`` to obtain the vocab for the model
        """
        self.logger.info("Loading spacy model")
        self.active_spacy_model = self.load_spacy_model(self.models[lang], self.disable_pipes)
        # increase max length for long texts: Can lead to memory allocation errors for parser and ner
        self.active_spacy_model.max_length = 3000000
        self.active_spacy_model.vocab = self.load_vocab(lang_dir)

        # calculate both the num_tokens for regular words and subwords;
        # only the BERT tokenizer is kept, the first returned tokenizer is not needed here
        _, self.active_bert_tokenizer = self.get_tokenizers(lang)

    @profile
    def run_nlp_pipeline(self, engine, spider, lang, lang_dir) -> None:
        """
        Run the NLP pipe for all rows of one spider and report the memory usage afterwards.

        The memory usage of the current process is logged and additionally posted to slack.
        Posting to slack is best effort: a failure is logged but does not abort processing.

        :param engine:   engine obtained from ``get_engine``
        :param spider:   name of the spider whose rows are processed
        :param lang:     language code of the texts
        :param lang_dir: language specific output directory
        """
        self.logger.info(f"Processing spider {spider}")

        self.run_nlp_pipe(
            engine,
            lang,
            lang_dir,
            f"spider='{spider}'",
            self.active_spacy_model,
            self.active_bert_tokenizer,
            self.logger,
        )

        # resident set size of this process, converted from bytes to GB
        memory_usage = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 3
        message = f"Your running process is currently using {memory_usage:.3f} GB of memory"
        self.logger.info(message)
        try:
            post_message_to_slack(message)
        except Exception:
            # Catch Exception rather than everything, so KeyboardInterrupt/SystemExit still propagate.
            # exc_info=True attaches the traceback; passing the exception as a positional
            # argument without a placeholder would make the logging call itself fail.
            self.logger.error("Could not send message to slack", exc_info=True)


if __name__ == '__main__':
    # Script entry point: build the runner from the project configuration
    # and process every language.
    NlpPipelineRunner(get_config()).run_pipeline()
