import numpy as np
import spacy
import re
from spacy.matcher import Matcher
import benepar


def add_inc_and_exc_matchers(nlp):
    """
    Build the inclusion and exclusion matchers used to recognise concepts.

    As a side effect the benepar constituency parser (model "benepar_en3")
    is added to the `nlp` pipeline.

    Returns a tuple (incmatcher, excmatcher):
      * incmatcher matches a noun/pronoun followed by a verb (with an
        optional auxiliary and an optional particle in between), or a
        noun/pronoun followed by "'s"/"was"/"were"/"is"/"are" and at least
        one further token
      * excmatcher matches any token tagged SCONJ
    """
    action_pattern = [
        {"POS": {"IN": ["NOUN", "PRON"]}},
        {"POS": "AUX", "OP": "?"},
        {"POS": "PART", "OP": "?"},
        {"POS": "VERB"},
    ]
    state_pattern = [
        {"POS": {"IN": ["NOUN", "PRON"]}},
        {"TEXT": {"IN": ["'s", "was", "were", "is", "are"]}},
        {"OP": "+"},
    ]
    compound_pattern = [{"POS": "SCONJ"}]

    incmatcher = Matcher(nlp.vocab)
    excmatcher = Matcher(nlp.vocab)

    # spaCy 2 and later versions differ in the Matcher.add and add_pipe signatures
    legacy_api = spacy.__version__.startswith('2')

    registrations = (
        (incmatcher, "action/event concept 1", action_pattern),
        (incmatcher, "state concept 1", state_pattern),
        (excmatcher, "compound concept", compound_pattern),
    )
    for matcher, key, pattern in registrations:
        if legacy_api:
            matcher.add(key, None, pattern)
        else:
            matcher.add(key, [pattern])

    if legacy_api:
        nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
    else:
        nlp.add_pipe("benepar", config={"model": "benepar_en3"})

    return incmatcher, excmatcher

# based on the inclusion and exclusion matchers we can test whether a span is a concept
def is_concept(span, incmatcher, exclmatcher):
    """
    Decide whether `span` counts as a concept.

    A span is a concept when the inclusion matcher reports at least one
    match in it and the exclusion matcher reports none. Both matchers are
    always run.
    """
    inclusion_hits = list(incmatcher(span))
    exclusion_hits = list(exclmatcher(span))
    return bool(inclusion_hits) and not exclusion_hits

def iterate_concepts(doc, incmatcher, exclmatcher, maxlen=np.inf):
    """
    Yield the concepts found in each sentence of a multi-sentence document.

    For every sentence in `doc.sents` the concepts found inside it by
    `iterate_concepts_in_span` are yielded first. Only when that produced
    nothing is the sentence itself considered: if `len(sentence) < maxlen`
    and it passes `is_concept`, the sentence minus its last token is yielded.
    """
    for sentence in doc.sents:
        found_nested = False
        for nested in iterate_concepts_in_span(sentence, incmatcher, exclmatcher, maxlen):
            yield nested
            found_nested = True

        # the whole sentence is only a fallback when nothing inside it matched
        if found_nested:
            continue
        if not (len(sentence) < maxlen):
            continue
        if is_concept(sentence, incmatcher, exclmatcher):
            # NOTE(review): the final token is dropped, presumably the
            # sentence-ending punctuation; confirm for unpunctuated input
            yield sentence[:-1]

def iterate_concepts_in_span(span, incmatcher, exclmatcher, maxlen=np.inf):
    """
    Yield the innermost concepts among the constituents of `span`.

    Recursively walks `span._.children`. For each child, concepts found
    inside it are yielded first; the child itself is yielded only when
    nothing inside it matched, `len(child) < maxlen`, and it passes
    `is_concept`. `span` itself is never yielded.
    """
    for child in span._.children:
        found_nested = False
        for concept in iterate_concepts_in_span(child, incmatcher, exclmatcher, maxlen):
            yield concept
            found_nested = True
        # Only yield child if its sub-constituents do not match as concepts.
        # The length bound is checked on the child being yielded (not on its
        # parent `span`), consistent with how iterate_concepts bounds the
        # sentence it yields.
        if not found_nested and len(child) < maxlen and is_concept(child, incmatcher, exclmatcher):
            yield child

def extract_concepts(doc, concept_dict, counter, incmatcher, excmatcher):
    """
    Map every concept found in `doc` to an integer id.

    The text of each concept is lower-cased and stripped of every character
    that is not a word character, an apostrophe or whitespace; the result is
    the lookup key. Keys already present in `concept_dict` reuse their id.
    New keys are stored in `concept_dict` (mutated in place) under the
    current `counter` value, which is then incremented.

    Returns a tuple (ids of the concepts in order of occurrence, updated counter).
    """
    concept_ids = []
    for concept in iterate_concepts(doc, incmatcher, excmatcher):
        key = re.sub(r"[^\w'\s]", '', concept.text.lower())
        if key not in concept_dict:
            # first occurrence: register it under the next free id
            concept_dict[key] = counter
            counter += 1
        concept_ids.append(concept_dict[key])
    return concept_ids, counter

def iterate_candidate_concepts(doc, incmatcher, exclmatcher, maxlen=np.inf):
    """
    Yield every candidate concept in a multi-sentence document.

    For each sentence in `doc.sents` the candidates produced by
    `iterate_candidate_concepts_in_span` are yielded, and then, whether or
    not any were found, the sentence minus its last token is yielded too
    if `len(sentence) < maxlen` and it passes `is_concept`.
    """
    for sentence in doc.sents:
        yield from iterate_candidate_concepts_in_span(sentence, incmatcher, exclmatcher, maxlen)

        if not (len(sentence) < maxlen):
            continue
        if is_concept(sentence, incmatcher, exclmatcher):
            # NOTE(review): the final token is dropped, presumably the
            # sentence-ending punctuation; confirm for unpunctuated input
            yield sentence[:-1]

def iterate_candidate_concepts_in_span(span, incmatcher, exclmatcher, maxlen=np.inf):
    """
    Yield every candidate concept among the constituents of `span`.

    Recursively walks `span._.children`. For each child, candidates found
    inside it are yielded first; the child itself is then yielded as well
    (regardless of whether anything inside it matched) when
    `len(child) < maxlen` and it passes `is_concept`. `span` itself is
    never yielded.
    """
    for child in span._.children:
        yield from iterate_candidate_concepts_in_span(child, incmatcher, exclmatcher, maxlen)
        # The length bound is checked on the child being yielded (not on its
        # parent `span`), consistent with how iterate_candidate_concepts
        # bounds the sentence it yields.
        if len(child) < maxlen and is_concept(child, incmatcher, exclmatcher):
            yield child



