from collections import defaultdict
from typing import List
import json
import re
import spacy
from collections import defaultdict
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from collections import Counter
import sys
sys.path.append('/path/to/prompt_detection')
from indirect_generator import get_completion

def get_unified_template(sentences):
    """Ask the completion backend for one template covering all *sentences*.

    The sentences are rendered as a 1-based numbered list and embedded in a
    fixed instruction prompt, which is passed to ``get_completion``.

    Args:
        sentences: Iterable of sentence strings to be abstracted.

    Returns:
        The value returned by ``get_completion`` with surrounding whitespace
        stripped.
    """
    numbered_lines = [
        f"{position}. {text}" for position, text in enumerate(sentences, start=1)
    ]
    # Lines are joined with a bare newline, so only the first numbered line
    # inherits the four-space indent of the prompt literal below.
    sentence_block = "\n".join(numbered_lines)

    prompt = f"""
    You are an expert in syntax and linguistic abstraction.

    Given the following English sentences, ignore all specific names, dates, locations, and other concrete details.
    Abstract them into one single generalized English sentence structure (template) that can cover all of them,
    replacing variable elements with placeholders.

    Sentences:
    {sentence_block}

    Output only the generalized English template sentence.
    """

    return get_completion(prompt).strip()


def tokenize_with_punct(text: str) -> List[str]:
    """Split *text* into word tokens and individual punctuation characters.

    A token is either a maximal run of word characters (``\\w+``) or a single
    character that is neither a word character nor whitespace. Whitespace is
    dropped.
    """
    token_pattern = r"\w+|[^\w\s]"
    return [found.group(0) for found in re.finditer(token_pattern, text)]

def find_repeated_opening_phrases_with_original_texts(sentences: List[str], max_ngram_len: int = 5, min_sentence_count: int = 2):
    """Find opening token n-grams shared by several sentences.

    Each sentence is tokenized with ``tokenize_with_punct``. For every length
    ``n`` from 2 to ``max_ngram_len`` the first ``n`` tokens form an opening
    n-gram, and the index of the sentence is recorded under it.

    Args:
        sentences: Sentences to inspect.
        max_ngram_len: Largest opening n-gram length considered (the smallest
            is always 2).
        min_sentence_count: Minimum number of distinct sentences that must
            start with an n-gram for it to be reported as repeated.

    Returns:
        A tuple ``(repeated, ngram_to_original_text)``:

        * ``repeated`` maps each n-gram (a tuple of tokens) that opens at
          least ``min_sentence_count`` sentences to the list of indices of
          those sentences.
        * ``ngram_to_original_text`` maps every opening n-gram seen (repeated
          or not) to the matching raw substring of the first sentence in
          which it appeared, with original spacing preserved.
    """
    ngram_occurrences = defaultdict(list)
    ngram_to_original_text = {}

    for idx, sentence in enumerate(sentences):
        tokens = tokenize_with_punct(sentence)

        # Only the first `max_ngram_len` tokens can take part in an opening
        # n-gram, so only those need character offsets.
        prefix_len = min(len(tokens), max_ngram_len)
        if prefix_len < 2:
            continue

        # Locate each leading token by scanning forward from the end of the
        # previous one. str.find with a start offset avoids copying the tail
        # of the sentence for every token.
        spans = []
        cursor = 0
        for tok in tokens[:prefix_len]:
            start = sentence.find(tok, cursor)
            if start < 0:
                # Defensive: cannot happen while tokens are substrings of the
                # sentence in order, but keep the n-gram without raw text
                # rather than slicing with a bogus offset.
                spans.append(None)
                continue
            cursor = start + len(tok)
            spans.append((start, cursor))

        for n in range(2, prefix_len + 1):
            ngram = tuple(tokens[:n])
            ngram_occurrences[ngram].append(idx)

            if ngram in ngram_to_original_text:
                continue
            first_span = spans[0]
            last_span = spans[n - 1]
            if first_span is not None and last_span is not None:
                raw_text = sentence[first_span[0]:last_span[1]]
                ngram_to_original_text[ngram] = raw_text.strip()

    repeated = {
        ngram: indices for ngram, indices in ngram_occurrences.items()
        if len(set(indices)) >= min_sentence_count
    }

    return repeated, ngram_to_original_text

# Read the whole JSON document holding the input records into memory.
with open("path/to/domain", mode="r", encoding="utf-8") as f:
    non_training_data = json.loads(f.read())
print(f"Total sentences: {len(non_training_data)}")
# Placeholder; rebound for every batch by the processing loop further down.
sentences = []
def get_first_sentence(text):
    """Return the first sentence of *text*, including its terminator.

    The sentence ends at the earliest occurrence of ``.``, ``?`` or ``!``.
    Whitespace around the sentence body is stripped before the terminator is
    re-attached. If *text* contains none of these characters, the stripped
    text is returned unchanged.

    Note: the split is purely character based, so a period inside an
    abbreviation or a number also ends the sentence.
    """
    # Pick the terminator that appears first in the text, not the first one
    # in a fixed priority order; otherwise "Really? Yes." would be cut at the
    # period and returned whole.
    positions = [pos for pos in (text.find(sep) for sep in ".?!") if pos != -1]
    if not positions:
        return text.strip()
    cut = min(positions)
    return text[:cut].strip() + text[cut]
# Number of consecutive records summarised by one template, and the maximum
# number of such batches to process.
BATCH_SIZE = 10
MAX_BATCHES = 24

# Maps batch index -> template returned by get_unified_template for that batch.
# Slicing and item["rewrite"] below require non_training_data to be a list of
# mappings that each carry a "rewrite" entry.
trigger_dictionary = {}
for index in range(MAX_BATCHES):
    batch = non_training_data[BATCH_SIZE * index:BATCH_SIZE * (index + 1)]
    if not batch:
        # The data holds fewer than MAX_BATCHES * BATCH_SIZE records. Stop
        # here instead of requesting templates for empty sentence lists.
        break
    sentences = [item["rewrite"] for item in batch]
    first_sentences = [get_first_sentence(s) for s in sentences]
    template = get_unified_template(first_sentences)
    print(f"Template for index {index}: {template}")
    trigger_dictionary[index] = template

# json.dump writes the integer batch indices as string keys.
with open("result/path", "w", encoding="utf-8") as f:
    json.dump(trigger_dictionary, f, ensure_ascii=False, indent=4)