
import tokenize

from io import BytesIO

import tokenize

from io import BytesIO

from transformers import GPT2Tokenizer

import joblib

from joblib import Parallel, delayed

import contextlib

from nltk import ngrams 

from typing import List, Callable

from sacrebleu.metrics import BLEU

import re

from transformers import AutoTokenizer

from dataclasses import dataclass

from tqdm import tqdm

from parso.python.tokenize import tokenize as parso_tokenize

import numpy as np 



# Shared sacrebleu scorer; self_bleu_metric() calls its sentence_score().
bleu = BLEU(tokenize=None, effective_order=True)



# Pattern that tokenize_for_self_bleu() removes from every token.
# NOTE(review): `rr` is not defined anywhere in the visible source, so this
# line would raise NameError at import time -- the regex literal appears to
# have been lost; restore the intended pattern.
newline_pattern = re.compile(rr)



# NOTE(review): both from_pretrained() calls below receive an empty model
# name -- confirm the intended checkpoint identifiers.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('')

# Tokenizer instance wrapped by codebert_tokenizer().
_codebert_tokenzier = AutoTokenizer.from_pretrained("")



def codebert_tokenizer(code_str, remove_comments = False):
    """Tokenize ``code_str`` with the module-level ``_codebert_tokenzier``.

    Args:
        code_str: Source text to tokenize.
        remove_comments: When true, every match of the ``rr`` pattern is
            deleted from ``code_str`` before it is tokenized.

    Returns:
        The token strings produced by ``convert_ids_to_tokens`` for the ids
        stored in the encoded input.
    """
    # NOTE(review): `rr` is not defined in the visible source, and the key
    # used to index the encoded input is empty -- confirm both literals.
    text = re.sub(rr, "", code_str) if remove_comments else code_str
    token_ids = _codebert_tokenzier(text)['']
    return _codebert_tokenzier.convert_ids_to_tokens(token_ids)

    



def get_relevant_tokens_lexer(code_str, remove_comments = False):
    """Tokenize Python source with the stdlib lexer and keep the relevant tokens.

    Args:
        code_str: Python source code as text.
        remove_comments: Forwarded to ``_get_relevant_tokens``; when true,
            comment tokens are dropped as well.

    Returns:
        The list of token strings returned by ``_get_relevant_tokens``.

    Raises:
        Any error the ``tokenize`` module raises while lexing ``code_str``
        propagates to the caller (the token stream is consumed lazily).
    """
    # tokenize.tokenize() needs a readline callable that yields bytes and
    # assumes UTF-8 unless the source declares otherwise.  The previous empty
    # codec name made str.encode() raise LookupError for every input.
    source_bytes = code_str.encode('utf-8')
    token_stream = tokenize.tokenize(BytesIO(source_bytes).readline)
    return _get_relevant_tokens(token_stream, remove_comments)



def get_relevant_tokens_parso(code_str, remove_comments = False):
    """Tokenize ``code_str`` with parso's tokenizer and filter the result.

    The token stream comes from ``parso_tokenize`` with
    ``version_info=(3, 12)`` and is passed through ``_get_relevant_tokens``.

    NOTE(review): ``_get_relevant_tokens`` compares ``token.type`` against the
    stdlib ``tokenize`` constants -- confirm that parso token types compare
    equal to them, otherwise no token is filtered or rewritten.

    Args:
        code_str: Python source code as text.
        remove_comments: Forwarded to ``_get_relevant_tokens``.

    Returns:
        The list of token strings returned by ``_get_relevant_tokens``.
    """
    return _get_relevant_tokens(
        parso_tokenize(code_str, version_info=(3, 12)),
        remove_comments,
    )



def _get_relevant_tokens(tokens, remove_comments = False):
    """Reduce a token stream to the list of strings used by the metrics.

    Tokens of type ENCODING, ENDMARKER, INDENT and NL are dropped (and COMMENT
    too when ``remove_comments`` is true).  STRING and COMMENT tokens are
    split into pieces, DEDENT tokens contribute an empty string, and every
    other token contributes its ``string`` unchanged.

    NOTE(review): the split separator is an empty string, which makes
    ``str.split`` raise ValueError for any STRING/COMMENT token -- the
    original separator literal appears to be lost; confirm.

    Args:
        tokens: Iterable of objects exposing ``type`` and ``string``.
        remove_comments: Whether COMMENT tokens are discarded.

    Returns:
        A list of token strings in stream order.
    """
    skipped = {tokenize.ENCODING, tokenize.ENDMARKER, tokenize.INDENT, tokenize.NL}
    if remove_comments:
        skipped.add(tokenize.COMMENT)
    split_types = (tokenize.STRING, tokenize.COMMENT)

    kept = []
    for tok in tokens:
        kind = tok.type
        if kind in skipped:
            continue
        if kind in split_types:
            kept.extend(tok.string.split(""))
        elif kind == tokenize.DEDENT:
            kept.append("")
        else:
            kept.append(tok.string)
    return kept





    







def get_relevant_tokens_tokenizer(code_str, tokenizer = codebert_tokenizer, remove_comments = False):
    """Tokenize ``code_str`` with a pluggable tokenizer.

    ``tokenizer`` may be either an object exposing a ``tokenize(text)`` method
    or a plain callable taking the text.  The default, ``codebert_tokenizer``,
    is a plain function without a ``tokenize`` attribute, so the previous
    unconditional ``tokenizer.tokenize(...)`` call raised AttributeError
    whenever the default was used.

    Args:
        code_str: Source text to tokenize.
        tokenizer: Tokenizer object or callable (see above).
        remove_comments: When true, every match of the ``rr`` pattern is
            deleted from ``code_str`` before tokenizing.

    Returns:
        Whatever the tokenizer returns for the (possibly stripped) text.
    """
    if remove_comments:
        # NOTE(review): `rr` is not defined in the visible source -- confirm
        # the intended comment-matching regex.
        code_str = re.sub(rr, "", code_str)

    tokenize_method = getattr(tokenizer, "tokenize", None)
    if callable(tokenize_method):
        return tokenize_method(code_str)
    # Plain callables (such as the default) are invoked directly.
    return tokenizer(code_str)







































































def tokenize_for_self_bleu(code_str, ftokenizer: Callable[[str], List[str]]) -> str:
    """Tokenize ``code_str`` and join the cleaned tokens into one string.

    Each token returned by ``ftokenizer`` has every match of the module-level
    ``newline_pattern`` removed, then the tokens are concatenated.  The return
    annotation used to claim ``List[str]`` although a single string has always
    been returned (it is what ``self_bleu_metric`` hands to the BLEU scorer).

    Args:
        code_str: Source text to tokenize.
        ftokenizer: Callable mapping the text to a list of token strings.

    Returns:
        The cleaned tokens joined into a single string.
    """
    # NOTE(review): the join separator is an empty string, so token boundaries
    # are not preserved in the result -- confirm this is intended.
    return "".join(newline_pattern.sub("", token) for token in ftokenizer(code_str))







def self_bleu_metric(src: str, tgt: str, ftokenizer: Callable[[str], List[str]]) -> float:
    """Return 100 minus the sentence-level BLEU score of ``src`` against ``tgt``.

    Both texts are prepared with ``tokenize_for_self_bleu``; ``src`` is scored
    as the hypothesis with ``tgt`` as its only reference.

    Args:
        src: Hypothesis text.
        tgt: Reference text.
        ftokenizer: Tokenizer forwarded to ``tokenize_for_self_bleu``.

    Returns:
        ``100 - score`` where ``score`` is the ``.score`` of the BLEU result.
    """
    hypothesis, reference = (
        tokenize_for_self_bleu(text, ftokenizer) for text in (src, tgt)
    )
    bleu_score = bleu.sentence_score(hypothesis, [reference]).score
    return 100 - bleu_score





def iterative_corpus_self_bleu(sentences: List[str], ftokenizer: Callable[[str], List[str]], normalize: bool = True) -> float:
    """Accumulate ``self_bleu_metric`` over every unordered pair of sentences.

    Args:
        sentences: Texts to compare pairwise.
        ftokenizer: Tokenizer forwarded to ``self_bleu_metric``.
        normalize: When true, return the mean over all pairs; otherwise
            return the raw sum.

    Returns:
        The mean (or sum) of the pairwise scores.  With fewer than two
        sentences there are no pairs: the mean is undefined and ``np.nan`` is
        returned (previously this raised ZeroDivisionError), while the
        un-normalized sum is 0.
    """
    from itertools import combinations

    n = len(sentences)
    pair_count = n * (n - 1) // 2
    if normalize and pair_count == 0:
        return np.nan

    # combinations() yields (i, j) pairs with i < j in the same order as the
    # original nested index loops, so the summation order is unchanged.
    total_self_bleu = sum(
        self_bleu_metric(first, second, ftokenizer)
        for first, second in combinations(sentences, 2)
    )
    return total_self_bleu / pair_count if normalize else total_self_bleu





@contextlib.contextmanager

def tqdm_joblib(tqdm_object):

    """Context manager that routes joblib batch completions into a tqdm bar.

    While the context is active, ``joblib.parallel.BatchCompletionCallBack`` is
    replaced by a subclass whose ``__call__`` advances ``tqdm_object`` by the
    callback's ``batch_size`` before delegating to the original implementation.

    Args:
        tqdm_object: Progress bar to update; it is closed when the context
            exits.

    Yields:
        The same ``tqdm_object`` that was passed in.
    """

    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):

        def __call__(self, *args, **kwargs):

            # Advance the bar by this callback's batch size, then let the
            # original callback do its work.
            tqdm_object.update(n=self.batch_size)

            return super().__call__(*args, **kwargs)



    # Keep a reference to the original class so it can be restored on exit.
    old_batch_callback = joblib.parallel.BatchCompletionCallBack

    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback

    try:

        yield tqdm_object

    finally:

        # Undo the monkeypatch and close the bar even if the body raised.
        joblib.parallel.BatchCompletionCallBack = old_batch_callback

        tqdm_object.close()

        

        

def parallel_corpus_self_bleu(sentences: List[str], ftokenizer: Callable[[str], List[str]], n_jobs: int = -1, normalize: bool = True) -> float:
    """Compute pairwise ``self_bleu_metric`` scores in parallel with joblib.

    Args:
        sentences: Texts to compare pairwise.
        ftokenizer: Tokenizer forwarded to ``self_bleu_metric``.
        n_jobs: Passed to ``joblib.Parallel``.
        normalize: When true, return the mean over all pairs; otherwise
            return the raw sum.

    Returns:
        The mean (or sum) of the pairwise scores.  With fewer than two
        sentences there are no pairs: ``np.nan`` is returned for the mean
        (previously a ZeroDivisionError) and 0 for the sum, without starting
        a worker pool or a progress bar.
    """
    n = len(sentences)
    # Integer pair count: tqdm gets an int total instead of a float.
    total_pairs = n * (n - 1) // 2
    if total_pairs == 0:
        return np.nan if normalize else 0

    jobs = (
        delayed(self_bleu_metric)(sentences[i], sentences[j], ftokenizer)
        for i in range(n)
        for j in range(i + 1, n)
    )
    with tqdm_joblib(tqdm(desc="", total=total_pairs)):
        total_self_bleu = sum(Parallel(n_jobs=n_jobs)(jobs))
    return (total_self_bleu / total_pairs) if normalize else total_self_bleu

    





def distinct_n(corpus: List[str], n: int, ftokenizer: Callable[[str], List[str]], remove_comments: bool = False) -> float:
    """Ratio of unique n-grams to total n-grams across ``corpus``.

    Each sequence is tokenized with ``ftokenizer(seq, remove_comments)`` and
    turned into n-grams; the n-grams of all sequences are pooled.

    Args:
        corpus: Texts to analyse.
        n: N-gram order.
        ftokenizer: Tokenizer; it is called with the text and the
            ``remove_comments`` flag.
        remove_comments: Flag forwarded to ``ftokenizer``.

    Returns:
        ``unique / total``, or ``np.nan`` when the corpus yields at most one
        n-gram in total.
    """
    per_sequence = []
    for text in corpus:
        per_sequence.append(list(ngrams(ftokenizer(text, remove_comments), n)))

    total = sum(len(grams) for grams in per_sequence)
    if total <= 1:
        return np.nan

    unique = set().union(*per_sequence)
    return len(unique) / total































def ead_normalized_unique_ngrams(corpus: List[str], n: int, ftokenizer: Callable[[str], List[str]], remove_comments: bool = False,
                                 vocab_size: int = 128256):
    """Unique n-gram count normalised by its expectation under a vocabulary.

    Computes ``N / (V * (1 - ((V - 1) / V) ** C))`` where ``N`` is the number
    of distinct n-grams, ``C`` the total number of n-grams and ``V`` the
    vocabulary size (presumably the "expectation-adjusted distinct" score --
    confirm against the intended reference).

    Args:
        corpus: Texts to analyse.
        n: N-gram order.
        ftokenizer: Tokenizer; it is called with the text and the
            ``remove_comments`` flag.
        remove_comments: Flag forwarded to ``ftokenizer``.
        vocab_size: Vocabulary size ``V`` used in the normaliser.

    Returns:
        The normalised score, or ``np.nan`` when the computation divides by
        zero (for example when there are no n-grams).
    """
    observed = []
    for text in corpus:
        observed.extend(ngrams(ftokenizer(text, remove_comments), n))

    unique_count = len(set(observed))
    total_count = len(observed)

    try:
        keep_ratio = (vocab_size - 1) / vocab_size
        expected_unique = vocab_size * (1 - keep_ratio ** total_count)
        return unique_count / expected_unique
    except ZeroDivisionError:
        return np.nan











def could_yield_nan(sequence: str, ftokenizer: Callable[[str, bool], List[str]], n: int, remove_comments: bool) -> bool:
    """Tell whether ``sequence`` is too short to contain a single n-gram.

    Args:
        sequence: Text to check.
        ftokenizer: Tokenizer called as ``ftokenizer(sequence, remove_comments)``.
        n: N-gram order.
        remove_comments: Flag forwarded to ``ftokenizer``.

    Returns:
        True when the tokenizer yields fewer than ``n`` tokens.
    """
    token_count = len(ftokenizer(sequence, remove_comments))
    return token_count < n





def bootstrap_distinct_n(corpus: List[str], n: int, ftokenizer: Callable[[str, bool], List[str]], remove_comments: bool, iterations: int = 100, subsample_size: int = 100) -> float:
    """Bootstrap estimate of ``distinct_n`` over resampled sub-corpora.

    Blank sequences and sequences with fewer than ``n`` tokens are discarded.
    Then ``iterations`` samples of ``subsample_size`` sequences are drawn with
    replacement and scored with ``distinct_n``.

    Note: the global ``random`` generator is re-seeded with 42 on every call.

    Args:
        corpus: Texts to analyse.
        n: N-gram order.
        ftokenizer: Tokenizer called as ``ftokenizer(seq, remove_comments)``.
        remove_comments: Flag forwarded to the tokenizer.
        iterations: Number of bootstrap samples.
        subsample_size: Number of sequences per sample.

    Returns:
        The NaN-ignoring mean of the sample scores, or ``np.nan`` when fewer
        than ``subsample_size`` usable sequences remain.
    """
    import random

    random.seed(42)

    usable = [
        seq
        for seq in corpus
        if seq and seq.strip() and not could_yield_nan(seq, ftokenizer, n, remove_comments)
    ]
    if len(usable) < subsample_size:
        return np.nan

    stats = [
        distinct_n(random.choices(usable, k=subsample_size), n, ftokenizer, remove_comments)
        for _ in range(iterations)
    ]
    if any(np.isnan(stats)):
        print(f"Warning: some of the bootstrapped samples resulted in NaN values; the number of such samples is {np.sum(np.isnan(stats))}, ignoring them.")
    return np.nanmean(stats)





def bootstrap_ead_normalized_unique_ngrams(corpus: List[str], n: int, ftokenizer: Callable[[str, bool], List[str]], remove_comments: bool,
                                           iterations: int = 100, vocab_size: int = 128256, subsample_size: int = 100) -> float:
    """Bootstrap estimate of ``ead_normalized_unique_ngrams``.

    Blank sequences and sequences with fewer than ``n`` tokens are discarded.
    Then ``iterations`` samples of ``subsample_size`` sequences are drawn with
    replacement and scored with ``ead_normalized_unique_ngrams``.

    Note: the global ``random`` generator is re-seeded with 42 on every call.

    Args:
        corpus: Texts to analyse.
        n: N-gram order.
        ftokenizer: Tokenizer called as ``ftokenizer(seq, remove_comments)``.
        remove_comments: Flag forwarded to the tokenizer.
        iterations: Number of bootstrap samples.
        vocab_size: Vocabulary size forwarded to the scorer.
        subsample_size: Number of sequences per sample.

    Returns:
        The NaN-ignoring mean of the sample scores, or ``np.nan`` when fewer
        than ``subsample_size`` usable sequences remain.
    """
    import random

    random.seed(42)

    usable = [
        seq
        for seq in corpus
        if seq and seq.strip() and not could_yield_nan(seq, ftokenizer, n, remove_comments)
    ]
    if len(usable) < subsample_size:
        return np.nan

    stats = [
        ead_normalized_unique_ngrams(
            random.choices(usable, k=subsample_size), n, ftokenizer, remove_comments, vocab_size
        )
        for _ in range(iterations)
    ]
    if any(np.isnan(stats)):
        print(f"Warning: some of the bootstrapped samples resulted in NaN values; the number of such samples is {np.sum(np.isnan(stats))}, ignoring them.")
    return np.nanmean(stats)





def jaccard_n_grams(corpus: List[str], n: int, ftokenizer: Callable[[str], List[str]], remove_comments: bool = False, iterations: int = -1) -> float:
    """Mean Jaccard distance between the n-gram sets of sequence pairs.

    Blank sequences and sequences with fewer than ``n`` tokens are discarded
    first.  With ``iterations == -1`` every unordered pair of the remaining
    sequences is compared; otherwise ``iterations`` pairs are drawn at random
    with replacement (so a sequence may be paired with itself).

    Note: the global ``random`` generator is re-seeded with 42 on every call.

    Args:
        corpus: Texts to analyse.
        n: N-gram order.
        ftokenizer: Tokenizer called as ``ftokenizer(seq, remove_comments)``.
        remove_comments: Flag forwarded to the tokenizer.
        iterations: ``-1`` for the exhaustive comparison, otherwise the number
            of random pairs to sample.

    Returns:
        The NaN-ignoring mean of ``1 - |A & B| / |A | B|`` over the compared
        pairs, or ``np.nan`` when fewer than two usable sequences remain.
    """
    import random
    from itertools import combinations

    random.seed(42)

    usable = [
        seq
        for seq in corpus
        if seq and seq.strip() and not could_yield_nan(seq, ftokenizer, n, remove_comments)
    ]
    if len(usable) < 2:
        return np.nan

    # Build each sequence's n-gram set once instead of once per pair.
    ngram_sets = [set(ngrams(ftokenizer(seq, remove_comments), n)) for seq in usable]

    def _distance(left, right):
        return 1 - len(left & right) / len(left | right)

    if iterations == -1:
        jaccard_distances = [
            _distance(left, right) for left, right in combinations(ngram_sets, 2)
        ]
    else:
        # The sampled entries are already n-gram sets.  The previous code
        # applied ngrams() to them a second time, measuring n-grams of
        # n-grams and dividing by zero when a sequence had fewer than n
        # n-grams.
        jaccard_distances = [
            _distance(*random.choices(ngram_sets, k=2)) for _ in range(iterations)
        ]

    return np.nanmean(jaccard_distances)

    

