from typing import Iterable

import numpy as np

from collections import Counter
from scipy import stats

import operator


def zipfs_coefficient(tokenized_texts: Iterable[Iterable[int]], n: int = 5000) -> float:
    """Estimate the Zipf exponent of a tokenized corpus.

    Token frequencies are counted over all texts, the ``n`` most frequent
    tokens are ranked from 1 (most frequent) downwards, and a least-squares
    line is fitted to ``log(frequency)`` against ``log(rank)``. The negated
    slope of that line is returned, so a corpus whose frequencies fall off
    as ``1 / rank`` yields a value of 1.

    Args:
        tokenized_texts: texts to be evaluated, each an iterable of token ids.
            A one-shot iterator is fine; it is consumed exactly once.
        n: number of most frequent tokens to include in the fit. The tail of
            the distribution has high variance, so it is cut off here.

    Returns:
        The negated regression slope, or NaN when fewer than two ranks are
        available (a single distinct token, or ``n == 1``), since a line
        cannot be fitted through one point.

    Raises:
        ValueError: if ``n`` is smaller than 1 or the texts contain no tokens.
    """
    # Validate before touching the input so a bad ``n`` does not consume
    # a one-shot iterator for nothing.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    token_counts = Counter()
    for text in tokenized_texts:
        token_counts.update(text)
    if not token_counts:
        raise ValueError("tokenized_texts contains no tokens")

    # most_common(n) yields (token, count) pairs in descending count order
    # and only has to select the top n instead of sorting every count.
    frequencies = np.array([count for _, count in token_counts.most_common(n)])
    if frequencies.size < 2:
        # The slope through a single point is undefined.
        return float("nan")

    ranks = np.arange(1, frequencies.size + 1)
    slope = stats.linregress(np.log(ranks), np.log(frequencies)).slope
    # Frequencies decrease with rank, so the fitted slope is negative;
    # the Zipf exponent is reported as a positive number.
    return -slope
