"""
Functions for calculating thresholds and filtering outliers
"""

import numpy as np

def find_threshold_otsu(scores, return_separability=False, bins=50):
    """
    Find optimal threshold using Otsu's algorithm through histogram analysis

    The scores are binned into a histogram and every bin is tried as the
    last bin of the lower class. The candidate that maximizes the
    between-class variance is selected, and the center of that bin is
    returned as the threshold. When several candidates tie, the first
    (lowest) one wins.

    Args:
        scores (np.ndarray): array of scores to separate
        return_separability (bool): if True, also return the separability
            measure of the selected threshold (default: False)
        bins (int or sequence): histogram bins, passed to np.histogram
            (default: 50)

    Returns:
        float: optimal threshold. 0 is returned when no split exists,
            i.e. when the input is empty or all scores fall into one bin.
        tuple: (threshold, separability) if return_separability is True.
            separability is eta = sigma_B^2 / sigma_T^2 computed on the
            binned data; it is 0 when the total variance is 0.
    """
    # create histogram
    hist, bin_edges = np.histogram(scores, bins=bins)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    # total number of data
    total = hist.sum()

    # fallbacks used when no valid split exists
    best_thresh = 0.0
    best_between_variance = 0.0
    separability = 0.0

    # guard against empty input: everything below divides by total
    if total > 0:
        weighted_centers = hist * bin_centers

        # total variance of the binned data (for separability calculation)
        global_mean = weighted_centers.sum() / total
        total_variance = np.sum(hist * (bin_centers - global_mean) ** 2) / total

        # class sizes when splitting after each bin (bin i belongs to class 1)
        count1 = np.cumsum(hist)
        count2 = total - count1

        # only splits that leave both classes non-empty are candidates
        candidates = np.flatnonzero((count1 > 0) & (count2 > 0))

        if candidates.size:
            cumulative_sum = np.cumsum(weighted_centers)
            n1 = count1[candidates]
            n2 = count2[candidates]

            mean1 = cumulative_sum[candidates] / n1
            mean2 = (cumulative_sum[-1] - cumulative_sum[candidates]) / n2

            # between-class variance with class probabilities (not raw
            # counts), so it is on the same scale as total_variance
            between_variance = (n1 / total) * (n2 / total) * (mean1 - mean2) ** 2

            # argmax returns the first maximum, i.e. the lowest tied bin
            best = np.argmax(between_variance)
            if between_variance[best] > 0:
                best_between_variance = between_variance[best]
                best_thresh = bin_centers[candidates[best]]

        # calculate separability (eta = sigma_B^2 / sigma_T^2)
        if total_variance > 0:
            separability = best_between_variance / total_variance

    if return_separability:
        return best_thresh, separability
    return best_thresh

def filter_outliers(distances, iqr_factor=1.5):
    """
    Build a keep-mask that rejects unusually large distances (IQR rule)

    Only an upper fence is applied; there is no lower fence, so
    arbitrarily small distances are always kept.

    Args:
        distances (np.ndarray): array of distances to filter
        iqr_factor (float): how many IQRs above the third quartile the
            fence is placed (default: 1.5)

    Returns:
        np.ndarray: boolean mask, True where the distance is at or below
            the fence (non-outlier)
    """
    # first and third quartiles in a single call
    lower_quartile, upper_quartile = np.percentile(distances, [25, 75])

    # upper fence: Q3 + factor * (Q3 - Q1)
    fence = upper_quartile + iqr_factor * (upper_quartile - lower_quartile)

    # values exactly on the fence are kept
    return distances <= fence
