# Import helper functions for input/output
from lmtune_helpers import input_data, output_results

# Import third-party and standard libraries for data analysis
import networkx as nx
import numpy as np
import math


def compute_entropy(values, n_bins=10):
    """Shannon entropy, in bits, of `values` after histogramming into `n_bins` bins.

    Returns 0.0 for empty input or when the histogram holds no counts.
    """
    if len(values) == 0:
        return 0.0

    counts = np.histogram(values, bins=n_bins, density=False)[0]
    n_obs = counts.sum()
    if n_obs == 0:
        return 0.0

    # Empty bins contribute nothing to the entropy; dropping them avoids log2(0).
    occupied = counts[counts > 0]
    p = occupied / n_obs
    return float(-np.sum(p * np.log2(p)))


def sample_skewness(values):
    """Return the bias-adjusted Fisher-Pearson sample skewness G1 of `values`.

    G1 = sqrt(n * (n - 1)) / (n - 2) * m3 / m2 ** 1.5, where m2 and m3 are the
    biased (divide-by-n) second and third central moments.

    Returns 0.0 when there are fewer than three observations or when the data
    are (numerically) constant, since the statistic is undefined there.
    """
    data = np.asarray(values, dtype=float)
    n = data.size
    if n < 3:
        return 0.0

    mean = data.mean()
    centered = data - mean
    # The sqrt(n(n-1))/(n-2) correction is defined on the *biased* variance;
    # pairing it with a ddof=1 standard deviation gives a wrong estimate.
    m2 = np.mean(centered ** 2)
    # Treat variance that is pure floating-point noise relative to the mean as
    # zero, so constant inputs such as [0.1, 0.1, 0.1] do not yield garbage.
    if m2 <= (np.finfo(float).resolution * abs(mean)) ** 2:
        return 0.0
    m3 = np.mean(centered ** 3)
    return float(math.sqrt(n * (n - 1)) / (n - 2) * m3 / m2 ** 1.5)


def sample_kurtosis(values):
    """Return the bias-adjusted Fisher excess kurtosis G2 of `values`.

    G2 = (n - 1) / ((n - 2) * (n - 3)) * ((n + 1) * g2 + 6), where
    g2 = m4 / m2 ** 2 - 3 and m2, m4 are the biased (divide-by-n) second and
    fourth central moments.

    Returns 0.0 when there are fewer than four observations or when the data
    are (numerically) constant, since the statistic is undefined there.
    """
    data = np.asarray(values, dtype=float)
    n = data.size
    if n < 4:
        return 0.0

    mean = data.mean()
    centered = data - mean
    m2 = np.mean(centered ** 2)
    # Treat variance that is pure floating-point noise relative to the mean as
    # zero, so constant inputs do not produce a meaningless ratio.
    if m2 <= (np.finfo(float).resolution * abs(mean)) ** 2:
        return 0.0
    m4 = np.mean(centered ** 4)
    g2 = m4 / m2 ** 2 - 3.0  # biased excess kurtosis
    return float((n - 1) / ((n - 2) * (n - 3)) * ((n + 1) * g2 + 6.0))


def build_graph(dist_matrix):
    """Return an undirected weighted NetworkX graph from the dense distance matrix.

    Nodes are the integers 0..size-1, where size is `dist_matrix.shape[0]`.
    Every pair i < j is joined by an edge whose ``weight`` is
    ``dist_matrix[i, j]``; only the upper triangle is read, so for an
    asymmetric matrix the value at ``[j, i]`` is ignored.
    """
    size = dist_matrix.shape[0]
    G = nx.Graph()
    # Register nodes explicitly: with a 1x1 matrix there are no edges, and
    # relying on add_edge alone would return a graph without node 0.
    G.add_nodes_from(range(size))
    G.add_weighted_edges_from(
        (i, j, float(dist_matrix[i, j]))
        for i in range(size)
        for j in range(i + 1, size)
    )
    return G


def _pearson_or_zero(x, y):
    """Return the Pearson correlation of `x` and `y`, or 0.0 when it is undefined.

    The correlation is undefined with fewer than two observations or when either
    input has zero variance; `np.corrcoef` would return NaN in the latter case.
    """
    if x.size < 2 or np.std(x) == 0 or np.std(y) == 0:
        return 0.0
    return float(np.corrcoef(x, y)[0, 1])


def analyze_instance(instance):
    """Compute 50 descriptive features of the VRP instance.

    Args:
        instance: mapping read with ``.get`` for the keys ``"N"`` (number of
            customers), ``"Capacity"``, ``"Demand"`` and ``"Distance"`` (a flat
            array of length ``(N + 1) ** 2``; row/column 0 is treated as the depot).

    Returns:
        dict with 51 entries: a ``"README"`` text plus 50 numeric features.

    Raises:
        ValueError: if ``N`` is negative, if ``Distance`` does not hold
            ``(N + 1) ** 2`` values, or if ``Demand`` has two or more entries but
            not exactly ``N`` of them.
    """
    N = int(instance.get("N", 0))
    capacity = float(instance.get("Capacity", 0))
    demand = np.asarray(instance.get("Demand", []), dtype=float)
    flat_distance = np.asarray(instance.get("Distance", []), dtype=float)

    if N < 0:
        raise ValueError("N must be non-negative.")

    num_nodes = N + 1  # depot + customers
    expected = num_nodes ** 2
    if flat_distance.size != expected:
        raise ValueError("Distance array length does not match (N+1)^2.")
    # A demand vector of length 0 or 1 is tolerated (the correlation is simply
    # reported as 0.0); longer vectors must line up with the N depot distances,
    # otherwise the correlation below cannot be computed.
    if demand.size > 1 and demand.size != N:
        raise ValueError("Demand array length does not match N.")
    dist_matrix = flat_distance.reshape((num_nodes, num_nodes))

    # Graph representation
    G = build_graph(dist_matrix)
    # Make sure every node exists even when the matrix yields no edges
    # (depot-only instance); the graph metrics below need at least node 0.
    G.add_nodes_from(range(num_nodes))

    # Pairwise distances (upper triangle) for stats
    tri_u = np.triu_indices(num_nodes, k=1)
    pw_dists = dist_matrix[tri_u]

    # Depot distances (row 0, excluding depot)
    depot_dists = dist_matrix[0, 1:]

    # Basic demand stats
    total_demand = float(demand.sum())
    avg_demand = float(demand.mean()) if demand.size else 0.0
    std_demand = float(demand.std(ddof=1)) if demand.size > 1 else 0.0
    min_demand = float(demand.min()) if demand.size else 0.0
    max_demand = float(demand.max()) if demand.size else 0.0

    # Demand distribution descriptors (never use more bins than observations)
    demand_entropy = compute_entropy(demand, n_bins=min(10, max(len(demand), 1)))
    demand_skewness = sample_skewness(demand)
    demand_kurtosis = sample_kurtosis(demand)

    # Distance stats
    avg_pw = float(pw_dists.mean()) if pw_dists.size else 0.0
    std_pw = float(pw_dists.std(ddof=1)) if pw_dists.size > 1 else 0.0
    min_pw = float(pw_dists.min()) if pw_dists.size else 0.0
    max_pw = float(pw_dists.max()) if pw_dists.size else 0.0
    distance_entropy = compute_entropy(pw_dists, n_bins=10)
    pw_skew = sample_skewness(pw_dists)

    # Depot distance stats
    avg_depot = float(depot_dists.mean()) if depot_dists.size else 0.0
    std_depot = float(depot_dists.std(ddof=1)) if depot_dists.size > 1 else 0.0
    min_depot = float(depot_dists.min()) if depot_dists.size else 0.0
    max_depot = float(depot_dists.max()) if depot_dists.size else 0.0

    # Graph metrics
    avg_deg = float(np.mean([deg for _, deg in G.degree()]))
    dens = nx.density(G)
    clust = nx.average_clustering(G)

    # Shortest path stats (each unordered pair counted once via tgt > source)
    sp_lengths = []
    for source, lengths in nx.all_pairs_dijkstra_path_length(G, weight='weight'):
        sp_lengths.extend([l for tgt, l in lengths.items() if tgt > source])
    if sp_lengths:
        graph_diameter = float(max(sp_lengths))
        graph_avg_sp = float(np.mean(sp_lengths))
    else:
        graph_diameter = graph_avg_sp = 0.0

    # MST metrics
    mst = nx.minimum_spanning_tree(G, weight='weight')
    mst_edges = [d['weight'] for _, _, d in mst.edges(data=True)]
    mst_total_len = float(sum(mst_edges))
    mst_std_edge = float(np.std(mst_edges, ddof=1)) if len(mst_edges) > 1 else 0.0

    # Depot centralities
    deg_cent = nx.degree_centrality(G).get(0, 0.0)
    clos_cent = nx.closeness_centrality(G, distance='weight').get(0, 0.0)
    betw_cent = nx.betweenness_centrality(G, weight='weight').get(0, 0.0)

    # Ratios & route estimates (all fall back to 0 when capacity is 0)
    demand_cap_ratio = total_demand / capacity if capacity else 0.0
    avg_dem_cap_ratio = avg_demand / capacity if capacity else 0.0
    max_dem_cap_ratio = max_demand / capacity if capacity else 0.0
    # Guard on demand.size as well: the mean of an empty mask is NaN.
    pct_high_demand = (
        float(np.mean(demand > 0.8 * capacity)) if capacity and demand.size else 0.0
    )
    est_min_routes = int(math.ceil(total_demand / capacity)) if capacity else 0
    cap_util = total_demand / (capacity * est_min_routes) if capacity and est_min_routes else 0.0
    est_avg_cust_per_route = N / est_min_routes if est_min_routes else 0.0

    # Edge stats & symmetry
    std_edge_weight = std_pw
    n_edges = G.number_of_edges()
    sym_pairs = np.sum(np.isclose(dist_matrix, dist_matrix.T)) - num_nodes  # exclude diagonal
    total_pairs = num_nodes * (num_nodes - 1)
    symmetry_ratio = float(sym_pairs / total_pairs) if total_pairs else 1.0

    cv_demand = std_demand / avg_demand if avg_demand else 0.0
    cv_pw = std_pw / avg_pw if avg_pw else 0.0

    # Connectivity
    edge_conn = float(nx.edge_connectivity(G)) if num_nodes > 1 else 0.0
    node_conn = float(nx.node_connectivity(G)) if num_nodes > 2 else 0.0

    # Correlation between demand and distance to depot; reported as 0.0 rather
    # than NaN when either series is constant.
    dist_dem_corr = _pearson_or_zero(demand, depot_dists)

    pw_var = float(np.var(pw_dists, ddof=1)) if pw_dists.size > 1 else 0.0

    # Prepare results (README placeholder keeps README as the first key)
    res = {
        "README": "placeholder",
        "n_customers": N,
        "vehicle_capacity": capacity,
        "total_demand": total_demand,
        "avg_demand": avg_demand,
        "std_demand": std_demand,
        "min_demand": min_demand,
        "max_demand": max_demand,
        "demand_entropy": demand_entropy,
        "demand_skewness": demand_skewness,
        "demand_kurtosis": demand_kurtosis,
        "avg_distance_to_depot": avg_depot,
        "std_distance_to_depot": std_depot,
        "min_distance_to_depot": min_depot,
        "max_distance_to_depot": max_depot,
        "avg_pairwise_distance": avg_pw,
        "std_pairwise_distance": std_pw,
        "min_pairwise_distance": min_pw,
        "max_pairwise_distance": max_pw,
        "distance_entropy": distance_entropy,
        "pairwise_distance_skewness": pw_skew,
        "graph_avg_degree": avg_deg,
        "graph_density": dens,
        "graph_clustering_coefficient": clust,
        "graph_diameter": graph_diameter,
        "graph_avg_shortest_path": graph_avg_sp,
        "mst_total_length": mst_total_len,
        "mst_std_edge_length": mst_std_edge,
        "depot_degree_centrality": deg_cent,
        "depot_closeness_centrality": clos_cent,
        "depot_betweenness_centrality": betw_cent,
        "demand_to_capacity_ratio": demand_cap_ratio,
        "avg_demand_to_capacity_ratio": avg_dem_cap_ratio,
        "max_demand_to_capacity_ratio": max_dem_cap_ratio,
        "pct_customers_high_demand": pct_high_demand,
        "estimated_min_n_routes": est_min_routes,
        "capacity_utilization_estimate": cap_util,
        "est_avg_customers_per_route": est_avg_cust_per_route,
        "std_edge_weight": std_edge_weight,
        "n_edges": n_edges,
        "symmetry_ratio": symmetry_ratio,
        "cv_demand": cv_demand,
        "cv_pairwise_distance": cv_pw,
        "max_edge_weight": max_pw,
        "min_edge_weight": min_pw,
        "num_nodes": num_nodes,
        "distance_matrix_rank_estimate": float(np.linalg.matrix_rank(dist_matrix)),
        "graph_edge_connectivity": edge_conn,
        "graph_vertex_connectivity": node_conn,
        "distance_correlation_depot_demand": dist_dem_corr,
        "pairwise_distance_variance": pw_var,
    }

    # Build README (~200 words)
    readme = (
        "The extractor first reshapes the flattened distance array into a square (N+1)×(N+1) matrix and models it as an "
        "undirected weighted graph using NetworkX. Each vertex represents the depot or a customer, while edge weights "
        "correspond to inter-node travel costs. Global graph metrics (average degree, density, clustering coefficient, "
        "diameter, average shortest-path length) capture connectivity richness which affects propagation of the MTZ "
        "sub-tour constraints. Minimum-spanning-tree length gives a natural lower bound on tour cost, and its edge-length "
        "dispersion highlights spatial irregularity that can mislead uniform value-ordering heuristics. Depot degree, "
        "closeness and betweenness centralities quantify how pivotal the depot is—important because many search strategies "
        "branch on arcs adjacent to node 0. Statistical moments, entropy and coefficients of variation of customer demands "
        "and pairwise distances summarise resource pressure and spatial spread. Ratios that relate demand to vehicle "
        "capacity together with an estimated minimum route count provide a fast feasibility indicator and determine how "
        "tight capacity and in-degree constraints will be. Symmetry ratio checks metric consistency, while graph edge and "
        "vertex connectivity reflect robustness of the network representation. Altogether the 50 numerical features form a "
        "small yet expressive fingerprint that downstream AutoML modules can exploit to tune solver parameters such as "
        "search strategy, restart policy and propagation strength to the structure of the specific VRP instance.")

    res["README"] = readme

    # Ensure exactly 51 keys (README + 50 features)
    assert len(res) == 51, f"Expected 51 keys, got {len(res)}"
    return res


def main():
    """Read one instance, analyse it, and emit the result.

    Any exception raised while reading or analysing is reported through the
    output as a README-only result instead of propagating.
    """
    try:
        payload = analyze_instance(input_data())
    except Exception as err:
        payload = {"README": f"Error during analysis: {err}"}
    output_results(payload)


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
