# Import helper functions for input/output
from lmtune_helpers import input_data, output_results

# Import third-party libraries for graph and numerical analysis
import networkx as nx
import numpy as np


def compute_skewness(data: np.ndarray) -> float:
    """Return the population skewness (third standardized moment) of ``data``.

    The standard deviation is computed with ``ddof=0``. Empty input and
    input with zero standard deviation both yield ``0.0``.
    """
    if not data.size:
        return 0.0

    sigma = np.std(data)
    if sigma == 0:
        # Skewness is undefined for constant data; report 0.0 by convention.
        return 0.0

    centered = data - np.mean(data)
    third_moment = np.mean(centered ** 3)
    return float(third_moment / sigma ** 3)


def compute_kurtosis(data: np.ndarray) -> float:
    """Return the population kurtosis (fourth standardized moment) of ``data``.

    This is the raw kurtosis: no ``- 3`` excess correction is subtracted.
    The standard deviation is computed with ``ddof=0``. Empty input and
    input with zero standard deviation both yield ``0.0``.
    """
    if not data.size:
        return 0.0

    sigma = np.std(data)
    if sigma == 0:
        # Kurtosis is undefined for constant data; report 0.0 by convention.
        return 0.0

    centered = data - np.mean(data)
    fourth_moment = np.mean(centered ** 4)
    return float(fourth_moment / sigma ** 4)


def shannon_entropy(data: np.ndarray, bins: int = None) -> float:
    """Estimate the Shannon entropy (in bits) of ``data`` from a histogram.

    Args:
        data: Numeric samples to bin.
        bins: Number of histogram bins. When omitted, ``ceil(sqrt(data.size))``
            is used.

    Returns:
        The entropy of the empirical bin distribution, or ``0.0`` for empty
        input.
    """
    n_samples = data.size
    if not n_samples:
        return 0.0

    n_bins = bins if bins is not None else int(np.ceil(np.sqrt(n_samples)))
    histogram = np.histogram(data, bins=n_bins)[0]

    # Empty bins are dropped so that log2 is never evaluated at zero.
    occupied = histogram[histogram > 0]
    probabilities = occupied / histogram.sum()
    return float(-np.sum(probabilities * np.log2(probabilities)))


def heuristic_route_estimate(demands: np.ndarray, capacity: int) -> int:
    """Estimate the number of routes with first-fit decreasing bin packing.

    Demands are processed from largest to smallest; each one goes into the
    first already-open route with enough spare capacity, otherwise a new
    route is opened for it.

    Args:
        demands: Customer demands.
        capacity: Vehicle capacity. When it is not positive, every customer
            is counted as its own route.

    Returns:
        The number of routes opened by the heuristic.
    """
    if capacity <= 0:
        return demands.size

    slack = []  # spare capacity left on each open route
    for load in sorted(demands, reverse=True):
        # Index of the first route that can still take this load, if any.
        target = next(
            (idx for idx, free in enumerate(slack) if load <= free),
            None,
        )
        if target is None:
            slack.append(capacity - load)
        else:
            slack[target] -= load
    return len(slack)


def build_graph(dist_matrix: np.ndarray) -> nx.Graph:
    """Build an undirected, weighted complete graph from a distance matrix.

    Nodes are the integers ``0 .. n-1`` where ``n`` is the number of rows.
    Every pair ``i < j`` is joined by an edge whose ``weight`` attribute is
    ``dist_matrix[i, j]``; entries below the diagonal are not read.
    """
    size = dist_matrix.shape[0]
    graph = nx.Graph()
    graph.add_nodes_from(range(size))
    # Only the upper triangle is visited so each undirected edge is added once.
    graph.add_weighted_edges_from(
        (src, dst, float(dist_matrix[src, dst]))
        for src in range(size)
        for dst in range(src + 1, size)
    )
    return graph


def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator`` as a float, or 0.0 for a zero denominator."""
    if denominator == 0:
        return 0.0
    return float(numerator / denominator)


def _safe_correlation(x: np.ndarray, y: np.ndarray) -> float:
    """Return the Pearson correlation of ``x`` and ``y``, or 0.0 when undefined.

    The coefficient is undefined (``np.corrcoef`` yields NaN) when either
    input has zero variance or fewer than two samples; 0.0 is returned in
    those cases so that no NaN reaches the output.

    Raises:
        ValueError: If the two arrays differ in length.
    """
    if x.size != y.size:
        raise ValueError(
            f"Correlation inputs differ in length ({x.size} vs {y.size})."
        )
    if x.size < 2 or x.std() == 0 or y.std() == 0:
        return 0.0
    return float(np.corrcoef(x, y)[0, 1])


def main():
    """Extract 50 descriptive characteristics from a VRP instance.

    The instance is obtained from ``input_data()`` and read through ``.get``
    with the keys ``"N"`` (number of customers), ``"Capacity"``, ``"Demand"``
    and ``"Distance"`` (a flattened ``(N+1) x (N+1)`` matrix whose row/column
    0 is treated as the depot). The computed features, together with a README
    string, are passed to ``output_results()`` as a dictionary keyed
    ``characteristic_1`` .. ``characteristic_50``.

    Ratios whose denominator is zero and correlations that are undefined are
    reported as 0.0 rather than NaN.

    Raises:
        ValueError: If the distance array does not contain ``(N+1)^2`` values,
            or if the demand and depot-distance arrays are both non-empty but
            differ in length.
    """
    instance = input_data()

    # Basic fields
    N_customers = int(instance.get("N", 0))
    capacity = int(instance.get("Capacity", 0))
    demand_list = np.array(instance.get("Demand", []), dtype=float)
    flat_distance = np.array(instance.get("Distance", []), dtype=float)

    n_nodes_total = N_customers + 1  # including depot

    # Reconstruct distance matrix
    expected_len = n_nodes_total ** 2
    if flat_distance.size != expected_len:
        raise ValueError("Distance array length does not match (N+1)^2.")
    dist_matrix = flat_distance.reshape((n_nodes_total, n_nodes_total))
    # 1-D view of every matrix entry, shared by the moment computations below.
    all_distances = dist_matrix.ravel()

    # Demand statistics
    has_demand = bool(demand_list.size)
    total_demand = demand_list.sum()
    avg_demand = demand_list.mean() if has_demand else 0.0
    std_demand = demand_list.std(ddof=0) if has_demand else 0.0
    min_demand = demand_list.min() if has_demand else 0.0
    max_demand = demand_list.max() if has_demand else 0.0
    demand_skewness = compute_skewness(demand_list)
    demand_kurtosis = compute_kurtosis(demand_list)

    # Estimated minimum vehicles (capacity constraint only)
    estimated_min_vehicles = int(np.ceil(total_demand / capacity)) if capacity > 0 else N_customers
    # With zero total demand the vehicle estimate is 0; _safe_ratio then
    # reports 0.0 instead of dividing by zero.
    capacity_utilization_ratio = (
        _safe_ratio(total_demand, estimated_min_vehicles * capacity) if capacity > 0 else 0.0
    )

    # Distance statistics (all matrix entries, diagonal included)
    has_distance = bool(dist_matrix.size)
    avg_distance = float(dist_matrix.mean()) if has_distance else 0.0
    std_distance = float(dist_matrix.std(ddof=0)) if has_distance else 0.0
    min_distance = float(dist_matrix.min()) if has_distance else 0.0
    max_distance = float(dist_matrix.max()) if has_distance else 0.0
    median_distance = float(np.median(dist_matrix)) if has_distance else 0.0
    percentile_25_distance = float(np.percentile(dist_matrix, 25)) if has_distance else 0.0
    percentile_75_distance = float(np.percentile(dist_matrix, 75)) if has_distance else 0.0
    distance_variance = float(std_distance ** 2)
    distance_skewness = compute_skewness(all_distances)
    distance_kurtosis = compute_kurtosis(all_distances)

    # Distances to depot (node 0)
    depot_distances = dist_matrix[0, 1:]
    has_customers = bool(depot_distances.size)
    avg_distance_to_depot = float(depot_distances.mean()) if has_customers else 0.0
    std_distance_to_depot = float(depot_distances.std(ddof=0)) if has_customers else 0.0
    min_distance_to_depot = float(depot_distances.min()) if has_customers else 0.0
    max_distance_to_depot = float(depot_distances.max()) if has_customers else 0.0

    # Build graph and compute metrics
    G = build_graph(dist_matrix)
    multi_node = G.number_of_nodes() > 1
    graph_density = nx.density(G)
    # Unweighted diameter (should be 1 for complete graph with >1 node)
    graph_diameter = nx.diameter(G) if multi_node else 0
    avg_shortest_path_length = nx.average_shortest_path_length(G, weight="weight") if multi_node else 0.0
    avg_clustering_coefficient = nx.average_clustering(G)

    # Minimum spanning tree metrics (NetworkX default algorithm; none is
    # selected explicitly here)
    mst = nx.minimum_spanning_tree(G, weight="weight")
    mst_edges = np.array([d["weight"] for _, _, d in mst.edges(data=True)], dtype=float)
    mst_total_length = float(mst_edges.sum()) if mst_edges.size else 0.0
    mst_avg_edge_length = float(mst_edges.mean()) if mst_edges.size else 0.0
    mst_std_edge_length = float(mst_edges.std(ddof=0)) if mst_edges.size else 0.0

    # Centrality measures
    degree_centrality = nx.degree_centrality(G)
    closeness_centrality = nx.closeness_centrality(G, distance="weight")
    betweenness_centrality = nx.betweenness_centrality(G, weight="weight")

    depot_degree_centrality = float(degree_centrality.get(0, 0.0))
    depot_closeness_centrality = float(closeness_centrality.get(0, 0.0))

    closeness_values = list(closeness_centrality.values())
    betweenness_values = list(betweenness_centrality.values())
    avg_closeness_centrality = float(np.mean(closeness_values)) if closeness_values else 0.0
    std_closeness_centrality = float(np.std(closeness_values, ddof=0)) if closeness_values else 0.0
    avg_betweenness_centrality = float(np.mean(betweenness_values)) if betweenness_values else 0.0
    std_betweenness_centrality = float(np.std(betweenness_values, ddof=0)) if betweenness_values else 0.0

    # Demand vs distance correlation.
    # NOTE(review): pairs demand_list[k] with depot_distances[k], i.e. assumes
    # the demand list has one entry per customer in node order - confirm.
    if has_customers and has_demand:
        demand_distance_correlation = _safe_correlation(demand_list, depot_distances)
    else:
        demand_distance_correlation = 0.0

    # Heuristic route estimation
    heuristic_n_routes = heuristic_route_estimate(demand_list, capacity)
    avg_customers_per_route = _safe_ratio(N_customers, heuristic_n_routes)
    # An empty demand list gives zero routes; _safe_ratio avoids the 0/0 NaN.
    capacity_utilization_estimate = (
        _safe_ratio(total_demand, heuristic_n_routes * capacity) if capacity > 0 else 0.0
    )

    # Rough distance estimates for routes (customers + depot back)
    estimated_total_route_distance = float(heuristic_n_routes * avg_distance_to_depot * (avg_customers_per_route + 1))
    avg_route_distance_estimate = _safe_ratio(estimated_total_route_distance, heuristic_n_routes)

    # Demand entropy
    demand_entropy = shannon_entropy(demand_list) if has_demand else 0.0

    demand_coefficient_of_variation = _safe_ratio(std_demand, avg_demand)
    distance_coefficient_of_variation = _safe_ratio(std_distance, avg_distance)

    # High demand customers ratio (>50% of capacity)
    high_demand_ratio = float(np.mean(demand_list > 0.5 * capacity)) if capacity > 0 and has_demand else 0.0

    # Demand to distance ratio
    demand_mean_to_distance_ratio = _safe_ratio(avg_demand, avg_distance_to_depot)

    # Build results dictionary with standardized names for parallel script compatibility
    results = {
        "README": """This analysis constructs a weighted complete graph where vertices represent the depot (node 0) and each customer. The flattened distance array is reshaped into a square (N+1)×(N+1) matrix that serves both for graph creation and statistical investigation. Basic problem-size figures such as number of customers, total demand, and vehicle capacity are recorded because they directly determine the search space. Demand distribution moments (mean, variance, skewness, kurtosis, entropy, coefficient of variation) reflect load balancing difficulty and influence the effectiveness of capacity-based propagation. Distance statistics (mean, spread, skewness, entropy, percentiles) approximate route lengths and drive objective-value ranges.
Graph-theoretic descriptors are obtained with NetworkX: density, diameter, average weighted shortest path, clustering coefficient, centrality moments, and minimum-spanning-tree metrics capture connectivity richness, symmetry, and backbone length—all factors that affect dominance filtering and global constraints. Depot-specific centralities highlight its structural importance. A simple first-fit decreasing bin-packing heuristic provides an upper bound on the number of routes, yielding derived measures such as customers-per-route and capacity utilization. Correlating customer demands with their distance from the depot characterises geographical heterogeneity. Together, the 50 numeric features summarise size, topology, variability and estimated difficulty, enabling an automated tuner to choose informed solver parameters.""",
        "characteristic_1": N_customers,                           # n_customers
        "characteristic_2": n_nodes_total,                         # n_nodes_total
        "characteristic_3": capacity,                              # vehicle_capacity
        "characteristic_4": float(total_demand),                   # total_demand
        "characteristic_5": float(avg_demand),                     # avg_demand
        "characteristic_6": float(std_demand),                     # std_demand
        "characteristic_7": float(min_demand),                     # min_demand
        "characteristic_8": float(max_demand),                     # max_demand
        "characteristic_9": float(demand_skewness),               # demand_skewness
        "characteristic_10": float(demand_kurtosis),              # demand_kurtosis
        "characteristic_11": estimated_min_vehicles,              # estimated_min_vehicles
        "characteristic_12": float(capacity_utilization_ratio),   # capacity_utilization_ratio
        "characteristic_13": float(avg_distance),                 # avg_distance
        "characteristic_14": float(std_distance),                 # std_distance
        "characteristic_15": float(min_distance),                 # min_distance
        "characteristic_16": float(max_distance),                 # max_distance
        "characteristic_17": float(median_distance),              # median_distance
        "characteristic_18": float(percentile_25_distance),       # percentile_25_distance
        "characteristic_19": float(percentile_75_distance),       # percentile_75_distance
        "characteristic_20": float(distance_variance),            # distance_variance
        "characteristic_21": float(avg_distance_to_depot),        # avg_distance_to_depot
        "characteristic_22": float(std_distance_to_depot),        # std_distance_to_depot
        "characteristic_23": float(min_distance_to_depot),        # min_distance_to_depot
        "characteristic_24": float(max_distance_to_depot),        # max_distance_to_depot
        "characteristic_25": float(graph_density),                # graph_density
        "characteristic_26": graph_diameter,                      # graph_diameter
        "characteristic_27": float(avg_shortest_path_length),     # avg_shortest_path_length
        "characteristic_28": float(avg_clustering_coefficient),   # avg_clustering_coefficient
        "characteristic_29": float(mst_total_length),             # mst_total_length
        "characteristic_30": float(mst_avg_edge_length),          # mst_avg_edge_length
        "characteristic_31": float(mst_std_edge_length),          # mst_std_edge_length
        "characteristic_32": float(depot_degree_centrality),      # depot_degree_centrality
        "characteristic_33": float(depot_closeness_centrality),   # depot_closeness_centrality
        "characteristic_34": float(demand_mean_to_distance_ratio), # demand_mean_to_distance_ratio
        "characteristic_35": heuristic_n_routes,                  # heuristic_n_routes
        "characteristic_36": float(avg_customers_per_route),      # avg_customers_per_route
        "characteristic_37": float(capacity_utilization_estimate), # capacity_utilization_estimate
        "characteristic_38": float(estimated_total_route_distance), # estimated_total_route_distance
        "characteristic_39": float(avg_route_distance_estimate),  # avg_route_distance_estimate
        "characteristic_40": float(demand_distance_correlation),  # demand_distance_correlation
        "characteristic_41": float(avg_closeness_centrality),     # avg_closeness_centrality
        "characteristic_42": float(std_closeness_centrality),     # std_closeness_centrality
        "characteristic_43": float(avg_betweenness_centrality),   # avg_betweenness_centrality
        "characteristic_44": float(std_betweenness_centrality),   # std_betweenness_centrality
        "characteristic_45": float(distance_skewness),            # distance_skewness
        "characteristic_46": float(distance_kurtosis),            # distance_kurtosis
        "characteristic_47": float(demand_entropy),               # demand_entropy
        "characteristic_48": float(demand_coefficient_of_variation), # demand_coefficient_of_variation
        "characteristic_49": float(distance_coefficient_of_variation), # distance_coefficient_of_variation
        "characteristic_50": float(high_demand_ratio),            # high_demand_customer_ratio
    }

    # Output results via helper function
    output_results(results)


# Run the feature extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
