#!/usr/bin/env python3
"""
Graph converter for pattern_set_mining problem.
Created using subagent_prompt.md version: v_02

This problem is about finding optimal itemsets in a machine learning context.
The goal is to find K itemsets (patterns) that maximize accuracy on positive examples
while minimizing false positives on negative examples.

Key challenges: Balance between coverage of positive examples and avoiding negative examples,
dealing with large transaction databases, and the combinatorial nature of itemset selection.
"""

import sys
import json
import math
import networkx as nx
from pathlib import Path


def build_graph(mzn_file, json_data):
    """
    Build a graph representation of a pattern set mining problem instance.

    Args:
        mzn_file: Path to the .mzn model file. Accepted for interface
            compatibility; it is not read by this function.
        json_data: Dict of instance parameters. The keys read are 'NrI'
            (number of items), 'NrT_pos' / 'NrT_neg' (number of positive /
            negative transactions) and 'K' (number of patterns to find).
            Missing keys default to 0, 0, 0 and 1 respectively.

    Returns:
        A networkx.Graph whose nodes carry 'type' (0 = item, 1 = constraint)
        and 'weight' attributes and whose edges carry a 'weight' attribute.

    Structure:
    - item_<i> (type 0): one node per item, weighted by a position-based
      centrality heuristic (items near the middle index weigh the most).
    - pattern_<k> (type 1): one node per pattern slot, linked to every item.
    - pos_coverage, neg_avoidance, accuracy_balance (type 1): global
      constraint nodes linked to every pattern slot. When the class
      imbalance exceeds 0.3 they are also linked to each other, so the
      graph is not strictly bipartite.
    - size_complexity (type 1): only present when NrI > 50, linked to the
      highest-weighted items.

    Note: Full transaction data (TDB_pos, TDB_neg) is not available in the
    JSON, so all weights are derived from the scalar parameters only.
    """
    NrI = json_data.get('NrI', 0)  # Number of items
    NrT_pos = json_data.get('NrT_pos', 0)  # Number of positive transactions
    NrT_neg = json_data.get('NrT_neg', 0)  # Number of negative transactions
    K = json_data.get('K', 1)  # Number of patterns to find

    total_transactions = NrT_pos + NrT_neg
    has_transactions = total_transactions > 0
    # Class imbalance in [0, 1]; computed once and reused for both the
    # accuracy_balance node weight and the constraint-to-constraint edges.
    imbalance = (
        abs(NrT_pos - NrT_neg) / total_transactions if has_transactions else 0.0
    )

    G = nx.Graph()

    # Item nodes (type 0). Without transaction data, weight items by their
    # position: the middle index gets weight 1.0, falling off towards the ends.
    half = NrI // 2
    item_weights = []
    for i in range(NrI):
        centrality = 1.0 - abs(i - half) / (half + 1)
        weight = 0.3 + 0.7 * math.sqrt(centrality)
        item_weights.append(weight)
        G.add_node(f'item_{i}', type=0, weight=weight)

    # Pattern slot nodes (type 1). With fewer patterns each one must be more
    # precise, so the weight decreases as K grows.
    pattern_difficulty = 1.0 - (K - 1) / max(K, 10)
    for k in range(K):
        G.add_node(f'pattern_{k}', type=1, weight=pattern_difficulty)

    # Positive coverage / negative avoidance constraints (type 1), weighted by
    # the share of each class; 0.5 each when there are no transactions.
    pos_pressure = NrT_pos / total_transactions if has_transactions else 0.5
    G.add_node('pos_coverage', type=1, weight=pos_pressure)

    neg_pressure = NrT_neg / total_transactions if has_transactions else 0.5
    G.add_node('neg_avoidance', type=1, weight=neg_pressure)

    # Accuracy balancing node (type 1): more imbalanced = harder.
    balance_difficulty = 0.5 + 0.5 * imbalance if has_transactions else 0.5
    G.add_node('accuracy_balance', type=1, weight=balance_difficulty)

    # Every item can be selected for every pattern. The edge weight depends
    # only on K, so it is computed once outside the loops and capped at 1.0.
    selection_weight = min(0.4 + 0.6 * (K / 10.0), 1.0)
    for i in range(NrI):
        for k in range(K):
            G.add_edge(f'item_{i}', f'pattern_{k}', weight=selection_weight)

    # Connect patterns to the global constraints. The negative-avoidance and
    # balance weights do not depend on k, so they are hoisted out of the loop.
    neg_avoidance = 0.6 + 0.4 * (NrT_neg / max(total_transactions, 1))
    balance_weight = 0.8
    for k in range(K):
        # Earlier patterns are slightly more important for positive coverage.
        pos_contribution = 0.7 + 0.3 * math.exp(-k * 0.1)
        G.add_edge(f'pattern_{k}', 'pos_coverage', weight=pos_contribution)
        G.add_edge(f'pattern_{k}', 'neg_avoidance', weight=neg_avoidance)
        G.add_edge(f'pattern_{k}', 'accuracy_balance', weight=balance_weight)

    # Larger item sets are inherently harder; model that with an extra node.
    if NrI > 50:
        complexity_factor = min(NrI / 100.0, 1.0)
        G.add_node('size_complexity', type=1, weight=complexity_factor)

        # Link the highest-weighted items. Scanning only the first ten indices
        # would pick the lowest-weighted items (weights peak at the middle
        # index), and for NrI >= 60 none of those exceeds 0.7, which would
        # leave size_complexity isolated. Select by weight instead.
        max_linked_items = 10
        heaviest = sorted(
            range(NrI), key=item_weights.__getitem__, reverse=True
        )[:max_linked_items]
        for i in sorted(heaviest):  # ascending index for a stable edge order
            if item_weights[i] > 0.7:
                G.add_edge(f'item_{i}', 'size_complexity', weight=0.6)

    # With a significant class imbalance, tie the global constraints together
    # to model the tension between covering positives and avoiding negatives.
    if has_transactions and imbalance > 0.3:
        G.add_edge('pos_coverage', 'neg_avoidance', weight=imbalance)
        G.add_edge('pos_coverage', 'accuracy_balance', weight=0.7)
        G.add_edge('neg_avoidance', 'accuracy_balance', weight=0.7)

    return G


def main():
    """
    Command-line entry point.

    Expects exactly three arguments: the .mzn file, the .dzn file and the
    JSON file holding the parsed instance data. Prints a usage message and
    exits with status 1 otherwise. Only the JSON file is opened; the .mzn
    path is forwarded to build_graph and the .dzn path is not used.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print("Usage: python converter.py <mzn_file> <dzn_file> <json_file>")
        sys.exit(1)

    mzn_file, _dzn_file, json_file = cli_args

    # Parse the instance parameters from the JSON file.
    with open(json_file, 'r') as handle:
        json_data = json.load(handle)

    graph = build_graph(mzn_file, json_data)

    # Report the size of the resulting graph.
    node_count = graph.number_of_nodes()
    edge_count = graph.number_of_edges()
    print(f"Graph built: {node_count} nodes, {edge_count} edges")


# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()