#!/usr/bin/env python3
"""
Graph converter for instruction selection problem.
Created using subagent_prompt.md version: v_02

This problem is about code generation and instruction selection for compilers.
It involves selecting instruction patterns (matches) to cover operations while
respecting data flow, block placement, and register allocation constraints.

Key challenges: 
- Complex interdependencies between instruction selection, block placement, and register allocation
- Dominance relationships between blocks
- Location constraints for data placement
- Multiple conflicting objectives (latency, code size)
"""

import sys
import json
import math
import networkx as nx
from pathlib import Path


def _in_range(index, count):
    """Return True when *index* is a valid 0-based id among *count* entities."""
    return 0 <= index < count


def _iter_records(values, width):
    """Yield fixed-width tuples from a constraint array.

    Two layouts are accepted:
    - a flat list, where every ``width`` consecutive entries form one record
      (an incomplete trailing record is dropped);
    - a list of rows, where each row is one record (rows whose length is not
      ``width`` are skipped).
    """
    if not values:
        return
    if isinstance(values[0], (list, tuple)):
        for row in values:
            if isinstance(row, (list, tuple)) and len(row) == width:
                yield tuple(row)
        return
    for start in range(0, len(values) - width + 1, width):
        yield tuple(values[start:start + width])


def _link(G, prefix, index, count, constraint_id, weight):
    """Add an edge ``<prefix>_<index>`` -- ``constraint_id`` if the id is valid.

    Out-of-range ids (including negative ones) are ignored; otherwise
    ``add_edge`` would silently create a node without ``type``/``weight``.
    """
    if _in_range(index, count):
        G.add_edge(f'{prefix}_{index}', constraint_id, weight=weight)


def build_graph(mzn_file, json_data):
    """
    Build graph representation of the instruction selection problem instance.

    Args:
        mzn_file: Path to .mzn file (for reference only; not read here)
        json_data: Dict containing parsed DZN data. Keys read:
            numOperationsInFunction, numDataInFunction, numBlocksInFunction,
            numMatches, numLocations, entryBlockOfFunction, nonCopyMatches,
            inBlock, inBlockSucc, sameLoc, locDomain. Missing keys default
            to 0 / empty. Ids are treated as 0-based.

    Returns:
        An undirected ``networkx.Graph`` whose nodes carry ``type`` and
        ``weight`` attributes and whose edges carry ``weight``:
        - ``op_<i>`` and ``data_<i>`` nodes (type 0)
        - ``match_<i>`` nodes (type 1)
        - ``block_<i>`` nodes (type 2)
        - one node (type 1) per record of inBlockSucc, inBlock, sameLoc and
          locDomain, linked to the matches / blocks / data it mentions
        - a single ``global_complexity`` node (type 1) linked to a bounded
          number of operations, non-copy matches and blocks

    All weights are heuristic scores in [0, 1] derived from entity indices
    and, for locDomain, from the size of the allowed location interval.
    """
    # Problem dimensions
    num_operations = json_data.get('numOperationsInFunction', 0)
    num_data = json_data.get('numDataInFunction', 0)
    num_blocks = json_data.get('numBlocksInFunction', 0)
    num_matches = json_data.get('numMatches', 0)
    num_locations = json_data.get('numLocations', 0)
    entry_block = json_data.get('entryBlockOfFunction', 0)

    # Constraint arrays; "or []" also tolerates an explicit null value
    non_copy_matches = set(json_data.get('nonCopyMatches') or [])
    in_block = json_data.get('inBlock') or []
    in_block_succ = json_data.get('inBlockSucc') or []
    same_loc = json_data.get('sameLoc') or []
    loc_domain = json_data.get('locDomain') or []

    G = nx.Graph()

    # Operation nodes (type 0): weight decreases linearly with the index
    for op in range(num_operations):
        criticality = 1.0 - (op / max(num_operations, 1))
        G.add_node(f'op_{op}', type=0, weight=0.3 + 0.7 * criticality)

    # Data nodes (type 0): weight decays exponentially with the index
    for data in range(num_data):
        importance = math.exp(-2.0 * data / max(num_data, 1))
        G.add_node(f'data_{data}', type=0, weight=0.2 + 0.6 * importance)

    # Block nodes (type 2): weight decays with index distance to the entry
    # block; the entry block itself has distance 0 and thus centrality 1.0
    for block in range(num_blocks):
        distance = abs(block - entry_block)
        centrality = math.exp(-0.5 * distance / max(num_blocks, 1))
        G.add_node(f'block_{block}', type=2, weight=0.4 + 0.6 * centrality)

    # Match nodes (type 1): non-copy matches start from a higher base weight,
    # plus a small bonus that grows with the match index
    for match in range(num_matches):
        base_weight = 0.8 if match in non_copy_matches else 0.3
        specialization = 1.0 - math.exp(-match / max(num_matches, 10))
        weight = base_weight + 0.2 * specialization
        G.add_node(f'match_{match}', type=1, weight=min(weight, 1.0))

    # Block succession constraints: (match, from_block, to_block)
    for match_id, from_block, to_block in _iter_records(in_block_succ, 3):
        constraint_id = f'block_succ_{match_id}_{from_block}_{to_block}'
        G.add_node(constraint_id, type=1, weight=0.9)
        _link(G, 'match', match_id, num_matches, constraint_id, 0.8)
        _link(G, 'block', from_block, num_blocks, constraint_id, 0.7)
        _link(G, 'block', to_block, num_blocks, constraint_id, 0.7)

    # Block placement constraints: (match, block)
    for match_id, block_id in _iter_records(in_block, 2):
        constraint_id = f'in_block_{match_id}_{block_id}'
        G.add_node(constraint_id, type=1, weight=0.6)
        _link(G, 'match', match_id, num_matches, constraint_id, 0.7)
        _link(G, 'block', block_id, num_blocks, constraint_id, 0.6)

    # Same location constraints: (match, data1, data2)
    for match_id, data1, data2 in _iter_records(same_loc, 3):
        constraint_id = f'same_loc_{match_id}_{data1}_{data2}'
        G.add_node(constraint_id, type=1, weight=0.7)
        _link(G, 'match', match_id, num_matches, constraint_id, 0.6)
        _link(G, 'data', data1, num_data, constraint_id, 0.8)
        _link(G, 'data', data2, num_data, constraint_id, 0.8)

    # Location domain constraints: (match, data, min_loc, max_loc)
    for match_id, data_id, min_loc, max_loc in _iter_records(loc_domain, 4):
        # An inverted interval is treated as a singleton domain
        domain_size = max_loc - min_loc + 1 if max_loc >= min_loc else 1

        # Tightness in [0, 1]: the smaller the domain relative to all
        # locations, the tighter; 0.5 when the location count is unknown
        if num_locations > 0:
            tightness = max(0.0, min(1.0, 1.0 - (domain_size / num_locations)))
        else:
            tightness = 0.5

        constraint_id = f'loc_domain_{match_id}_{data_id}_{min_loc}_{max_loc}'
        G.add_node(constraint_id, type=1, weight=0.4 + 0.4 * tightness)
        _link(G, 'match', match_id, num_matches, constraint_id,
              0.5 + 0.2 * tightness)
        _link(G, 'data', data_id, num_data, constraint_id,
              0.6 + 0.2 * tightness)

    # Global node whose weight grows logarithmically with the instance size
    total_entities = num_operations + num_data + num_matches + num_blocks
    complexity_factor = math.log(1 + total_entities) / 10.0
    G.add_node('global_complexity', type=1,
               weight=min(0.5 + complexity_factor, 1.0))

    # Link the first (at most 10) operations
    for op in range(min(num_operations, 10)):
        weight = 0.3 + 0.4 * (1.0 - op / max(num_operations, 1))
        G.add_edge(f'op_{op}', 'global_complexity', weight=weight)

    # Link at most 15 valid non-copy matches. Iterate in sorted order so the
    # selection and the rank-based weights do not depend on set hash order.
    max_match_links = 15
    linked_matches = [
        match for match in sorted(non_copy_matches)
        if _in_range(match, num_matches)
    ][:max_match_links]
    for rank, match in enumerate(linked_matches):
        weight = 0.4 + 0.3 * (rank / 15.0)
        G.add_edge(f'match_{match}', 'global_complexity', weight=weight)

    # Link the first (at most 5) blocks whose distance-based weight is
    # still above 0.1
    for block in range(min(num_blocks, 5)):
        distance = abs(block - entry_block)
        weight = 0.5 * math.exp(-0.3 * distance)
        if weight > 0.1:
            G.add_edge(f'block_{block}', 'global_complexity', weight=weight)

    return G


def main():
    """Command-line entry point: build the graph and print its size.

    Expects exactly three arguments: <mzn_file> <dzn_file> <json_file>.
    Only the JSON file is read; the .mzn path is forwarded to build_graph
    and the .dzn path is accepted but not used. Prints a usage message and
    exits with status 1 when the argument count is wrong.
    """
    if len(sys.argv) != 4:
        print("Usage: python converter.py <mzn_file> <dzn_file> <json_file>")
        sys.exit(1)

    mzn_file, _dzn_file, json_file = sys.argv[1:]

    # Read as UTF-8 explicitly instead of relying on the locale default
    with open(json_file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    G = build_graph(mzn_file, json_data)

    # The graph itself is not written anywhere here; only a summary is shown
    print(f"Graph built: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")


# Run the converter only when this file is executed as a script, so that
# importing the module (e.g. to call build_graph directly) has no side effects.
if __name__ == "__main__":
    main()