#!/usr/bin/env python3
"""
Graph converter for Japanese character encoding detection problem.
Converter created with subagent_prompt.md v_02

This problem is about recovering original character encodings from mixed byte streams.
Key challenges: Multiple overlapping encoding schemes, byte sequence dependencies, optimization objective with penalties.
"""

import sys
import json
import math
import networkx as nx
from pathlib import Path


# Node weights for the fixed set of per-encoding constraint nodes.
# Insertion order is preserved when the nodes are added to the graph.
_ENCODING_WEIGHTS = {
    'ascii': 0.2,      # simplest
    'euc_jp': 0.6,     # moderate complexity
    'sjis': 0.7,       # more complex
    'utf8_2': 0.5,     # 2-byte sequences
    'utf8_3': 0.8,     # 3-byte sequences
    'utf8_4': 1.0,     # 4-byte sequences (most complex)
    'unknown': 0.9,    # penalty for unknown
}


def _is_euc_jp_byte(byte_val):
    """Return True if byte_val lies in the ranges this converter treats as EUC-JP."""
    return (
        (161 <= byte_val <= 168)
        or byte_val == 173
        or (176 <= byte_val <= 244)
        or (249 <= byte_val <= 252)
    )


def _is_sjis_byte(byte_val):
    """Return True if byte_val lies in the ranges this converter treats as SJIS."""
    return (
        (161 <= byte_val <= 223)
        or (129 <= byte_val <= 159)
        or (224 <= byte_val <= 252)
    )


def _byte_difficulty(byte_val):
    """Return the node weight for a byte-position variable."""
    if byte_val < 128:
        return 0.3  # ASCII bytes are the easy case
    if 224 <= byte_val <= 239:
        return 0.9  # 3-byte UTF-8 start
    if 240 <= byte_val <= 247:
        return 1.0  # 4-byte UTF-8 start
    return 0.8      # any other high byte


def _sequence_weight(byte_val):
    """Return the node weight for the sequence constraint starting at byte_val."""
    if byte_val < 128:
        return 0.3  # ASCII
    if 194 <= byte_val <= 223:
        return 0.6  # UTF-8 2-byte start
    if 224 <= byte_val <= 239:
        return 0.8  # UTF-8 3-byte start
    if 240 <= byte_val <= 247:
        return 1.0  # UTF-8 4-byte start
    return 0.7      # EUC-JP/SJIS sequences


def build_graph(mzn_file, json_data):
    """
    Build graph representation of the Japanese encoding detection problem.

    Args:
        mzn_file: Path to .mzn file (for reference only; not read here)
        json_data: Dict containing parsed DZN data. The keys used are
            'len' (number of byte positions) and 'stream' (byte values).

    Returns:
        An undirected networkx Graph. Every node carries `type` and `weight`
        attributes, every edge carries a `weight` attribute. If 'len' is 0 or
        'stream' is empty/missing, the graph holds a single node 'empty'.

    Strategy: Model byte positions as variables, create constraints for encoding rules
    - Byte positions are decision variables (type 0): 'byte_<i>'
    - Encoding constraints are explicit constraint nodes (type 1): 'constraint_<enc>'
    - Sequential dependencies between adjacent bytes: 'seq_constraint_<i>'
    - A global 'objective_constraint' node linked to every byte
    - One 'conflict_<i>' node per non-ASCII byte, linked to competing encodings
    """
    length = json_data.get('len', 0)
    stream = json_data.get('stream', [])

    G = nx.Graph()

    if length == 0 or not stream:
        # Minimal graph for the empty case
        G.add_node('empty', type=0, weight=0.5)
        return G

    # Resolve each position's byte value once; positions past the end of
    # `stream` are treated as byte 0.
    byte_vals = [stream[i] if i < len(stream) else 0 for i in range(length)]

    # Byte position nodes (type 0 - variables), weighted by encoding difficulty
    for i, byte_val in enumerate(byte_vals):
        G.add_node(f'byte_{i}', type=0, weight=_byte_difficulty(byte_val))

    # Encoding constraint nodes (type 1)
    for enc_type, weight in _ENCODING_WEIGHTS.items():
        G.add_node(f'constraint_{enc_type}', type=1, weight=weight)

    # One sequence constraint per adjacent byte pair, weighted by the
    # first byte of the pair
    for i in range(length - 1):
        constraint_name = f'seq_constraint_{i}'
        G.add_node(constraint_name, type=1, weight=_sequence_weight(byte_vals[i]))
        G.add_edge(f'byte_{i}', constraint_name, weight=0.8)
        G.add_edge(f'byte_{i+1}', constraint_name, weight=0.8)

    # Connect bytes to encoding constraints based on valid ranges
    for i, byte_val in enumerate(byte_vals):
        byte_node = f'byte_{i}'

        # ASCII constraint (0-127)
        if byte_val < 128:
            G.add_edge(byte_node, 'constraint_ascii', weight=0.9)

        if _is_euc_jp_byte(byte_val):
            G.add_edge(byte_node, 'constraint_euc_jp', weight=0.7)

        if _is_sjis_byte(byte_val):
            G.add_edge(byte_node, 'constraint_sjis', weight=0.7)

        # UTF-8 ranges
        if 194 <= byte_val <= 223:  # 2-byte start
            G.add_edge(byte_node, 'constraint_utf8_2', weight=0.6)
        elif 224 <= byte_val <= 239:  # 3-byte start
            G.add_edge(byte_node, 'constraint_utf8_3', weight=0.8)
        elif 240 <= byte_val <= 247:  # 4-byte start
            G.add_edge(byte_node, 'constraint_utf8_4', weight=1.0)
        elif 128 <= byte_val <= 191:  # continuation byte
            # A continuation byte may belong to any multi-byte form, so it
            # is linked to all UTF-8 constraints with a lower weight.
            for utf_type in ['utf8_2', 'utf8_3', 'utf8_4']:
                G.add_edge(byte_node, f'constraint_{utf_type}', weight=0.4)

        # Every byte can potentially be "unknown"; the weight grows with the
        # byte value and is capped at 0.9.
        unknown_weight = min(0.9, 0.3 + (byte_val / 255.0) * 0.6)
        G.add_edge(byte_node, 'constraint_unknown', weight=unknown_weight)

    # Global objective constraint node, connected to every byte
    G.add_node('objective_constraint', type=1, weight=1.0)
    for i, byte_val in enumerate(byte_vals):
        # Higher byte values get a higher penalty weight, capped at 1.0
        penalty_weight = min(1.0, 0.4 + (byte_val / 255.0) * 0.4)
        G.add_edge(f'byte_{i}', 'objective_constraint', weight=penalty_weight)

    # Conflict structure: only non-ASCII bytes can belong to several encodings
    for i, byte_val in enumerate(byte_vals):
        if byte_val < 128:
            continue

        conflict_node = f'conflict_{i}'
        G.add_node(conflict_node, type=1, weight=0.8)
        G.add_edge(f'byte_{i}', conflict_node, weight=0.6)

        competing_encodings = []
        if _is_euc_jp_byte(byte_val):
            competing_encodings.append('constraint_euc_jp')
        if _is_sjis_byte(byte_val):
            competing_encodings.append('constraint_sjis')
        # NOTE(review): any byte >= 194 is tied to all three UTF-8
        # constraints here, which is broader than the lead-byte ranges used
        # for the direct byte->constraint edges above - confirm intended.
        if byte_val >= 194:
            competing_encodings.extend(
                ['constraint_utf8_2', 'constraint_utf8_3', 'constraint_utf8_4']
            )

        # All encoding constraint nodes were created above, so no
        # existence check is needed before linking.
        for enc in competing_encodings:
            G.add_edge(conflict_node, enc, weight=0.5)

    return G


def main():
    """
    Command-line entry point: load the JSON instance and build its graph.

    Expects exactly three arguments: <mzn_file> <dzn_file> <json_file>.
    Prints a usage line and exits with status 1 otherwise. Only the JSON
    file is read; the .mzn path is passed through to build_graph and the
    .dzn path is accepted but not used.
    """
    if len(sys.argv) != 4:
        print("Usage: python converter.py <mzn_file> <dzn_file> <json_file>")
        sys.exit(1)

    mzn_file, _dzn_file, json_file = sys.argv[1:]

    # Read the JSON as UTF-8 explicitly rather than relying on the
    # platform's locale-dependent default text encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Build graph
    G = build_graph(mzn_file, json_data)

    # Graph is returned by build_graph for direct feature extraction
    print(f"Graph built: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

if __name__ == "__main__":
    main()