#!/usr/bin/env python3
"""
Graph converter for median_string_dp problem.
Created using subagent_prompt.md version: v_02

This problem is about finding a median string that minimizes the sum of edit distances
to a collection of input strings. Key challenges: the median string must balance 
similarity to all input strings, and edit distance computation involves complex 
dynamic programming constraints.

Graph strategy:
- Position nodes (Type 0): Each position in the median string and input strings
- String nodes (Type 2): Resource-like nodes representing each input string
- Edit distance constraint nodes (Type 1): One per input string, modeling the LCS/edit distance constraint
- Character constraint nodes (Type 1): Constraints on character positions and median length
"""

import sys
import json
import math
import networkx as nx
from pathlib import Path


def build_graph(mzn_file, json_data):
    """
    Build the graph representation of a median string problem instance.

    Args:
        mzn_file: Path to the .mzn file. Accepted for interface compatibility;
            it is not read by this function.
        json_data: Dict containing the parsed DZN data. Keys read (default in
            parentheses): 'num_strings' (0), 'max_length_strings' (0),
            'max_length_median' (0), 'max_char' (1), 'str_length' ([]) and
            'strings' ([]). 'strings' is consumed as a flat list that is cut
            into num_strings consecutive rows of max_length_strings entries;
            entries that are not > 0 are treated as padding.

    Returns:
        networkx.Graph in which every explicitly added node has 'type' and
        'weight' attributes and every edge has a 'weight' attribute.

    Nodes:
        - 'median_pos_{p}' (type 0): one per median position.
        - 'string_{s}_pos_{p}' (type 0): one per non-padding character that
          lies within the declared length of input string s.
        - 'string_{s}' (type 2): one per input string.
        - 'edit_dist_constraint_{s}' (type 1): one per input string.
        - 'median_length_constraint', 'char_domain_constraint' (type 1).

    Edges:
        - median position -> every edit distance constraint
        - string position -> its own edit distance constraint and string node
        - median position -> length constraint and domain constraint
        - string -> string "conflict" edges when the combined length / character
          set dissimilarity exceeds 0.6

    Ratios taken relative to max_length_strings use max(max_length_strings, 1)
    as denominator, so an empty or degenerate instance produces zero ratios
    instead of raising ZeroDivisionError.
    """
    # Access data from json_data
    num_strings = json_data.get('num_strings', 0)
    max_length_strings = json_data.get('max_length_strings', 0)
    max_length_median = json_data.get('max_length_median', 0)
    max_char = json_data.get('max_char', 1)
    str_length = json_data.get('str_length', [])
    strings_flat = json_data.get('strings', [])

    # Shared, zero-safe denominator for all "relative to the longest input
    # string" ratios (edit-distance difficulty, length tightness, conflicts).
    length_scale = max(max_length_strings, 1)

    # Convert flattened strings array back to 2D (one row per input string)
    strings = [
        strings_flat[s * max_length_strings:(s + 1) * max_length_strings]
        for s in range(num_strings)
    ]

    # Declared length of each string; fall back to the row width when the
    # str_length array is shorter than num_strings.
    lengths = [
        str_length[s] if s < len(str_length) else max_length_strings
        for s in range(num_strings)
    ]

    # (position, character) pairs of the non-padding characters that fall
    # inside each string's declared length and inside the available row data.
    real_chars = [
        [
            (pos, row[pos])
            for pos in range(min(length, len(row)))
            if row[pos] > 0
        ]
        for row, length in zip(strings, lengths)
    ]

    # Character distribution statistics over all rows (padding excluded)
    char_counts = {}
    for row in strings:
        for c in row:
            if c > 0:
                char_counts[c] = char_counts.get(c, 0) + 1
    total_chars = sum(char_counts.values())

    G = nx.Graph()

    # 1. Median position nodes (Type 0) - decision variables
    for pos in range(max_length_median):
        # Weight by position criticality - early positions more constrained
        position_weight = 1.0 - (pos / max_length_median) * 0.5
        G.add_node(f'median_pos_{pos}', type=0, weight=position_weight)

    # 2. String position nodes (Type 0) - for modeling string structure
    for s, chars in enumerate(real_chars):
        for pos, char_val in chars:
            # Weight by character rarity (rare characters are more constraining).
            # Every char_val here was counted above, so total_chars >= 1.
            char_rarity = 1.0 - (char_counts[char_val] / total_chars)
            G.add_node(f'string_{s}_pos_{pos}', type=0, weight=char_rarity)

    # 3. String resource nodes (Type 2) - represent each input string
    avg_length = sum(str_length) / len(str_length) if str_length else 1
    for s, length in enumerate(lengths):
        # Weight by length deviation from average (outliers are harder to satisfy)
        length_deviation = abs(length - avg_length) / max(avg_length, 1)
        string_weight = 0.5 + min(length_deviation, 0.5)
        G.add_node(f'string_{s}', type=2, weight=string_weight)

    # 4. Edit distance constraint nodes (Type 1) - one per string
    for s, length in enumerate(lengths):
        # Longer strings and more diverse character sets are harder to align.
        # Padding is excluded from the distinct-character count, matching how
        # every other part of this function treats 0 entries.
        distinct_chars = {char_val for _, char_val in real_chars[s]}
        char_diversity = len(distinct_chars) / max(max_char, 1)
        length_factor = length / length_scale
        difficulty = (char_diversity + length_factor) / 2.0
        G.add_node(f'edit_dist_constraint_{s}', type=1, weight=difficulty)

    # 5. Character position constraints (Type 1) - for median length bounds
    length_tightness = max_length_median / length_scale
    G.add_node('median_length_constraint', type=1, weight=length_tightness)

    # 6. Character value constraints (Type 1) - for character domain
    char_domain_complexity = math.log(max_char + 1) / math.log(5)  # log base 5 of domain size
    G.add_node('char_domain_constraint', type=1, weight=char_domain_complexity)

    # Add edges for relationships

    # Connect median positions to edit distance constraints (participation)
    for pos in range(max_length_median):
        # Edge weight based on position importance in edit distance calculation
        pos_importance = 1.0 - (pos / max_length_median) * 0.3
        for s in range(num_strings):
            G.add_edge(f'median_pos_{pos}', f'edit_dist_constraint_{s}',
                       weight=pos_importance)

    # Connect string positions to their edit distance constraints
    for s, chars in enumerate(real_chars):
        for pos, char_val in chars:
            # Weight by character frequency (common chars create stronger connections)
            char_freq = char_counts[char_val] / total_chars
            G.add_edge(f'string_{s}_pos_{pos}', f'edit_dist_constraint_{s}',
                       weight=char_freq)

    # Connect string positions to their string resource nodes
    for s, chars in enumerate(real_chars):
        for pos, _ in chars:
            # Weight by position in string (early positions more important).
            # lengths[s] > pos >= 0 here, so the division is safe.
            pos_weight = 1.0 - (pos / lengths[s]) * 0.4
            G.add_edge(f'string_{s}_pos_{pos}', f'string_{s}', weight=pos_weight)

    # Connect median positions to length and domain constraints
    for pos in range(max_length_median):
        # Length constraint connection - stronger for later positions
        length_weight = 0.3 + (pos / max_length_median) * 0.7
        G.add_edge(f'median_pos_{pos}', 'median_length_constraint', weight=length_weight)

        # Domain constraint connection - uniform weight
        G.add_edge(f'median_pos_{pos}', 'char_domain_constraint', weight=0.8)

    # Add conflict edges between strings with very different lengths or
    # character distributions. Character sets are built once per string
    # rather than once per pair.
    char_sets = [
        set(row[:length]) - {0}
        for row, length in zip(strings, lengths)
    ]
    for s1 in range(num_strings):
        for s2 in range(s1 + 1, num_strings):
            # Length difference relative to the longest possible string
            length_diff = abs(lengths[s1] - lengths[s2]) / length_scale

            # Character set dissimilarity (1 - Jaccard overlap); neutral 0.5
            # when either string has no characters to compare.
            chars1, chars2 = char_sets[s1], char_sets[s2]
            if chars1 and chars2:
                overlap = len(chars1 & chars2) / len(chars1 | chars2)
                char_conflict = 1.0 - overlap
            else:
                char_conflict = 0.5

            # Add conflict edge if strings are very different
            conflict_strength = (length_diff + char_conflict) / 2.0
            if conflict_strength > 0.6:  # Only add significant conflicts
                G.add_edge(f'string_{s1}', f'string_{s2}', weight=conflict_strength)

    return G


def main():
    """
    Command-line entry point: load the instance JSON and build its graph.

    Expects exactly three command-line arguments:
    <mzn_file> <dzn_file> <json_file>. Only the JSON file is read here; the
    .mzn path is forwarded to build_graph() and the .dzn path is accepted but
    not used. With any other argument count a usage message is printed and
    the process exits with status 1. On success the node and edge counts of
    the built graph are printed.
    """
    if len(sys.argv) != 4:
        print("Usage: python converter.py <mzn_file> <dzn_file> <json_file>")
        sys.exit(1)

    # The .dzn path is part of the calling convention but is not needed here.
    mzn_file, _dzn_file, json_file = sys.argv[1:]

    # Load JSON data. JSON text is UTF-8, so decode it explicitly instead of
    # relying on the platform's default locale encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Build graph
    G = build_graph(mzn_file, json_data)

    # Graph is returned by build_graph for direct feature extraction
    print(f"Graph built: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()