import random
import json
from pathlib import Path

# Sample relation list (for testing). Each entry has the "left op right"
# shape that convert_relations() parses, with "<" or ">" as the operator.
# NOTE: no code visible in this file reads this module-level name; the
# functions below receive their own `relations` argument, which shadows it.
relations = ['H < P', 'I > L', 'A < P', 'R < L', 'Y < C', 'V < Y', 'L > Y']

# Obscure/technical-sounding financial institution name pool.
# convert_relations() draws one distinct name per symbol from this list with
# random.sample(), so:
#   - the number of entries caps how many symbols a single call can map
#     (a ValueError is raised beyond that);
#   - entries should stay unique, otherwise two symbols could share a name;
#   - reordering entries changes which names a given random seed selects.
financial_institution_pool = [
    "AlphaCredit", "VanguardCapital", "ZenithTrust", "NorthBridge", "MetroCore",
    "QuantumBank", "CryptoVault", "StreamlineCredit", "ProtoFunds", "DataSieveBank",
    "JetCapital", "IndexTrust", "LoadStoneBank", "FluxMarshal", "YokoFinance",
    "MuxDelta", "HushProxy", "CipherLint", "BitDrain", "CorePhage",
    "JsonSpill", "ParaSync", "VergeProc", "ShardGlue", "TokenTrap",
    "SyncRift", "DepthLink", "GhostBind", "CoreStitch", "RefCast",
    "PackLane", "ShadeVault", "PipeStream", "BindScope", "MemDraft",
    "OptiVault", "WrapTide", "StackTune", "ModLinker", "NebuLock",
    "ForkTrail", "LagVizor", "TickRoll", "HookMesh", "DiskShade",
    "NodeMerge", "FuzzCrunch", "ObfusGrid", "ThreadMint", "StreamShade",
    "GraphRust", "QueueForge", "RangeSeal", "IndexMorph", "NetSealer",
    "BlobFlux", "LoopStitch", "JitWeave", "CryptoSurge", "FluxRigger",
    "SnapVeil", "AuthChute", "CodeCage", "ModProbe", "SockHub",
    "PrefixBank", "CargoWrap", "VectroBit", "LogShard", "WormTrace",
    "MuxCage", "HeapTrail", "NexusBloom", "TunnelBit", "StructFoil",
    "LinkMelt", "TraceStream", "BoundCrate", "BufferPike", "GridSnare"
]

# Financial dependency risk phrases.
# Sentence templates for a "left < right" relation. convert_relations() picks
# one with random.choice() per relation and fills {left} / {right} with the
# mapped institution names via str.format().
less_than_phrases = [
    "{left} depends on {right}.",
    "{left} is exposed to {right}.",
    "{left} relies on {right}.",
    "{left} is at risk from {right}.",
    "{left} needs {right}."
]

# Sentence templates for a "left > right" relation. convert_relations() picks
# one with random.choice() per relation and fills {left} / {right} with the
# mapped institution names via str.format().
greater_than_phrases = [
    "{left} is a risk for {right}.",
    "{left} affects {right}.",
    "{left} exposes {right}.",
    "{left} impacts {right}.",
    "{left} can harm {right}."
]

def convert_relations(relations, nodes_label=None):
    """Convert symbolic relations to natural language descriptions.

    Every symbol is assigned a distinct name drawn at random from
    ``financial_institution_pool``. Each relation is then rendered with a
    randomly chosen template from ``less_than_phrases`` (operator ``<``) or
    ``greater_than_phrases`` (operator ``>``).

    Args:
        relations: Iterable of strings shaped ``"left op right"``, the three
            tokens separated by whitespace.
        nodes_label: Optional collection of symbols. When non-empty it alone
            decides which symbols receive a name (so nodes that only occur in
            cycles are mapped too); otherwise the symbols are collected from
            the three-token entries of ``relations``.

    Returns:
        A ``(sentences, name_map)`` tuple: one sentence per relation in input
        order, and the ``symbol -> institution name`` dict that was used.

    Raises:
        ValueError: If there are more symbols than names in the pool, if a
            relation does not consist of exactly three tokens, or if its
            operator is neither ``<`` nor ``>``.
        KeyError: If a relation references a symbol absent from the mapping.
    """
    # Tokenise every relation once; symbol collection and rendering share it.
    parsed = [(rel, rel.split()) for rel in relations]

    if nodes_label:
        # Use all nodes from the label to ensure we have mappings for cycles.
        all_symbols = set(nodes_label)
    else:
        all_symbols = set()
        for _, parts in parsed:
            if len(parts) == 3:
                all_symbols.update((parts[0], parts[2]))

    # Sorting fixes the symbol order so a seeded RNG yields the same mapping.
    symbols = sorted(all_symbols)

    # Ensure we have enough unique names for all symbols
    if len(symbols) > len(financial_institution_pool):
        raise ValueError(f"Not enough unique financial institution names ({len(financial_institution_pool)}) for all symbols ({len(symbols)})")

    name_map = dict(zip(symbols, random.sample(financial_institution_pool, len(symbols))))

    output = []
    for rel, parts in parsed:
        # Checked here (not before sampling) so this validation does not
        # alter the sequence of random draws.
        if len(parts) != 3:
            raise ValueError(
                f"Malformed relation (expected 'left op right'): {rel!r}"
            )
        left, op, right = parts

        # Validate that nodes exist in mapping
        if left not in name_map:
            raise KeyError(f"Node '{left}' from relation '{rel}' not found in node mapping")
        if right not in name_map:
            raise KeyError(f"Node '{right}' from relation '{rel}' not found in node mapping")

        if op == "<":
            templates = less_than_phrases
        elif op == ">":
            templates = greater_than_phrases
        else:
            raise ValueError(f"Unknown operator in relation: {rel}")

        output.append(
            random.choice(templates).format(left=name_map[left], right=name_map[right])
        )

    return output, name_map

def convert_cycles(cycles, name_map):
    """Rename every node of every cycle through ``name_map``.

    Args:
        cycles: Iterable of cycles, each an iterable of node symbols.
        name_map: Dict translating a node symbol to its replacement name.

    Returns:
        A list with one list per cycle, holding the translated names in the
        same order as the input nodes.

    Raises:
        KeyError: If a node has no entry in ``name_map``.
    """
    def _rename(node):
        # Explicit membership test so the error names the offending node.
        if node in name_map:
            return name_map[node]
        raise KeyError(f"Node '{node}' from cycle not found in node mapping")

    return [[_rename(node) for node in cycle] for cycle in cycles]

def convert_comparison(comparison, name_map):
    """Convert a comparison query to a natural language question.

    Args:
        comparison: String shaped ``"left op right"`` (whitespace-separated),
            or a falsy value such as ``""`` or ``None`` when the row has no
            query.
        name_map: Dict translating a node symbol to its institution name.

    Returns:
        The question string. A falsy ``comparison`` is returned unchanged.

    Raises:
        ValueError: If ``comparison`` does not consist of exactly three
            tokens, or if its operator is neither ``<`` nor ``>``.
        KeyError: If either operand has no entry in ``name_map``.
    """
    if not comparison:
        return comparison

    parts = comparison.split()
    if len(parts) != 3:
        # Report the offending query instead of a bare tuple-unpacking error.
        raise ValueError(
            f"Malformed comparison (expected 'left op right'): {comparison!r}"
        )
    left, op, right = parts

    # Validate that nodes exist in mapping
    if left not in name_map:
        raise KeyError(f"Node '{left}' from comparison not found in node mapping")
    if right not in name_map:
        raise KeyError(f"Node '{right}' from comparison not found in node mapping")

    left_name = name_map[left]
    right_name = name_map[right]

    # Convert to short natural language questions about financial dependency
    if op == "<":
        return f"Is {left_name} less risky than {right_name}?"
    if op == ">":
        return f"Is {left_name} riskier than {right_name}?"
    raise ValueError(f"Unknown operator in comparison: {comparison}")

def process_dataset_row(row):
    """Translate one dataset row into its natural-language counterpart.

    The ``relationships``, ``cycles``, ``nodes_label`` and (optional)
    ``comparison`` columns are rewritten through one shared symbol -> name
    mapping. Their original values are kept under an ``origin_`` prefix,
    every other column is copied as-is, and the mapping itself is stored
    under ``node_mapping``.

    Args:
        row: Dict with at least the keys ``relationships``, ``nodes_label``
            and ``cycles``; ``comparison`` may be absent.

    Returns:
        A new dict; ``row`` itself is not modified.

    Raises:
        KeyError: If a required column is missing or a node cannot be mapped.
        ValueError: Propagated from the relation / comparison converters.
    """
    # nodes_label is passed along so every listed node gets a name, not just
    # the ones that occur in a relationship.
    sentences, name_map = convert_relations(row['relationships'], row['nodes_label'])
    named_cycles = convert_cycles(row['cycles'], name_map)
    question = convert_comparison(row.get('comparison', ''), name_map)

    named_labels = []
    for node in row['nodes_label']:
        if node in name_map:
            named_labels.append(name_map[node])
        else:
            raise KeyError(f"Node '{node}' from nodes_label not found in node mapping")

    # Columns whose original value moves to an origin_-prefixed key.
    rewritten = {'relationships', 'cycles', 'nodes_label', 'comparison'}
    new_row = {
        (f'origin_{key}' if key in rewritten else key): value
        for key, value in row.items()
    }

    # Converted values go in last, under the original column names.
    new_row['relationships'] = sentences
    new_row['cycles'] = named_cycles
    new_row['nodes_label'] = named_labels
    new_row['comparison'] = question
    new_row['node_mapping'] = name_map

    return new_row

def process_jsonl_file(input_path, output_path):
    """Process a JSONL file and write the converted rows to another JSONL file.

    Each non-blank input line is parsed as JSON, converted with
    ``process_dataset_row`` and written as one JSON line. Processing is
    best-effort: a line that fails to parse or convert is reported on stdout
    with its line number and skipped, and the remaining lines are still
    processed.

    Args:
        input_path: Path (str or ``Path``) of the JSONL file to read.
        output_path: Path (str or ``Path``) of the JSONL file to write. Missing
            parent directories are created; an existing file is overwritten.

    Returns:
        The number of rows written to ``output_path``.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    if not input_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    # Create output directory if it doesn't exist
    output_path.parent.mkdir(parents=True, exist_ok=True)

    processed_count = 0

    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:

        for line_num, line in enumerate(infile, 1):
            payload = line.strip()
            if not payload:
                # A blank line holds no record; skip it rather than report a
                # JSON parse error. Line numbers still count it.
                continue

            try:
                row = json.loads(payload)
                converted_row = process_dataset_row(row)
                outfile.write(json.dumps(converted_row, ensure_ascii=False) + '\n')
                processed_count += 1
            except Exception as e:
                # Deliberately broad: one bad row must not abort the file.
                print(f"Error processing line {line_num}: {e}")

    print(f"Successfully processed {processed_count} rows from {input_path} to {output_path}")
    return processed_count

def main(input_file="task3.jsonl", output_file="task3_nl_finance.jsonl", seed=42):
    """Run the conversion and return a process exit status.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write.
        seed: Value handed to ``random.seed()`` before any row is processed,
            so that name and phrase choices are reproducible between runs.

    Returns:
        ``0`` when the file was processed, ``1`` when ``process_jsonl_file``
        raised (the error is printed to stdout).
    """
    # Set random seed for reproducible results
    random.seed(seed)

    try:
        process_jsonl_file(input_file, output_file)
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"Error: {e}")
        return 1

    return 0

# Script entry point: run main() and hand its return value to sys.exit() as
# the process exit status. `sys` is imported here because no other code
# visible in this file uses it.
if __name__ == "__main__":
    import sys
    sys.exit(main())
