import numpy as np
import pandas as pd
import networkx as nx
from pgmpy.readwrite import BIFReader
# Raw column / node identifier -> human-readable variable name.
# The same table renames the DataFrame columns and relabels the graph nodes,
# so both end up keyed by the readable names.
rename_mapping_insurance = {
    "GoodStudent":  "Good Student",
    "Age":          "Age",
    "SocioEcon":    "Socioeconomic Status",
    "RiskAversion": "Risk Aversion",
    "VehicleYear":  "Vehicle Age",
    "ThisCarDam":   "This-Car Damage",
    "RuggedAuto":   "Car Ruggedness",
    "Accident":     "Accident Severity",
    "MakeModel":    "Car Model",
    "DrivQuality":  "Driving Quality",
    "Mileage":      "Mileage",
    "Antilock":     "ABS",
    "DrivingSkill": "Driving Skill",
    "SeniorTrain":  "Senior Training",
    "ThisCarCost":  "This-Car Cost",
    "Theft":        "Theft",
    "CarValue":     "Car Value",
    "HomeBase":     "Neighbourhood Type",
    "AntiTheft":    "Anti-Theft System",
    "PropCost":     "Property Cost Ratio",
    "OtherCarCost": "Other-Car Cost",
    "OtherCar":     "Other Cars Involved",
    "MedCost":      "Medical Cost",
    "Cushioning":   "Cushioning",
    "Airbag":       "Airbag",
    "ILiCost":      "Inspection Cost",
    "DrivHist":     "Driving History",
}

# Human-readable variable name -> free-text description of the variable and
# its levels. Each description is assembled from adjacent string literals;
# Python joins those with no separator, so every fragment that continues on
# the next line ends with an explicit space (except directly after an em dash).
variable_description_insurance = {
    # DEMOGRAPHICS & PERSONAL ATTRIBUTES
    "Good Student": (
        "Binary: False / True. Indicates that the primary rated driver qualifies for a ‘good-student’ discount—"
        "typically a full-time high-school or college student maintaining a B (≈3.0) GPA or better. "
        "TRUE is associated with lower claim frequency and premium credits."
    ),

    "Age": (
        "Three-level: Adolescent, Adult, Senior. Age of the principal operator at policy inception. "
        "Adolescent (< 25 yr) reflects inexperience and higher crash risk; Adult (25–64 yr) is actuarial baseline; "
        "Senior (≥ 65 yr) carries elevated risk from slower reflexes but often fewer miles driven. "
        "Older policy-holders are more likely to have accumulated education, tenure, and savings."
    ),

    "Socioeconomic Status": (
        "Four-level: Prole, Middle, UpperMiddle, Wealthy. Proxy for household income / occupational prestige. "
        "Higher status correlates with costlier cars, more liability coverage purchased, and in some studies slightly "
        "lower at-fault accident rates due to newer safety technology and ability to maintain vehicles."
    ),

    "Risk Aversion": (
        "Four-level: Psychopath, Adventurous, Normal, Cautious. Latent personality factor reflecting propensity "
        "for risky driving (speeding, tailgating). Psychopath denotes extreme recklessness; Cautious represents "
        "defensive drivers who brake early and avoid distractions."
    ),

    # VEHICLE CHARACTERISTICS
    "Vehicle Age": (
        "Two-level: Current / Older. CURRENT usually means ≤ 3 model-years old; OLDER is > 3 years. "
        "New cars cost more to repair but often include modern crash-avoidance tech, lowering injury severity."
    ),

    "Car Ruggedness": (
        "Three-level: EggShell, Football, Tank. Describes structural toughness. EggShell = fragile sub-compact; "
        "Football = average unibody sedan; Tank = heavy body-on-frame SUV/truck with high crashworthiness."
    ),

    "Car Model": (
        "Five-level: SportsCar, Economy, FamilySedan, Luxury, SuperLuxury. Captures performance class and MSRP "
        "band. Sports cars show higher frequency of speed-related losses; Luxury tiers drive up physical-damage "
        "severity owing to expensive parts."
    ),

    "Car Value": (
        "Five-level: FiveThou, TenThou, TwentyThou, FiftyThou, Million. Current actual-cash-value (ACV) of the "
        "insured vehicle, rounded to canonical bands used by the model (≈ $5 k, $10 k, $20 k, $50 k, $1 M collector)."
    ),

    "This-Car Cost": (
        "Four-level: Thousand, TenThou, HundredThou, Million. Original sticker price or replacement cost new, including the expected cost of insuring the vehicle. "
        "Used in underwriting to set physical-damage limits and deductibles."
    ),

    # DRIVER PERFORMANCE & EXPERIENCE
    "Driving Quality": (
        "Three-level: Poor, Normal, Excellent. Real-world telematics or observed behavior score combining harsh-brake, "
        "speeding, and phone-use metrics. Excellent drivers earn usage-based-insurance discounts."
    ),

    "Driving Skill": (
        "Three-level: SubStandard, Normal, Expert. Abstract intrinsic skill level (reaction time, situational awareness) "
        "separate from day-to-day quality; Expert could represent professional/advanced defensive-driving course graduates. "
        "Poor driving skill increases the probability of recent violations or accidents."
    ),

    "Driving History": (
        "Three-level: Zero, One, Many. Number of prior moving violations or at-fault claims in the last five policy years. "
        "Heavily weighted in experience rating and renewal premiums."
    ),

    "Senior Training": (
        "Binary: False / True. TRUE if a driver age ≥ 55 has completed an insurer-approved mature-driver safety course, "
        "earning statutory premium reductions in many U.S. states. Taking an optional senior‐driver course is mainly the driver's choice."
    ),

    "Mileage": (
        "Four-level: FiveThou, TwentyThou, FiftyThou, Domino. Estimated annual mileage: ≈ 5 k, 20 k, 50 k, and ‘Domino’ "
        "(> 50 k, e.g., ride-share or delivery). Higher exposure increases expected loss frequency."
    ),

    # SAFETY & SECURITY FEATURES
    "ABS": (
        "Binary: False / True. Presence of anti-lock braking system. TRUE reduces loss severity by shortening stop "
        "distance and maintaining steering control."
    ),

    "Airbag": (
        "Binary: False / True. Indicates front-airbag (and, implicitly, side-curtain) installation. TRUE lowers "
        "injury claims and medical payment severities."
    ),

    "Cushioning": (
        "Four-level: Poor, Fair, Good, Excellent. Composite rating of seat-belt fit, head-rest geometry, and interior "
        "padding. Excellent cushioning mitigates whiplash and soft-tissue injury."
    ),

    "Anti-Theft System": (
        "Binary: False / True. TRUE if the vehicle has factory or aftermarket immobilizer, alarm, or GPS tracking. "
        "Lowers comprehensive (theft) claim frequency and sometimes liability if joy-rides avoided."
    ),

    # INCIDENT / LOSS DETAILS
    "This-Car Damage": (
        "Four-level: None, Mild, Moderate, Severe. Physical condition of the insured vehicle post-accident as appraised "
        "by adjusters. Guides repair cost estimation and salvage decision."
    ),

    "Accident Severity": (
        "Four-level: None, Mild, Moderate, Severe. Overall crash energy and resulting bodily/property damage. "
        "Feeds into probability of high medical or liability payouts."
    ),

    "Theft": (
        "Binary: False / True. TRUE means the loss is primarily a theft or attempted theft rather than a collision."
    ),

    "Other Cars Involved": (
        "Binary: False / True. MULTI-vehicle indicator. TRUE increases the likelihood of liability payouts and "
        "property-damage clustering (domino effect)."
    ),

    "Other-Car Cost": (
        "Four-level: Thousand, TenThou, HundredThou, Million. Estimated repair or total-loss settlement for third-party "
        "vehicle(s) damaged in the same event."
    ),

    "Medical Cost": (
        "Four-level: Thousand, TenThou, HundredThou, Million. Aggregated bodily-injury, personal-injury-protection, or "
        "medical-payments costs for occupants and third parties."
    ),

    "Property Cost Ratio": (
        "Four-level: Thousand, TenThou, HundredThou, Million. Total non-vehicle property damage (e.g., fences, lamp-posts, "
        "buildings) expressed in monetary bands parallel to ‘Other-Car Cost’."
    ),

    "Inspection Cost": (
        "Four-level: Thousand, TenThou, HundredThou, Million. Sum of adjuster time, independent appraisals, forensic "
        "analysis, and any special investigation triggered by the claim."
    ),

    # ENVIRONMENT
    "Neighbourhood Type": (
        "Four-level: Secure, City, Suburb, Rural. Location where the vehicle is primarily garaged or driven. "
        "Secure = gated / guarded community with CCTV; City implies high-density urban core; Suburb mixes residential "
        "streets and arterials; Rural has sparse traffic but higher severe-outcome crashes due to speed limits."
    ),
}

def _ordinal_codes(*levels):
    """Return a new dict mapping each level label to its zero-based position."""
    return {label: code for code, label in enumerate(levels)}


def _boolean_codes():
    """Return a new False/True -> 0/1 table keyed by bools and by their string forms."""
    return {False: 0, True: 1, "False": 0, "True": 1}


# Level orderings shared by several variables. Every variable still receives
# its own dict object, so editing one table never affects another.
_SEVERITY_LEVELS = ("None", "Mild", "Moderate", "Severe")
_COST_BAND_LEVELS = ("Thousand", "TenThou", "HundredThou", "Million")

# Human-readable variable name -> {category label: integer code}.
value_mappings_insurance = {
    # Binary False / True flags
    "Good Student": _boolean_codes(),
    "ABS": _boolean_codes(),
    "Senior Training": _boolean_codes(),
    "Theft": _boolean_codes(),
    "Anti-Theft System": _boolean_codes(),
    "Other Cars Involved": _boolean_codes(),
    "Airbag": _boolean_codes(),

    # Driver demographics and temperament
    "Age": _ordinal_codes("Adolescent", "Adult", "Senior"),
    "Socioeconomic Status": _ordinal_codes("Prole", "Middle", "UpperMiddle", "Wealthy"),
    "Risk Aversion": _ordinal_codes("Psychopath", "Adventurous", "Normal", "Cautious"),

    # Vehicle age (Current / Older)
    "Vehicle Age": _ordinal_codes("Current", "Older"),

    # None / Mild / Moderate / Severe scales
    "This-Car Damage": _ordinal_codes(*_SEVERITY_LEVELS),
    "Accident Severity": _ordinal_codes(*_SEVERITY_LEVELS),

    # Vehicle build and class
    "Car Ruggedness": _ordinal_codes("EggShell", "Football", "Tank"),
    "Car Model": _ordinal_codes("SportsCar", "Economy", "FamilySedan", "Luxury", "SuperLuxury"),

    # Driver behaviour and exposure
    "Driving Quality": _ordinal_codes("Poor", "Normal", "Excellent"),
    "Mileage": _ordinal_codes("FiveThou", "TwentyThou", "FiftyThou", "Domino"),
    "Driving Skill": _ordinal_codes("SubStandard", "Normal", "Expert"),

    # Monetary bands Thousand / TenThou / HundredThou / Million
    "This-Car Cost": _ordinal_codes(*_COST_BAND_LEVELS),
    "Property Cost Ratio": _ordinal_codes(*_COST_BAND_LEVELS),
    "Other-Car Cost": _ordinal_codes(*_COST_BAND_LEVELS),
    "Medical Cost": _ordinal_codes(*_COST_BAND_LEVELS),
    "Inspection Cost": _ordinal_codes(*_COST_BAND_LEVELS),

    # Car value, five bands
    "Car Value": _ordinal_codes("FiveThou", "TenThou", "TwentyThou", "FiftyThou", "Million"),

    # Neighbourhood type
    "Neighbourhood Type": _ordinal_codes("Secure", "City", "Suburb", "Rural"),

    # Cushioning quality
    "Cushioning": _ordinal_codes("Poor", "Fair", "Good", "Excellent"),

    # Driving history
    "Driving History": _ordinal_codes("Zero", "One", "Many"),
}

# One-paragraph summary of the dataset and of how its network was elicited.
dataset_description_insurance = (
    "The Insurance dataset for evaluating car insurance risks and estimating "
    "the expected claim costs for a car insurance policyholder. "
    "Of note, it was elicited from actuaries, not purely from epidemiological "
    "or economic causality. "
    "Arrows were drawn wherever the experts felt more comfortable providing "
    "the conditional probabilities in that direction."
)

def fetch_insurance(
    csv_path="/net/dali/home/mscbio/rul98/CausalLLM/data/insurance.csv",
    bif_path="/net/dali/home/mscbio/rul98/CausalLLM/data/insurance.bif",
    layout_seed=None,
):
    """Load the Insurance samples and the ground-truth graph.

    Args:
        csv_path: Location of the sampled data as CSV. Defaults to the
            original hard-coded cluster path.
        bif_path: Location of the network definition in BIF format. Defaults
            to the original hard-coded cluster path.
        layout_seed: Optional seed forwarded to ``nx.spring_layout`` so the
            node positions are reproducible. ``None`` (the default) keeps the
            previous unseeded behaviour.

    Returns:
        A ``(df, GroundTruth, pos_data)`` tuple:

        * ``df`` - the data with columns renamed via
          ``rename_mapping_insurance`` and every column listed in
          ``value_mappings_insurance`` encoded as integer codes.
        * ``GroundTruth`` - ``nx.DiGraph`` holding the nodes and edges of the
          BIF model, relabelled with the same readable names.
        * ``pos_data`` - the spring-layout positions computed for that graph.

    Raises:
        ValueError: If a mapped column contains a label that has no entry in
            its value mapping (previously this surfaced as an opaque
            NaN-to-int cast error).
    """
    df = pd.read_csv(csv_path)

    # Normalize all missing values (including string "<NA>") to the label
    # "None". NOTE(review): pandas' default NA parsing also treats the literal
    # text "None" as missing, which is presumably why the "None" level of the
    # damage / severity columns has to be restored here - confirm on the data.
    df = df.replace(["<NA>", "nan", pd.NA], "None")

    df = df.rename(columns=rename_mapping_insurance)

    for col, mapping in value_mappings_insurance.items():
        if col not in df.columns:
            continue
        # Compare on the string form so bool-typed columns hit the
        # "False" / "True" keys of the mapping.
        labels = df[col].astype("str")
        codes = labels.map(mapping)
        unmapped = codes.isna()
        if unmapped.any():
            unknown = sorted(labels[unmapped].unique())
            raise ValueError(
                f"Column {col!r} contains labels with no integer code: {unknown}"
            )
        df[col] = codes.astype("int")

    G_model = BIFReader(bif_path).get_model()

    # Copy the model's nodes and edges into a plain directed graph, then
    # switch to the readable names used for the DataFrame columns.
    GroundTruth = nx.DiGraph()
    GroundTruth.add_nodes_from(G_model.nodes())
    GroundTruth.add_edges_from(G_model.edges())
    GroundTruth = nx.relabel_nodes(GroundTruth, rename_mapping_insurance)

    pos_data = nx.spring_layout(GroundTruth, seed=layout_seed)
    return df, GroundTruth, pos_data