import numpy as np
import pandas as pd
import networkx as nx
from pgmpy.readwrite import BIFReader
# Raw column code -> human-readable display name.
# Every raw code has the form "<PREFIX>_HH_MM" and every display name has the
# form "<label> at HH:MM", so the table is generated from one label per prefix
# crossed with the four time steps (prefix-major order).
_WATER_RENAME_LABELS = (
    ('C_NI', 'Inorganic Nitrogen Ion Count'),
    ('CKNI', 'Kjeldahl Inorganic Nitrogen Concentration (mg/L)'),
    ('CBODD', 'Carbonaceous Biochemical Oxygen Demand for Degradable Organic Matter (mg/L)'),
    ('CKND', 'Kjeldahl Nitrogen Degradable Concentration (mg/L)'),
    ('CNOD', 'Nitrogenous Oxygen Demand (mg/L)'),
    ('CBODN', 'Carbonaceous Biochemical Oxygen Demand for Nitrogenous Matter (mg/L)'),
    ('CKNN', 'Kjeldahl Nitrogen Non-degradable Concentration (mg/L)'),
    ('CNON', 'Non-degradable Organic Nitrogen Concentration (mg/L)'),
)
_WATER_RENAME_TIMES = ('12:00', '12:15', '12:30', '12:45')

rename_mapping_water = {
    f"{prefix}_{stamp.replace(':', '_')}": f"{label} at {stamp}"
    for prefix, label in _WATER_RENAME_LABELS
    for stamp in _WATER_RENAME_TIMES
}

# Natural-language description of every raw water-network variable, keyed by
# its raw column code ("<PREFIX>_HH_MM").
#
# All four time steps of a variable share the same wording apart from the time
# stamp, so each description is rendered from one (subject, value sentence)
# pair per prefix. The closing sentence lives in a single place, which also
# fixes the "influces" typo that was repeated in every hand-written entry.
_ILS_DESC_PARTS = (
    (
        'C_NI',
        'the number of nitrogen inorganic ions',
        'The values are discrete (3, 4, 5, and 6), and could possibly represent the concentration levels of nitrogen inorganic ions.',
    ),
    (
        'CKNI',
        'the concentration of Kjeldahl nitrogen inorganic (CKNI) ions',
        'The values are discrete (20_MG_L, 30_MG_L, and 40_MG_L), indicating concentrations of 20, 30, and 40 mg/L.',
    ),
    (
        'CBODD',
        'the carbonaceous biochemical oxygen demand (CBOD) for degradable organic matter',
        'The values are discrete (15_MG_L, 20_MG_L, 25_MG_L, and 30_MG_L), indicating concentrations of 15, 20, 25, and 30 mg/L.',
    ),
    (
        'CKND',
        'the concentration of Kjeldahl nitrogen degradable (CKND) ions',
        'The values are discrete (2_MG_L, 4_MG_L, and 6_MG_L), indicating concentrations of 2, 4, and 6 mg/L.',
    ),
    (
        'CNOD',
        'the concentration of nitrogenous oxygen demand (CNOD)',
        'The values are discrete (0_5_MG_L, 1_MG_L, 2_MG_L, and 4_MG_L), indicating concentrations of 0.5, 1, 2, and 4 mg/L.',
    ),
    (
        'CBODN',
        'the carbonaceous biochemical oxygen demand for nitrogenous (CBODN) matter',
        'The values are discrete (5_MG_L, 10_MG_L, 15_MG_L, and 20_MG_L), indicating concentrations of 5, 10, 15, and 20 mg/L.',
    ),
    (
        'CKNN',
        'the concentration of Kjeldahl nitrogen non-degradable (CKNN) ions',
        'The values are discrete (0_5_MG_L, 1_MG_L, and 2_MG_L), indicating concentrations of 0.5, 1, and 2 mg/L.',
    ),
    (
        'CNON',
        'the concentration of nitrogenous non-degradable organic nitrogen (CNON)',
        'The values are discrete (2_MG_L, 4_MG_L, 6_MG_L, and 10_MG_L), indicating concentrations of 2, 4, 6, and 10 mg/L.',
    ),
)
_ILS_DESC_TIMES = ('12:00', '12:15', '12:30', '12:45')

ils_desc = {
    f"{prefix}_{stamp.replace(':', '_')}": (
        f"This variable represents {subject} at the time of {stamp}. "
        f"{values} "
        "This variable only influences the variables in the next time step."
    )
    for prefix, subject, values in _ILS_DESC_PARTS
    for stamp in _ILS_DESC_TIMES
}

# Variable descriptions re-keyed by display name instead of raw column code.
# Raw codes that have no entry in rename_mapping_water are left out.
variable_description_water = {
    rename_mapping_water[raw_code]: description
    for raw_code, description in ils_desc.items()
    if raw_code in rename_mapping_water
}

# Display name -> {state label: integer code}.
# For each variable the states are listed per time step (12:00, 12:15, 12:30,
# 12:45) in the order that defines their codes: the i-th state gets code i.
#
# NOTE(review): several entries list fewer states than the variable
# descriptions mention (e.g. a single state at 12:00). Labels missing from a
# mapping cannot be translated by a dict-based Series.map and come out as
# missing values -- presumably those states never occur in the data; confirm.
_WATER_STATE_TIMES = ('12:00', '12:15', '12:30', '12:45')
_WATER_STATE_LEVELS = (
    ('Inorganic Nitrogen Ion Count', (
        ('3', '4', '5', '6'),
        ('3', '4', '5', '6'),
        ('3', '4', '5', '6'),
        ('3', '4', '5', '6'),
    )),
    ('Kjeldahl Inorganic Nitrogen Concentration (mg/L)', (
        ('20_MG_L', '30_MG_L', '40_MG_L'),
        ('20_MG_L', '30_MG_L', '40_MG_L'),
        ('20_MG_L', '30_MG_L', '40_MG_L'),
        ('20_MG_L', '30_MG_L', '40_MG_L'),
    )),
    ('Carbonaceous Biochemical Oxygen Demand for Degradable Organic Matter (mg/L)', (
        ('20_MG_L',),
        ('15_MG_L', '20_MG_L', '25_MG_L'),
        ('15_MG_L', '20_MG_L', '25_MG_L', '30_MG_L'),
        ('15_MG_L', '20_MG_L', '25_MG_L', '30_MG_L'),
    )),
    ('Kjeldahl Nitrogen Degradable Concentration (mg/L)', (
        ('4_MG_L',),
        ('4_MG_L', '6_MG_L'),
        ('4_MG_L', '6_MG_L'),
        ('4_MG_L', '6_MG_L'),
    )),
    ('Nitrogenous Oxygen Demand (mg/L)', (
        ('1_MG_L',),
        ('0_5_MG_L', '1_MG_L'),
        ('0_5_MG_L', '1_MG_L'),
        ('0_5_MG_L', '1_MG_L'),
    )),
    ('Carbonaceous Biochemical Oxygen Demand for Nitrogenous Matter (mg/L)', (
        ('10_MG_L',),
        ('10_MG_L', '15_MG_L'),
        ('5_MG_L', '10_MG_L', '15_MG_L'),
        ('5_MG_L', '10_MG_L', '15_MG_L', '20_MG_L'),
    )),
    ('Kjeldahl Nitrogen Non-degradable Concentration (mg/L)', (
        ('1_MG_L',),
        ('0_5_MG_L', '1_MG_L'),
        ('0_5_MG_L', '1_MG_L'),
        ('0_5_MG_L', '1_MG_L'),
    )),
    ('Non-degradable Organic Nitrogen Concentration (mg/L)', (
        ('4_MG_L',),
        ('4_MG_L', '6_MG_L'),
        ('2_MG_L', '4_MG_L', '6_MG_L'),
        ('2_MG_L', '4_MG_L', '6_MG_L'),
    )),
)

value_mappings_water = {
    f'{label} at {stamp}': {state: code for code, state in enumerate(states)}
    for label, states_per_step in _WATER_STATE_LEVELS
    for stamp, states in zip(_WATER_STATE_TIMES, states_per_step)
}

# Short free-text summary of the water dataset, assembled sentence by sentence.
dataset_description_water = " ".join((
    "A Bayesian network modeling water quality in some wastewater treatment control system "
    "by capturing interactions among nitrogen compounds and oxygen demand metrics "
    "in four time steps (12:00, 12:15, 12:30, 12:45).",
    "It reflects how concentrations of various chemical species influence each other over time, "
    "supporting analysis of aquatic pollutant behavior.",
))

def fetch_water(
    csv_path='/net/dali/home/mscbio/rul98/CausalLLM/data/water_20000.csv',
    bif_path='/net/dali/home/mscbio/rul98/CausalLLM/data/water.bif',
    *,
    layout_seed=None,
):
    """Load the water samples and the ground-truth graph of the water network.

    Args:
        csv_path: CSV file with the sampled data, one column per raw variable
            code. Defaults to the location that used to be hard-coded here.
        bif_path: BIF file describing the network whose nodes and edges form
            the ground-truth graph. Defaults to the previously hard-coded
            location.
        layout_seed: Optional seed forwarded to ``nx.spring_layout`` so the
            node positions are reproducible. ``None`` (the default) keeps the
            unseeded behaviour.

    Returns:
        A ``(df, ground_truth, positions)`` tuple:
        ``df`` is the sample table with display-name columns, where every
        column listed in ``value_mappings_water`` is integer-coded as nullable
        ``Int64``; ``ground_truth`` is an ``nx.DiGraph`` with display-name
        nodes; ``positions`` is the spring-layout position dict for that graph.
    """
    samples = pd.read_csv(csv_path)

    # Normalize all missing values (including string "<NA>") to the literal
    # "None". That label is in none of the mappings, so it becomes a missing
    # value again once the column is integer-coded below.
    samples = samples.replace(["<NA>", "nan", pd.NA], "None")

    # Switch from raw column codes to display names.
    samples = samples.rename(columns=rename_mapping_water)

    # Integer-code every mapped column that is present. Labels without a code
    # are turned into missing values by the dict-based map.
    # NOTE(review): a numeric column that contained gaps would be parsed as
    # float and stringify as e.g. '3.0', which matches no mapping key --
    # confirm the numeric columns in the CSV are complete.
    for column, codes in value_mappings_water.items():
        if column in samples.columns:
            samples[column] = samples[column].astype("str").map(codes).astype("Int64")

    bn_model = BIFReader(bif_path).get_model()

    # Copy the model structure into a plain directed graph, then relabel the
    # nodes so they line up with the renamed data columns.
    ground_truth = nx.DiGraph()
    ground_truth.add_nodes_from(bn_model.nodes())
    ground_truth.add_edges_from(bn_model.edges())
    ground_truth = nx.relabel_nodes(ground_truth, rename_mapping_water)

    positions = nx.spring_layout(ground_truth, seed=layout_seed)
    return samples, ground_truth, positions