{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "db5d9ea3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# from data.datasets import fetch_with_metadata\n",
    "# for dsname in ['asia', 'alarm', 'cancer', 'child', 'hailfinder', 'hepar2', 'insurance', 'mildew', 'water', 'win95pts']:\n",
    "#     print(dsname)\n",
    "#     data = fetch_with_metadata(dsname)\n",
    "#     df, GroundTruth, pos_data = data['data']\n",
    "#     variable_description = data['variable_description']\n",
    "#     dataset_description = data['dataset_description']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e91a1341",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "α = 1e-05\n",
      "orig\n",
      "PC {'precision': 0.7402597402597403, 'recall': 0.5, 'f1_score': 0.5968586387434556, 'true_positives': 28.5, 'false_positives': 10, 'false_negatives': 28.5, 'shd': 29.5, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Ours {'precision': 0.9506172839506173, 'recall': 0.7264150943396226, 'f1_score': 0.8235294117647058, 'true_positives': 38.5, 'false_positives': 2, 'false_negatives': 14.5, 'shd': 15.5, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Meek2013 {'precision': 0.7560975609756098, 'recall': 0.5961538461538461, 'f1_score': 0.6666666666666667, 'true_positives': 31, 'false_positives': 10, 'false_negatives': 21, 'shd': 22.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "PC + LLM order {'precision': 0.7560975609756098, 'recall': 0.5961538461538461, 'f1_score': 0.6666666666666667, 'true_positives': 31, 'false_positives': 10, 'false_negatives': 21, 'shd': 22.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Ours + LLM order {'precision': 0.926829268292683, 'recall': 0.7307692307692307, 'f1_score': 0.8172043010752689, 'true_positives': 38, 'false_positives': 3, 'false_negatives': 14, 'shd': 15.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Meek2013 + LLM order {'precision': 0.7560975609756098, 'recall': 0.5961538461538461, 'f1_score': 0.6666666666666667, 'true_positives': 31, 'false_positives': 10, 'false_negatives': 21, 'shd': 22.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Collider + LLM order {'precision': 0.7317073170731707, 'recall': 0.5769230769230769, 'f1_score': 0.6451612903225806, 'true_positives': 30, 'false_positives': 11, 'false_negatives': 22, 'shd': 23.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Skeleton + LLM order {'precision': 0.7317073170731707, 'recall': 0.5769230769230769, 'f1_score': 0.6451612903225806, 'true_positives': 30, 'false_positives': 11, 'false_negatives': 22, 'shd': 23.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "stable\n",
      "PC {'precision': 0.7974683544303798, 'recall': 0.5727272727272728, 'f1_score': 0.6666666666666667, 'true_positives': 31.5, 'false_positives': 8, 'false_negatives': 23.5, 'shd': 24.5, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Ours {'precision': 0.9512195121951219, 'recall': 0.75, 'f1_score': 0.8387096774193549, 'true_positives': 39, 'false_positives': 2, 'false_negatives': 13, 'shd': 14.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Meek2013 {'precision': 0.8048780487804879, 'recall': 0.6346153846153846, 'f1_score': 0.7096774193548387, 'true_positives': 33, 'false_positives': 8, 'false_negatives': 19, 'shd': 20.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "PC + LLM order {'precision': 0.8048780487804879, 'recall': 0.6346153846153846, 'f1_score': 0.7096774193548387, 'true_positives': 33, 'false_positives': 8, 'false_negatives': 19, 'shd': 20.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Ours + LLM order {'precision': 0.9512195121951219, 'recall': 0.75, 'f1_score': 0.8387096774193549, 'true_positives': 39, 'false_positives': 2, 'false_negatives': 13, 'shd': 14.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Meek2013 + LLM order {'precision': 0.8048780487804879, 'recall': 0.6346153846153846, 'f1_score': 0.7096774193548387, 'true_positives': 33, 'false_positives': 8, 'false_negatives': 19, 'shd': 20.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Collider + LLM order {'precision': 0.8048780487804879, 'recall': 0.6346153846153846, 'f1_score': 0.7096774193548387, 'true_positives': 33, 'false_positives': 8, 'false_negatives': 19, 'shd': 20.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Skeleton + LLM order {'precision': 0.7317073170731707, 'recall': 0.5769230769230769, 'f1_score': 0.6451612903225806, 'true_positives': 30, 'false_positives': 11, 'false_negatives': 22, 'shd': 23.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "conservative\n",
      "PC {'precision': 0.7974683544303798, 'recall': 0.5727272727272728, 'f1_score': 0.6666666666666667, 'true_positives': 31.5, 'false_positives': 8, 'false_negatives': 23.5, 'shd': 24.5, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Ours {'precision': 0.9506172839506173, 'recall': 0.7264150943396226, 'f1_score': 0.8235294117647058, 'true_positives': 38.5, 'false_positives': 2, 'false_negatives': 14.5, 'shd': 15.5, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Meek2013 {'precision': 0.8048780487804879, 'recall': 0.6346153846153846, 'f1_score': 0.7096774193548387, 'true_positives': 33, 'false_positives': 8, 'false_negatives': 19, 'shd': 20.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "PC + LLM order {'precision': 0.8048780487804879, 'recall': 0.6346153846153846, 'f1_score': 0.7096774193548387, 'true_positives': 33, 'false_positives': 8, 'false_negatives': 19, 'shd': 20.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Ours + LLM order {'precision': 0.926829268292683, 'recall': 0.7307692307692307, 'f1_score': 0.8172043010752689, 'true_positives': 38, 'false_positives': 3, 'false_negatives': 14, 'shd': 15.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Meek2013 + LLM order {'precision': 0.8048780487804879, 'recall': 0.6346153846153846, 'f1_score': 0.7096774193548387, 'true_positives': 33, 'false_positives': 8, 'false_negatives': 19, 'shd': 20.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Collider + LLM order {'precision': 0.8048780487804879, 'recall': 0.6346153846153846, 'f1_score': 0.7096774193548387, 'true_positives': 33, 'false_positives': 8, 'false_negatives': 19, 'shd': 20.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "Skeleton + LLM order {'precision': 0.7317073170731707, 'recall': 0.5769230769230769, 'f1_score': 0.6451612903225806, 'true_positives': 30, 'false_positives': 11, 'false_negatives': 22, 'shd': 23.0, 'num_inferred_edges': 41, 'num_truth_edges': 52}\n",
      "α = 0.01\n",
      "orig\n",
      "PC {'precision': 0.7831325301204819, 'recall': 0.5508474576271186, 'f1_score': 0.6467661691542289, 'true_positives': 32.5, 'false_positives': 9, 'false_negatives': 26.5, 'shd': 28.5, 'num_inferred_edges': 45, 'num_truth_edges': 52}\n",
      "Ours {'precision': 0.9333333333333333, 'recall': 0.8076923076923077, 'f1_score': 0.8659793814432989, 'true_positives': 42, 'false_positives': 3, 'false_negatives': 10, 'shd': 12.0, 'num_inferred_edges': 45, 'num_truth_edges': 52}\n",
      "Meek2013 {'precision': 0.8, 'recall': 0.6923076923076923, 'f1_score': 0.7422680412371134, 'true_positives': 36, 'false_positives': 9, 'false_negatives': 16, 'shd': 18.0, 'num_inferred_edges': 45, 'num_truth_edges': 52}\n",
      "PC + LLM order {'precision': 0.8, 'recall': 0.6923076923076923, 'f1_score': 0.7422680412371134, 'true_positives': 36, 'false_positives': 9, 'false_negatives': 16, 'shd': 18.0, 'num_inferred_edges': 45, 'num_truth_edges': 52}\n",
      "Ours + LLM order {'precision': 0.9333333333333333, 'recall': 0.8076923076923077, 'f1_score': 0.8659793814432989, 'true_positives': 42, 'false_positives': 3, 'false_negatives': 10, 'shd': 12.0, 'num_inferred_edges': 45, 'num_truth_edges': 52}\n",
      "Meek2013 + LLM order {'precision': 0.8, 'recall': 0.6923076923076923, 'f1_score': 0.7422680412371134, 'true_positives': 36, 'false_positives': 9, 'false_negatives': 16, 'shd': 18.0, 'num_inferred_edges': 45, 'num_truth_edges': 52}\n",
      "Collider + LLM order {'precision': 0.8, 'recall': 0.6923076923076923, 'f1_score': 0.7422680412371134, 'true_positives': 36, 'false_positives': 9, 'false_negatives': 16, 'shd': 18.0, 'num_inferred_edges': 45, 'num_truth_edges': 52}\n",
      "Skeleton + LLM order {'precision': 0.6888888888888889, 'recall': 0.5961538461538461, 'f1_score': 0.6391752577319588, 'true_positives': 31, 'false_positives': 14, 'false_negatives': 21, 'shd': 23.0, 'num_inferred_edges': 45, 'num_truth_edges': 52}\n",
      "stable\n",
      "PC {'precision': 0.8, 'recall': 0.5517241379310345, 'f1_score': 0.6530612244897959, 'true_positives': 32.0, 'false_positives': 8, 'false_negatives': 26.0, 'shd': 27.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "Ours {'precision': 0.9302325581395349, 'recall': 0.7692307692307693, 'f1_score': 0.8421052631578948, 'true_positives': 40, 'false_positives': 3, 'false_negatives': 12, 'shd': 13.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "Meek2013 {'precision': 0.7906976744186046, 'recall': 0.6538461538461539, 'f1_score': 0.7157894736842104, 'true_positives': 34, 'false_positives': 9, 'false_negatives': 18, 'shd': 19.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "PC + LLM order {'precision': 0.7906976744186046, 'recall': 0.6538461538461539, 'f1_score': 0.7157894736842104, 'true_positives': 34, 'false_positives': 9, 'false_negatives': 18, 'shd': 19.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "Ours + LLM order {'precision': 0.9302325581395349, 'recall': 0.7692307692307693, 'f1_score': 0.8421052631578948, 'true_positives': 40, 'false_positives': 3, 'false_negatives': 12, 'shd': 13.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "Meek2013 + LLM order {'precision': 0.7906976744186046, 'recall': 0.6538461538461539, 'f1_score': 0.7157894736842104, 'true_positives': 34, 'false_positives': 9, 'false_negatives': 18, 'shd': 19.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "Collider + LLM order {'precision': 0.7906976744186046, 'recall': 0.6538461538461539, 'f1_score': 0.7157894736842104, 'true_positives': 34, 'false_positives': 9, 'false_negatives': 18, 'shd': 19.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "Skeleton + LLM order {'precision': 0.6976744186046512, 'recall': 0.5769230769230769, 'f1_score': 0.631578947368421, 'true_positives': 30, 'false_positives': 13, 'false_negatives': 22, 'shd': 23.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "conservative\n",
      "PC {'precision': 0.8, 'recall': 0.5517241379310345, 'f1_score': 0.6530612244897959, 'true_positives': 32.0, 'false_positives': 8, 'false_negatives': 26.0, 'shd': 27.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "Ours {'precision': 0.9534883720930233, 'recall': 0.7884615384615384, 'f1_score': 0.8631578947368421, 'true_positives': 41, 'false_positives': 2, 'false_negatives': 11, 'shd': 12.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "Meek2013 {'precision': 0.813953488372093, 'recall': 0.6730769230769231, 'f1_score': 0.736842105263158, 'true_positives': 35, 'false_positives': 8, 'false_negatives': 17, 'shd': 18.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "PC + LLM order {'precision': 0.7906976744186046, 'recall': 0.6538461538461539, 'f1_score': 0.7157894736842104, 'true_positives': 34, 'false_positives': 9, 'false_negatives': 18, 'shd': 19.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "Ours + LLM order {'precision': 0.9534883720930233, 'recall': 0.7884615384615384, 'f1_score': 0.8631578947368421, 'true_positives': 41, 'false_positives': 2, 'false_negatives': 11, 'shd': 12.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "Meek2013 + LLM order {'precision': 0.813953488372093, 'recall': 0.6730769230769231, 'f1_score': 0.736842105263158, 'true_positives': 35, 'false_positives': 8, 'false_negatives': 17, 'shd': 18.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "Collider + LLM order {'precision': 0.7906976744186046, 'recall': 0.6538461538461539, 'f1_score': 0.7157894736842104, 'true_positives': 34, 'false_positives': 9, 'false_negatives': 18, 'shd': 19.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "Skeleton + LLM order {'precision': 0.6976744186046512, 'recall': 0.5769230769230769, 'f1_score': 0.631578947368421, 'true_positives': 30, 'false_positives': 13, 'false_negatives': 22, 'shd': 23.0, 'num_inferred_edges': 43, 'num_truth_edges': 52}\n",
      "α = 0.5\n",
      "orig\n",
      "PC {'precision': 0.72, 'recall': 0.6206896551724138, 'f1_score': 0.6666666666666666, 'true_positives': 36.0, 'false_positives': 14, 'false_negatives': 22.0, 'shd': 29.0, 'num_inferred_edges': 53, 'num_truth_edges': 52}\n",
      "Ours {'precision': 0.8285714285714286, 'recall': 0.8207547169811321, 'f1_score': 0.8246445497630333, 'true_positives': 43.5, 'false_positives': 9, 'false_negatives': 9.5, 'shd': 16.5, 'num_inferred_edges': 53, 'num_truth_edges': 52}\n",
      "Meek2013 {'precision': 0.7169811320754716, 'recall': 0.7307692307692307, 'f1_score': 0.7238095238095238, 'true_positives': 38, 'false_positives': 15, 'false_negatives': 14, 'shd': 21.0, 'num_inferred_edges': 53, 'num_truth_edges': 52}\n",
      "PC + LLM order {'precision': 0.6981132075471698, 'recall': 0.7115384615384616, 'f1_score': 0.7047619047619047, 'true_positives': 37, 'false_positives': 16, 'false_negatives': 15, 'shd': 22.0, 'num_inferred_edges': 53, 'num_truth_edges': 52}\n",
      "Ours + LLM order {'precision': 0.8301886792452831, 'recall': 0.8461538461538461, 'f1_score': 0.8380952380952382, 'true_positives': 44, 'false_positives': 9, 'false_negatives': 8, 'shd': 15.0, 'num_inferred_edges': 53, 'num_truth_edges': 52}\n",
      "Meek2013 + LLM order {'precision': 0.7169811320754716, 'recall': 0.7307692307692307, 'f1_score': 0.7238095238095238, 'true_positives': 38, 'false_positives': 15, 'false_negatives': 14, 'shd': 21.0, 'num_inferred_edges': 53, 'num_truth_edges': 52}\n",
      "Collider + LLM order {'precision': 0.6981132075471698, 'recall': 0.7115384615384616, 'f1_score': 0.7047619047619047, 'true_positives': 37, 'false_positives': 16, 'false_negatives': 15, 'shd': 22.0, 'num_inferred_edges': 53, 'num_truth_edges': 52}\n",
      "Skeleton + LLM order {'precision': 0.6037735849056604, 'recall': 0.6153846153846154, 'f1_score': 0.6095238095238096, 'true_positives': 32, 'false_positives': 21, 'false_negatives': 20, 'shd': 27.0, 'num_inferred_edges': 53, 'num_truth_edges': 52}\n",
      "stable\n",
      "PC {'precision': 0.7872340425531915, 'recall': 0.6379310344827587, 'f1_score': 0.7047619047619048, 'true_positives': 37.0, 'false_positives': 10, 'false_negatives': 21.0, 'shd': 25.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "Ours {'precision': 0.8787878787878788, 'recall': 0.8207547169811321, 'f1_score': 0.8487804878048781, 'true_positives': 43.5, 'false_positives': 6, 'false_negatives': 9.5, 'shd': 13.5, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "Meek2013 {'precision': 0.78, 'recall': 0.75, 'f1_score': 0.7647058823529411, 'true_positives': 39, 'false_positives': 11, 'false_negatives': 13, 'shd': 17.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "PC + LLM order {'precision': 0.76, 'recall': 0.7307692307692307, 'f1_score': 0.7450980392156863, 'true_positives': 38, 'false_positives': 12, 'false_negatives': 14, 'shd': 18.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "Ours + LLM order {'precision': 0.88, 'recall': 0.8461538461538461, 'f1_score': 0.8627450980392156, 'true_positives': 44, 'false_positives': 6, 'false_negatives': 8, 'shd': 12.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "Meek2013 + LLM order {'precision': 0.78, 'recall': 0.75, 'f1_score': 0.7647058823529411, 'true_positives': 39, 'false_positives': 11, 'false_negatives': 13, 'shd': 17.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "Collider + LLM order {'precision': 0.76, 'recall': 0.7307692307692307, 'f1_score': 0.7450980392156863, 'true_positives': 38, 'false_positives': 12, 'false_negatives': 14, 'shd': 18.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "Skeleton + LLM order {'precision': 0.64, 'recall': 0.6153846153846154, 'f1_score': 0.6274509803921569, 'true_positives': 32, 'false_positives': 18, 'false_negatives': 20, 'shd': 24.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "conservative\n",
      "PC {'precision': 0.7872340425531915, 'recall': 0.6379310344827587, 'f1_score': 0.7047619047619048, 'true_positives': 37.0, 'false_positives': 10, 'false_negatives': 21.0, 'shd': 25.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "Ours {'precision': 0.898989898989899, 'recall': 0.839622641509434, 'f1_score': 0.8682926829268292, 'true_positives': 44.5, 'false_positives': 5, 'false_negatives': 8.5, 'shd': 12.5, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "Meek2013 {'precision': 0.8, 'recall': 0.7692307692307693, 'f1_score': 0.7843137254901961, 'true_positives': 40, 'false_positives': 10, 'false_negatives': 12, 'shd': 16.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "PC + LLM order {'precision': 0.78, 'recall': 0.75, 'f1_score': 0.7647058823529411, 'true_positives': 39, 'false_positives': 11, 'false_negatives': 13, 'shd': 17.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "Ours + LLM order {'precision': 0.9, 'recall': 0.8653846153846154, 'f1_score': 0.8823529411764707, 'true_positives': 45, 'false_positives': 5, 'false_negatives': 7, 'shd': 11.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "Meek2013 + LLM order {'precision': 0.8, 'recall': 0.7692307692307693, 'f1_score': 0.7843137254901961, 'true_positives': 40, 'false_positives': 10, 'false_negatives': 12, 'shd': 16.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "Collider + LLM order {'precision': 0.8, 'recall': 0.7692307692307693, 'f1_score': 0.7843137254901961, 'true_positives': 40, 'false_positives': 10, 'false_negatives': 12, 'shd': 16.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n",
      "Skeleton + LLM order {'precision': 0.64, 'recall': 0.6153846153846154, 'f1_score': 0.6274509803921569, 'true_positives': 32, 'false_positives': 18, 'false_negatives': 20, 'shd': 24.0, 'num_inferred_edges': 50, 'num_truth_edges': 52}\n"
     ]
    }
   ],
   "source": [
    "# Load the benchmark dataset selected by `dsname`, together with its textual metadata.\n",
    "from data.datasets import fetch_with_metadata\n",
    "dsname = 'insurance'\n",
    "data = fetch_with_metadata(dsname)\n",
    "# data['data'] unpacks into three items; `pos_data` is not used anywhere in this cell.\n",
    "# NOTE(review): `GroundTruth` is presumably the reference graph (it is the first argument of accuracy_metrics) - confirm.\n",
    "df, GroundTruth, pos_data = data['data']\n",
    "variable_description = data['variable_description']\n",
    "dataset_description = data['dataset_description']\n",
    "import pandas as pd\n",
    "import logging\n",
    "from pathlib import Path\n",
    "# NOTE(review): the star import presumably supplies build_skeleton_CI, orient_colliders, vote_edges,\n",
    "# edges_from_votes, orient_edges, order_PC, split_directed_undirected_edges and accuracy_metrics.\n",
    "# It runs after the assignments above, so any clashing name exported by `utils` would override them - confirm.\n",
    "from utils import *\n",
    "# Wider sweep kept for reference; the active list below is a three-value subset of it.\n",
    "# significance_levels = [1e-5, 1e-3, 1e-2, 5e-2, 1e-1, 5e-1, 8e-1]\n",
    "significance_levels = [1e-5, 1e-2, 5e-1]\n",
    "N_VOTE = 5  # forwarded to vote_edges as n_votes\n",
    "LLM_MODEL = \"gpt-4o-mini\"  # forwarded to vote_edges as model\n",
    "TEMP = 0.6  # forwarded to vote_edges as temperature\n",
    "# container for results: one list per evaluated method\n",
    "results = {\n",
    "    \"PC\": [],\n",
    "    \"LLM\": [],\n",
    "    \"PC_order\": [],\n",
    "    \"LLM_order\": []\n",
    "}\n",
    "\n",
    "def evaluate_one_level(df, alpha, variant=\"orig\"):\n",
    "    \"\"\"Evaluate every orientation strategy for one (alpha, variant) setting.\n",
    "\n",
    "    Builds the CI skeleton, orients colliders, applies Meek's rules, gets the\n",
    "    LLM direction votes (from the CSV cache when it is readable), scores each\n",
    "    resulting graph against ``GroundTruth`` and prints the metrics.\n",
    "\n",
    "    Reads the notebook globals ``dsname``, ``dataset_description``,\n",
    "    ``variable_description``, ``GroundTruth``, ``N_VOTE``, ``TEMP`` and\n",
    "    ``LLM_MODEL``.\n",
    "\n",
    "    Args:\n",
    "        df: data handed to ``build_skeleton_CI``; its columns define the node set.\n",
    "        alpha: significance level of the CI tests; also part of the cache file name.\n",
    "        variant: skeleton variant forwarded to ``build_skeleton_CI``.\n",
    "\n",
    "    Returns:\n",
    "        ``(acc_pc, acc_llm, acc_pc_ord, acc_llm_ord)``: four metric dicts, each\n",
    "        tagged with a ``\"significance_level\"`` entry.\n",
    "    \"\"\"\n",
    "    # 1) skeleton & collider orientation\n",
    "    skel, sep_sets, pval = build_skeleton_CI(\n",
    "        data=df,\n",
    "        variant=variant,\n",
    "        ci_test=\"chi_square\",\n",
    "        significance_level=alpha,\n",
    "        max_cond_vars=3,\n",
    "        expert_knowledge=None,\n",
    "        enforce_expert_knowledge=False,\n",
    "        n_jobs=-1,\n",
    "        show_progress=False,\n",
    "        data_description=dataset_description,\n",
    "        node_description=variable_description,\n",
    "    )\n",
    "    pdag_colliders = orient_colliders(skel, sep_sets)\n",
    "    pdag = pdag_colliders.apply_meeks_rules(apply_r4=False)\n",
    "    # Columns missing from the PDAG are added back so every variable is a node.\n",
    "    pdag.add_nodes_from(set(df.columns) - set(pdag.nodes()))\n",
    "\n",
    "    # 2) LLM-based pair-wise orientation, cached per (dataset, alpha, variant)\n",
    "    cache_path = f\"vote_dir/{dsname}_{alpha}_{variant}.csv\"\n",
    "    cache_file = Path(cache_path)\n",
    "    vote_dir = None\n",
    "    if cache_file.exists():\n",
    "        try:\n",
    "            df_vote_dir = pd.read_csv(cache_path)\n",
    "            vote_dir = {\n",
    "                (row[\"Source\"], row[\"Target\"], row[\"Direction\"]): row[\"Votes\"]\n",
    "                for _, row in df_vote_dir.iterrows()\n",
    "            }\n",
    "        except Exception as exc:\n",
    "            # A corrupt/partial cache must not leave vote_dir unbound:\n",
    "            # report it and fall through to a fresh vote below.\n",
    "            print(f'Fail Reading {cache_path}: {exc}')\n",
    "    if vote_dir is None:\n",
    "        vote_dir = vote_edges(\n",
    "            dataset_description, variable_description,\n",
    "            skel, sep_sets, pval, n_votes=N_VOTE, temperature=TEMP, model=LLM_MODEL\n",
    "        )\n",
    "        df_vote_dir = pd.DataFrame(\n",
    "            [(src, tgt, direction, count) for (src, tgt, direction), count in vote_dir.items()],\n",
    "            columns=[\"Source\", \"Target\", \"Direction\", \"Votes\"]\n",
    "        )\n",
    "        # The cache directory may not exist yet on a fresh checkout.\n",
    "        cache_file.parent.mkdir(parents=True, exist_ok=True)\n",
    "        df_vote_dir.to_csv(cache_path, index=False)\n",
    "    directed_voted, undirected_voted = edges_from_votes(skel.edges(), vote_dir)\n",
    "    G_oriented = orient_edges(skel, directed_voted, undirected_voted, sep_sets, pval)\n",
    "\n",
    "    # Baseline: orient the PDAG's still-undirected edges using the voted directions.\n",
    "    pdag_Meek = pdag.copy()\n",
    "    for (u, v) in directed_voted:\n",
    "        if pdag_Meek.has_undirected_edge(u, v):\n",
    "            pdag_Meek.orient_undirected_edge(u, v, inplace=True)\n",
    "        elif pdag_Meek.has_undirected_edge(v, u):\n",
    "            pdag_Meek.orient_undirected_edge(v, u, inplace=True)\n",
    "\n",
    "    # 3) compute metrics for the raw PDAGs\n",
    "    pc_dir, pc_und = split_directed_undirected_edges(pdag)\n",
    "    llm_dir, llm_und = split_directed_undirected_edges(G_oriented)\n",
    "    meek_dir, meek_und = split_directed_undirected_edges(pdag_Meek)\n",
    "\n",
    "    acc_pc = accuracy_metrics(GroundTruth, pc_dir, pc_und)\n",
    "    acc_llm = accuracy_metrics(GroundTruth, llm_dir, llm_und)\n",
    "    acc_meek = accuracy_metrics(GroundTruth, meek_dir, meek_und)\n",
    "    print('PC', acc_pc)\n",
    "    print('Ours', acc_llm)\n",
    "    print('Meek2013', acc_meek)\n",
    "\n",
    "    # 4) apply the vote-based causal ordering, then metrics again\n",
    "    G_pc_ord = order_PC(pdag, vote_dir)\n",
    "    G_llm_ord = order_PC(G_oriented, vote_dir)\n",
    "    G_meek_ord = order_PC(pdag_Meek, vote_dir)\n",
    "    G_v_ord = order_PC(pdag_colliders, vote_dir)\n",
    "    G_skel_ord = order_PC(skel, vote_dir)\n",
    "\n",
    "    pcord_dir, pcord_und = split_directed_undirected_edges(G_pc_ord)\n",
    "    llmord_dir, llmord_und = split_directed_undirected_edges(G_llm_ord)\n",
    "    meekord_dir, meekord_und = split_directed_undirected_edges(G_meek_ord)\n",
    "    vord_dir, vord_und = split_directed_undirected_edges(G_v_ord)\n",
    "    skelord_dir, skelord_und = split_directed_undirected_edges(G_skel_ord)\n",
    "\n",
    "    acc_pc_ord = accuracy_metrics(GroundTruth, pcord_dir, pcord_und)\n",
    "    acc_llm_ord = accuracy_metrics(GroundTruth, llmord_dir, llmord_und)\n",
    "    acc_meek_ord = accuracy_metrics(GroundTruth, meekord_dir, meekord_und)\n",
    "    acc_v_ord = accuracy_metrics(GroundTruth, vord_dir, vord_und)\n",
    "    acc_skel_ord = accuracy_metrics(GroundTruth, skelord_dir, skelord_und)\n",
    "    print('PC + LLM order', acc_pc_ord)\n",
    "    print('Ours + LLM order', acc_llm_ord)\n",
    "    print('Meek2013 + LLM order', acc_meek_ord)\n",
    "    print('Collider + LLM order', acc_v_ord)\n",
    "    print('Skeleton + LLM order', acc_skel_ord)\n",
    "\n",
    "    # Tag the returned dicts with alpha so they are easy to filter later.\n",
    "    for d in (acc_pc, acc_llm, acc_pc_ord, acc_llm_ord):\n",
    "        d[\"significance_level\"] = alpha\n",
    "\n",
    "    return acc_pc, acc_llm, acc_pc_ord, acc_llm_ord\n",
    "\n",
    "# --- MAIN LOOP ---------------------------------------------------------------\n",
    "# Disable log records up to CRITICAL so only the metric prints below show up.\n",
    "logging.disable(logging.CRITICAL)\n",
    "\n",
    "for alpha in significance_levels:\n",
    "    print(f\"α = {alpha}\")\n",
    "    for variant in ['orig', 'stable', 'conservative']:\n",
    "        print(variant)\n",
    "        pc, llm, pc_ord, llm_ord = evaluate_one_level(df, alpha, variant)\n",
    "        # Keep the metrics instead of discarding them: store each dict in the\n",
    "        # `results` container, tagged with the variant that produced it.\n",
    "        for key, metrics in ((\"PC\", pc), (\"LLM\", llm), (\"PC_order\", pc_ord), (\"LLM_order\", llm_ord)):\n",
    "            metrics[\"variant\"] = variant\n",
    "            results[key].append(metrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c9b7f1b6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "α = 1e-05\n",
      "orig\n",
      "PC {'precision': 0.6647398843930635, 'recall': 0.46747967479674796, 'f1_score': 0.548926014319809, 'true_positives': 57.5, 'false_positives': 29, 'false_negatives': 65.5, 'shd': 76.5, 'num_inferred_edges': 92, 'num_truth_edges': 112}\n",
      "Ours {'precision': 0.7865168539325843, 'recall': 0.5932203389830508, 'f1_score': 0.6763285024154589, 'true_positives': 70.0, 'false_positives': 19, 'false_negatives': 48.0, 'shd': 59.0, 'num_inferred_edges': 92, 'num_truth_edges': 112}\n",
      "Meek2013 {'precision': 0.6666666666666666, 'recall': 0.5172413793103449, 'f1_score': 0.5825242718446603, 'true_positives': 60.0, 'false_positives': 30, 'false_negatives': 56.0, 'shd': 67.0, 'num_inferred_edges': 92, 'num_truth_edges': 112}\n",
      "PC + LLM order {'precision': 0.6630434782608695, 'recall': 0.5446428571428571, 'f1_score': 0.5980392156862744, 'true_positives': 61, 'false_positives': 31, 'false_negatives': 51, 'shd': 62.0, 'num_inferred_edges': 92, 'num_truth_edges': 112}\n",
      "Ours + LLM order {'precision': 0.7608695652173914, 'recall': 0.625, 'f1_score': 0.6862745098039216, 'true_positives': 70, 'false_positives': 22, 'false_negatives': 42, 'shd': 53.0, 'num_inferred_edges': 92, 'num_truth_edges': 112}\n",
      "Meek2013 + LLM order {'precision': 0.6630434782608695, 'recall': 0.5446428571428571, 'f1_score': 0.5980392156862744, 'true_positives': 61, 'false_positives': 31, 'false_negatives': 51, 'shd': 62.0, 'num_inferred_edges': 92, 'num_truth_edges': 112}\n",
      "Collider + LLM order {'precision': 0.7391304347826086, 'recall': 0.6071428571428571, 'f1_score': 0.6666666666666666, 'true_positives': 68, 'false_positives': 24, 'false_negatives': 44, 'shd': 55.0, 'num_inferred_edges': 92, 'num_truth_edges': 112}\n",
      "Skeleton + LLM order {'precision': 0.8804347826086957, 'recall': 0.7232142857142857, 'f1_score': 0.7941176470588235, 'true_positives': 81, 'false_positives': 11, 'false_negatives': 31, 'shd': 42.0, 'num_inferred_edges': 92, 'num_truth_edges': 112}\n",
      "stable\n",
      "PC {'precision': 0.7668711656441718, 'recall': 0.5, 'f1_score': 0.6053268765133172, 'true_positives': 62.5, 'false_positives': 19, 'false_negatives': 62.5, 'shd': 69.5, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "Ours {'precision': 0.8816568047337278, 'recall': 0.6260504201680672, 'f1_score': 0.7321867321867321, 'true_positives': 74.5, 'false_positives': 10, 'false_negatives': 44.5, 'shd': 51.5, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "Meek2013 {'precision': 0.7674418604651163, 'recall': 0.5689655172413793, 'f1_score': 0.6534653465346535, 'true_positives': 66.0, 'false_positives': 20, 'false_negatives': 50.0, 'shd': 57.0, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "PC + LLM order {'precision': 0.7386363636363636, 'recall': 0.5803571428571429, 'f1_score': 0.65, 'true_positives': 65, 'false_positives': 23, 'false_negatives': 47, 'shd': 54.0, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "Ours + LLM order {'precision': 0.8636363636363636, 'recall': 0.6785714285714286, 'f1_score': 0.76, 'true_positives': 76, 'false_positives': 12, 'false_negatives': 36, 'shd': 43.0, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "Meek2013 + LLM order {'precision': 0.75, 'recall': 0.5892857142857143, 'f1_score': 0.6599999999999999, 'true_positives': 66, 'false_positives': 22, 'false_negatives': 46, 'shd': 53.0, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "Collider + LLM order {'precision': 0.7840909090909091, 'recall': 0.6160714285714286, 'f1_score': 0.6900000000000001, 'true_positives': 69, 'false_positives': 19, 'false_negatives': 43, 'shd': 50.0, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "Skeleton + LLM order {'precision': 0.9204545454545454, 'recall': 0.7232142857142857, 'f1_score': 0.8099999999999999, 'true_positives': 81, 'false_positives': 7, 'false_negatives': 31, 'shd': 38.0, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "conservative\n",
      "PC {'precision': 0.7668711656441718, 'recall': 0.5, 'f1_score': 0.6053268765133172, 'true_positives': 62.5, 'false_positives': 19, 'false_negatives': 62.5, 'shd': 69.5, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "Ours {'precision': 0.8579881656804734, 'recall': 0.6092436974789915, 'f1_score': 0.7125307125307125, 'true_positives': 72.5, 'false_positives': 12, 'false_negatives': 46.5, 'shd': 53.5, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "Meek2013 {'precision': 0.7674418604651163, 'recall': 0.5689655172413793, 'f1_score': 0.6534653465346535, 'true_positives': 66.0, 'false_positives': 20, 'false_negatives': 50.0, 'shd': 57.0, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "PC + LLM order {'precision': 0.7272727272727273, 'recall': 0.5714285714285714, 'f1_score': 0.64, 'true_positives': 64, 'false_positives': 24, 'false_negatives': 48, 'shd': 55.0, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "Ours + LLM order {'precision': 0.8295454545454546, 'recall': 0.6517857142857143, 'f1_score': 0.73, 'true_positives': 73, 'false_positives': 15, 'false_negatives': 39, 'shd': 46.0, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "Meek2013 + LLM order {'precision': 0.7272727272727273, 'recall': 0.5714285714285714, 'f1_score': 0.64, 'true_positives': 64, 'false_positives': 24, 'false_negatives': 48, 'shd': 55.0, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "Collider + LLM order {'precision': 0.7840909090909091, 'recall': 0.6160714285714286, 'f1_score': 0.6900000000000001, 'true_positives': 69, 'false_positives': 19, 'false_negatives': 43, 'shd': 50.0, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "Skeleton + LLM order {'precision': 0.9204545454545454, 'recall': 0.7232142857142857, 'f1_score': 0.8099999999999999, 'true_positives': 81, 'false_positives': 7, 'false_negatives': 31, 'shd': 38.0, 'num_inferred_edges': 88, 'num_truth_edges': 112}\n",
      "α = 0.01\n",
      "orig\n",
      "PC {'precision': 0.7005347593582888, 'recall': 0.532520325203252, 'f1_score': 0.605080831408776, 'true_positives': 65.5, 'false_positives': 28, 'false_negatives': 57.5, 'shd': 67.5, 'num_inferred_edges': 99, 'num_truth_edges': 112}\n",
      "Ours {'precision': 0.8645833333333334, 'recall': 0.7033898305084746, 'f1_score': 0.7757009345794393, 'true_positives': 83.0, 'false_positives': 13, 'false_negatives': 35.0, 'shd': 45.0, 'num_inferred_edges': 99, 'num_truth_edges': 112}\n",
      "Meek2013 {'precision': 0.711340206185567, 'recall': 0.5948275862068966, 'f1_score': 0.6478873239436621, 'true_positives': 69.0, 'false_positives': 28, 'false_negatives': 47.0, 'shd': 57.0, 'num_inferred_edges': 99, 'num_truth_edges': 112}\n",
      "PC + LLM order {'precision': 0.6767676767676768, 'recall': 0.5982142857142857, 'f1_score': 0.6350710900473934, 'true_positives': 67, 'false_positives': 32, 'false_negatives': 45, 'shd': 55.0, 'num_inferred_edges': 99, 'num_truth_edges': 112}\n",
      "Ours + LLM order {'precision': 0.8383838383838383, 'recall': 0.7410714285714286, 'f1_score': 0.7867298578199051, 'true_positives': 83, 'false_positives': 16, 'false_negatives': 29, 'shd': 39.0, 'num_inferred_edges': 99, 'num_truth_edges': 112}\n",
      "Meek2013 + LLM order {'precision': 0.696969696969697, 'recall': 0.6160714285714286, 'f1_score': 0.6540284360189573, 'true_positives': 69, 'false_positives': 30, 'false_negatives': 43, 'shd': 53.0, 'num_inferred_edges': 99, 'num_truth_edges': 112}\n",
      "Collider + LLM order {'precision': 0.7171717171717171, 'recall': 0.6339285714285714, 'f1_score': 0.6729857819905213, 'true_positives': 71, 'false_positives': 28, 'false_negatives': 41, 'shd': 51.0, 'num_inferred_edges': 99, 'num_truth_edges': 112}\n",
      "Skeleton + LLM order {'precision': 0.898989898989899, 'recall': 0.7946428571428571, 'f1_score': 0.8436018957345972, 'true_positives': 89, 'false_positives': 10, 'false_negatives': 23, 'shd': 33.0, 'num_inferred_edges': 99, 'num_truth_edges': 112}\n",
      "stable\n",
      "PC {'precision': 0.7740112994350282, 'recall': 0.548, 'f1_score': 0.6416861826697893, 'true_positives': 68.5, 'false_positives': 20, 'false_negatives': 56.5, 'shd': 63.5, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "Ours {'precision': 0.8453038674033149, 'recall': 0.6322314049586777, 'f1_score': 0.7234042553191489, 'true_positives': 76.5, 'false_positives': 14, 'false_negatives': 44.5, 'shd': 51.5, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "Meek2013 {'precision': 0.782608695652174, 'recall': 0.6101694915254238, 'f1_score': 0.6857142857142858, 'true_positives': 72.0, 'false_positives': 20, 'false_negatives': 46.0, 'shd': 53.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "PC + LLM order {'precision': 0.7578947368421053, 'recall': 0.6428571428571429, 'f1_score': 0.6956521739130435, 'true_positives': 72, 'false_positives': 23, 'false_negatives': 40, 'shd': 47.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "Ours + LLM order {'precision': 0.8, 'recall': 0.6785714285714286, 'f1_score': 0.7342995169082126, 'true_positives': 76, 'false_positives': 19, 'false_negatives': 36, 'shd': 43.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "Meek2013 + LLM order {'precision': 0.7578947368421053, 'recall': 0.6428571428571429, 'f1_score': 0.6956521739130435, 'true_positives': 72, 'false_positives': 23, 'false_negatives': 40, 'shd': 47.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "Collider + LLM order {'precision': 0.7789473684210526, 'recall': 0.6607142857142857, 'f1_score': 0.7149758454106279, 'true_positives': 74, 'false_positives': 21, 'false_negatives': 38, 'shd': 45.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "Skeleton + LLM order {'precision': 0.9263157894736842, 'recall': 0.7857142857142857, 'f1_score': 0.8502415458937198, 'true_positives': 88, 'false_positives': 7, 'false_negatives': 24, 'shd': 31.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "conservative\n",
      "PC {'precision': 0.7740112994350282, 'recall': 0.548, 'f1_score': 0.6416861826697893, 'true_positives': 68.5, 'false_positives': 20, 'false_negatives': 56.5, 'shd': 63.5, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "Ours {'precision': 0.9120879120879121, 'recall': 0.6916666666666667, 'f1_score': 0.7867298578199051, 'true_positives': 83.0, 'false_positives': 8, 'false_negatives': 37.0, 'shd': 44.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "Meek2013 {'precision': 0.782608695652174, 'recall': 0.6101694915254238, 'f1_score': 0.6857142857142858, 'true_positives': 72.0, 'false_positives': 20, 'false_negatives': 46.0, 'shd': 53.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "PC + LLM order {'precision': 0.7684210526315789, 'recall': 0.6517857142857143, 'f1_score': 0.7053140096618359, 'true_positives': 73, 'false_positives': 22, 'false_negatives': 39, 'shd': 46.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "Ours + LLM order {'precision': 0.8842105263157894, 'recall': 0.75, 'f1_score': 0.8115942028985507, 'true_positives': 84, 'false_positives': 11, 'false_negatives': 28, 'shd': 35.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "Meek2013 + LLM order {'precision': 0.7684210526315789, 'recall': 0.6517857142857143, 'f1_score': 0.7053140096618359, 'true_positives': 73, 'false_positives': 22, 'false_negatives': 39, 'shd': 46.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "Collider + LLM order {'precision': 0.7894736842105263, 'recall': 0.6696428571428571, 'f1_score': 0.7246376811594203, 'true_positives': 75, 'false_positives': 20, 'false_negatives': 37, 'shd': 44.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "Skeleton + LLM order {'precision': 0.9263157894736842, 'recall': 0.7857142857142857, 'f1_score': 0.8502415458937198, 'true_positives': 88, 'false_positives': 7, 'false_negatives': 24, 'shd': 31.0, 'num_inferred_edges': 95, 'num_truth_edges': 112}\n",
      "α = 0.5\n",
      "orig\n",
      "PC {'precision': 0.5615384615384615, 'recall': 0.6186440677966102, 'f1_score': 0.588709677419355, 'true_positives': 73.0, 'false_positives': 57, 'false_negatives': 45.0, 'shd': 80.0, 'num_inferred_edges': 133, 'num_truth_edges': 112}\n",
      "Ours {'precision': 0.6313725490196078, 'recall': 0.6544715447154471, 'f1_score': 0.6427145708582834, 'true_positives': 80.5, 'false_positives': 47, 'false_negatives': 42.5, 'shd': 77.5, 'num_inferred_edges': 133, 'num_truth_edges': 112}\n",
      "Meek2013 {'precision': 0.5681818181818182, 'recall': 0.6578947368421053, 'f1_score': 0.6097560975609756, 'true_positives': 75.0, 'false_positives': 57, 'false_negatives': 39.0, 'shd': 74.0, 'num_inferred_edges': 133, 'num_truth_edges': 112}\n",
      "PC + LLM order {'precision': 0.5639097744360902, 'recall': 0.6696428571428571, 'f1_score': 0.6122448979591836, 'true_positives': 75, 'false_positives': 58, 'false_negatives': 37, 'shd': 72.0, 'num_inferred_edges': 133, 'num_truth_edges': 112}\n",
      "Ours + LLM order {'precision': 0.6165413533834586, 'recall': 0.7321428571428571, 'f1_score': 0.6693877551020407, 'true_positives': 82, 'false_positives': 51, 'false_negatives': 30, 'shd': 65.0, 'num_inferred_edges': 133, 'num_truth_edges': 112}\n",
      "Meek2013 + LLM order {'precision': 0.5714285714285714, 'recall': 0.6785714285714286, 'f1_score': 0.6204081632653061, 'true_positives': 76, 'false_positives': 57, 'false_negatives': 36, 'shd': 71.0, 'num_inferred_edges': 133, 'num_truth_edges': 112}\n",
      "Collider + LLM order {'precision': 0.5939849624060151, 'recall': 0.7053571428571429, 'f1_score': 0.6448979591836735, 'true_positives': 79, 'false_positives': 54, 'false_negatives': 33, 'shd': 68.0, 'num_inferred_edges': 133, 'num_truth_edges': 112}\n",
      "Skeleton + LLM order {'precision': 0.7368421052631579, 'recall': 0.875, 'f1_score': 0.7999999999999999, 'true_positives': 98, 'false_positives': 35, 'false_negatives': 14, 'shd': 49.0, 'num_inferred_edges': 133, 'num_truth_edges': 112}\n",
      "stable\n",
      "PC {'precision': 0.6182572614107884, 'recall': 0.6157024793388429, 'f1_score': 0.6169772256728778, 'true_positives': 74.5, 'false_positives': 46, 'false_negatives': 46.5, 'shd': 73.5, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "Ours {'precision': 0.7012448132780082, 'recall': 0.6983471074380165, 'f1_score': 0.6997929606625258, 'true_positives': 84.5, 'false_positives': 36, 'false_negatives': 36.5, 'shd': 63.5, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "Meek2013 {'precision': 0.6290322580645161, 'recall': 0.6842105263157895, 'f1_score': 0.6554621848739496, 'true_positives': 78.0, 'false_positives': 46, 'false_negatives': 36.0, 'shd': 63.0, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "PC + LLM order {'precision': 0.616, 'recall': 0.6875, 'f1_score': 0.6497890295358649, 'true_positives': 77, 'false_positives': 48, 'false_negatives': 35, 'shd': 62.0, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "Ours + LLM order {'precision': 0.688, 'recall': 0.7678571428571429, 'f1_score': 0.7257383966244725, 'true_positives': 86, 'false_positives': 39, 'false_negatives': 26, 'shd': 53.0, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "Meek2013 + LLM order {'precision': 0.624, 'recall': 0.6964285714285714, 'f1_score': 0.6582278481012659, 'true_positives': 78, 'false_positives': 47, 'false_negatives': 34, 'shd': 61.0, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "Collider + LLM order {'precision': 0.624, 'recall': 0.6964285714285714, 'f1_score': 0.6582278481012659, 'true_positives': 78, 'false_positives': 47, 'false_negatives': 34, 'shd': 61.0, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "Skeleton + LLM order {'precision': 0.784, 'recall': 0.875, 'f1_score': 0.8270042194092827, 'true_positives': 98, 'false_positives': 27, 'false_negatives': 14, 'shd': 41.0, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "conservative\n",
      "PC {'precision': 0.6416666666666667, 'recall': 0.6311475409836066, 'f1_score': 0.6363636363636364, 'true_positives': 77.0, 'false_positives': 43, 'false_negatives': 45.0, 'shd': 72.0, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "Ours {'precision': 0.6872427983539094, 'recall': 0.7016806722689075, 'f1_score': 0.6943866943866944, 'true_positives': 83.5, 'false_positives': 38, 'false_negatives': 35.5, 'shd': 62.5, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "Meek2013 {'precision': 0.6518218623481782, 'recall': 0.7, 'f1_score': 0.6750524109014675, 'true_positives': 80.5, 'false_positives': 43, 'false_negatives': 34.5, 'shd': 61.5, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "PC + LLM order {'precision': 0.64, 'recall': 0.7142857142857143, 'f1_score': 0.6751054852320676, 'true_positives': 80, 'false_positives': 45, 'false_negatives': 32, 'shd': 59.0, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "Ours + LLM order {'precision': 0.688, 'recall': 0.7678571428571429, 'f1_score': 0.7257383966244725, 'true_positives': 86, 'false_positives': 39, 'false_negatives': 26, 'shd': 53.0, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "Meek2013 + LLM order {'precision': 0.656, 'recall': 0.7321428571428571, 'f1_score': 0.6919831223628692, 'true_positives': 82, 'false_positives': 43, 'false_negatives': 30, 'shd': 57.0, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "Collider + LLM order {'precision': 0.68, 'recall': 0.7589285714285714, 'f1_score': 0.7172995780590719, 'true_positives': 85, 'false_positives': 40, 'false_negatives': 27, 'shd': 54.0, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n",
      "Skeleton + LLM order {'precision': 0.784, 'recall': 0.875, 'f1_score': 0.8270042194092827, 'true_positives': 98, 'false_positives': 27, 'false_negatives': 14, 'shd': 41.0, 'num_inferred_edges': 125, 'num_truth_edges': 112}\n"
     ]
    }
   ],
   "source": [
    "from data.datasets import fetch_with_metadata\n",
    "# Benchmark network evaluated by this cell; the name is also embedded in the\n",
    "# vote-cache file names built inside evaluate_one_level.\n",
    "dsname = 'win95pts'\n",
    "data = fetch_with_metadata(dsname)\n",
    "(df, GroundTruth, pos_data) = data['data']\n",
    "variable_description, dataset_description = (\n",
    "    data['variable_description'],\n",
    "    data['dataset_description'],\n",
    ")\n",
    "import pandas as pd\n",
    "import logging\n",
    "from pathlib import Path\n",
    "from utils import *\n",
    "\n",
    "# Significance levels (alpha) swept by the main loop. A denser grid that can be\n",
    "# swapped in instead: [1e-5, 1e-3, 1e-2, 5e-2, 1e-1, 5e-1, 8e-1]\n",
    "significance_levels = [1e-5, 1e-2, 5e-1]\n",
    "# Forwarded to vote_edges as n_votes and temperature respectively.\n",
    "N_VOTE, TEMP = 5, 0.6\n",
    "# Forwarded to vote_edges as the model name.\n",
    "LLM_MODEL = \"gpt-4o-mini\"\n",
    "# One (initially empty) list of metric records per reported method.\n",
    "results = {method: [] for method in (\"PC\", \"LLM\", \"PC_order\", \"LLM_order\")}\n",
    "\n",
    "def evaluate_one_level(df, alpha, variant=\"orig\"):\n",
    "    \"\"\"Evaluate the PC baseline and the LLM-vote orientations for one setting.\n",
    "\n",
    "    Builds a skeleton with chi-square CI tests at level ``alpha`` (at most 3\n",
    "    conditioning variables), orients it with colliders + Meek rules and with\n",
    "    LLM edge votes, scores every resulting graph against the notebook-level\n",
    "    ``GroundTruth`` and prints one metrics line per method.\n",
    "\n",
    "    Reads the notebook globals ``dsname``, ``dataset_description``,\n",
    "    ``variable_description``, ``GroundTruth``, ``N_VOTE``, ``TEMP`` and\n",
    "    ``LLM_MODEL``.\n",
    "\n",
    "    Args:\n",
    "        df: Data handed to ``build_skeleton_CI``; every column becomes a node.\n",
    "        alpha: Significance level of the CI tests; also part of the cache name.\n",
    "        variant: Skeleton variant forwarded to ``build_skeleton_CI``.\n",
    "\n",
    "    Returns:\n",
    "        ``(acc_pc, acc_llm, acc_pc_ord, acc_llm_ord)``: the dicts returned by\n",
    "        ``accuracy_metrics``, each with a ``\"significance_level\"`` key added.\n",
    "\n",
    "    Side effects:\n",
    "        Prints the metrics and reads/writes the vote cache\n",
    "        ``vote_dir/{dsname}_{alpha}_{variant}.csv`` (the directory is created\n",
    "        on demand; an unreadable cache is reported and recomputed).\n",
    "    \"\"\"\n",
    "    # 1) skeleton & collider orientation\n",
    "    skel, sep_sets, pval = build_skeleton_CI(\n",
    "        data=df,\n",
    "        variant=variant,\n",
    "        ci_test=\"chi_square\",\n",
    "        significance_level=alpha,\n",
    "        max_cond_vars=3,\n",
    "        expert_knowledge=None,\n",
    "        enforce_expert_knowledge=False,\n",
    "        n_jobs=-1,\n",
    "        show_progress=False,\n",
    "        data_description=dataset_description,\n",
    "        node_description=variable_description,\n",
    "    )\n",
    "    pdag_colliders = orient_colliders(skel, sep_sets)\n",
    "    pdag = pdag_colliders.apply_meeks_rules(apply_r4=False)\n",
    "    # Make sure every data column is a node, including ones without any edge.\n",
    "    pdag.add_nodes_from(set(df.columns) - set(pdag.nodes()))\n",
    "\n",
    "    # 2) LLM-based pair-wise orientation, cached per (dataset, alpha, variant)\n",
    "    cache_path = f\"vote_dir/{dsname}_{alpha}_{variant}.csv\"\n",
    "    cache_file = Path(cache_path)\n",
    "    vote_dir = None  # stays None while there is no usable cache\n",
    "    if cache_file.exists():\n",
    "        try:\n",
    "            df_vote_dir = pd.read_csv(cache_path)\n",
    "            vote_dir = {\n",
    "                (row[\"Source\"], row[\"Target\"], row[\"Direction\"]): row[\"Votes\"]\n",
    "                for _, row in df_vote_dir.iterrows()\n",
    "            }\n",
    "        except Exception as exc:\n",
    "            # An unreadable cache must not leave vote_dir unbound (that crashed\n",
    "            # further down); report it and fall through to a fresh vote.\n",
    "            print(f'Fail Reading {cache_path} ({exc!r}); recomputing votes')\n",
    "    if vote_dir is None:\n",
    "        vote_dir = vote_edges(\n",
    "            dataset_description, variable_description,\n",
    "            skel, sep_sets, pval, n_votes=N_VOTE, temperature=TEMP, model=LLM_MODEL\n",
    "        )\n",
    "        df_vote_dir = pd.DataFrame(\n",
    "            [(src, tgt, direction, count) for (src, tgt, direction), count in vote_dir.items()],\n",
    "            columns=[\"Source\", \"Target\", \"Direction\", \"Votes\"]\n",
    "        )\n",
    "        # to_csv does not create missing directories, so do it before writing.\n",
    "        cache_file.parent.mkdir(parents=True, exist_ok=True)\n",
    "        df_vote_dir.to_csv(cache_path, index=False)\n",
    "    directed_voted, undirected_voted = edges_from_votes(skel.edges(), vote_dir)\n",
    "    G_oriented = orient_edges(skel, directed_voted, undirected_voted, sep_sets, pval)\n",
    "\n",
    "    # Baseline: start from the PC result and orient its still-undirected edges\n",
    "    # using the voted directions.\n",
    "    pdag_Meek = pdag.copy()\n",
    "    for (u, v) in directed_voted:\n",
    "        if pdag_Meek.has_undirected_edge(u, v):\n",
    "            pdag_Meek.orient_undirected_edge(u, v, inplace=True)\n",
    "        elif pdag_Meek.has_undirected_edge(v, u):\n",
    "            # NOTE(review): this branch passes (v, u) although the vote was (u, v);\n",
    "            # confirm the argument order against orient_undirected_edge's semantics.\n",
    "            pdag_Meek.orient_undirected_edge(v, u, inplace=True)\n",
    "\n",
    "    # 3) compute metrics for the raw PDAGs\n",
    "    pc_dir, pc_und = split_directed_undirected_edges(pdag)\n",
    "    llm_dir, llm_und = split_directed_undirected_edges(G_oriented)\n",
    "    meek_dir, meek_und = split_directed_undirected_edges(pdag_Meek)\n",
    "\n",
    "    acc_pc = accuracy_metrics(GroundTruth, pc_dir, pc_und)\n",
    "    acc_llm = accuracy_metrics(GroundTruth, llm_dir, llm_und)\n",
    "    acc_meek = accuracy_metrics(GroundTruth, meek_dir, meek_und)\n",
    "    print('PC', acc_pc)\n",
    "    print('Ours', acc_llm)\n",
    "    print('Meek2013', acc_meek)\n",
    "\n",
    "    # 4) apply the vote-based causal ordering, then metrics again\n",
    "    G_pc_ord = order_PC(pdag, vote_dir)\n",
    "    G_llm_ord = order_PC(G_oriented, vote_dir)\n",
    "    G_meek_ord = order_PC(pdag_Meek, vote_dir)\n",
    "    G_v_ord = order_PC(pdag_colliders, vote_dir)\n",
    "    G_skel_ord = order_PC(skel, vote_dir)\n",
    "\n",
    "    pcord_dir, pcord_und = split_directed_undirected_edges(G_pc_ord)\n",
    "    llmord_dir, llmord_und = split_directed_undirected_edges(G_llm_ord)\n",
    "    meekord_dir, meekord_und = split_directed_undirected_edges(G_meek_ord)\n",
    "    vord_dir, vord_und = split_directed_undirected_edges(G_v_ord)\n",
    "    skelord_dir, skelord_und = split_directed_undirected_edges(G_skel_ord)\n",
    "\n",
    "    acc_pc_ord = accuracy_metrics(GroundTruth, pcord_dir, pcord_und)\n",
    "    acc_llm_ord = accuracy_metrics(GroundTruth, llmord_dir, llmord_und)\n",
    "    acc_meek_ord = accuracy_metrics(GroundTruth, meekord_dir, meekord_und)\n",
    "    acc_v_ord = accuracy_metrics(GroundTruth, vord_dir, vord_und)\n",
    "    acc_skel_ord = accuracy_metrics(GroundTruth, skelord_dir, skelord_und)\n",
    "    print('PC + LLM order', acc_pc_ord)\n",
    "    print('Ours + LLM order', acc_llm_ord)\n",
    "    print('Meek2013 + LLM order', acc_meek_ord)\n",
    "    print('Collider + LLM order', acc_v_ord)\n",
    "    print('Skeleton + LLM order', acc_skel_ord)\n",
    "\n",
    "    # Tag the returned dicts with alpha so they are easy to filter later.\n",
    "    for d in (acc_pc, acc_llm, acc_pc_ord, acc_llm_ord):\n",
    "        d[\"significance_level\"] = alpha\n",
    "\n",
    "    return acc_pc, acc_llm, acc_pc_ord, acc_llm_ord\n",
    "\n",
    "# --- MAIN LOOP ---------------------------------------------------------------\n",
    "# Disable logging at CRITICAL level and below so only the metric prints show up.\n",
    "logging.disable(logging.CRITICAL)\n",
    "\n",
    "# Sweep every (alpha, variant) pair and record the returned metrics in `results`\n",
    "# instead of discarding them.\n",
    "for alpha in significance_levels:\n",
    "    print(f\"α = {alpha}\")\n",
    "    for variant in ['orig', 'stable', 'conservative']:\n",
    "        print(variant)\n",
    "        pc, llm, pc_ord, llm_ord = evaluate_one_level(df, alpha, variant)\n",
    "        for key, metrics in zip((\"PC\", \"LLM\", \"PC_order\", \"LLM_order\"), (pc, llm, pc_ord, llm_ord)):\n",
    "            # Store a copy tagged with the variant so records stay distinguishable.\n",
    "            results[key].append({**metrics, \"variant\": variant})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8095aed6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "α = 1e-05\n",
      "orig\n",
      "PC {'precision': 0.7021276595744681, 'recall': 0.21428571428571427, 'f1_score': 0.3283582089552239, 'true_positives': 16.5, 'false_positives': 7, 'false_negatives': 60.5, 'shd': 61.5, 'num_inferred_edges': 29, 'num_truth_edges': 66}\n",
      "Ours {'precision': 0.9642857142857143, 'recall': 0.39705882352941174, 'f1_score': 0.5624999999999999, 'true_positives': 27.0, 'false_positives': 1, 'false_negatives': 41.0, 'shd': 42.0, 'num_inferred_edges': 29, 'num_truth_edges': 66}\n",
      "Meek2013 {'precision': 0.75, 'recall': 0.3088235294117647, 'f1_score': 0.4375, 'true_positives': 21.0, 'false_positives': 7, 'false_negatives': 47.0, 'shd': 48.0, 'num_inferred_edges': 29, 'num_truth_edges': 66}\n",
      "PC + LLM order {'precision': 0.6206896551724138, 'recall': 0.2727272727272727, 'f1_score': 0.37894736842105264, 'true_positives': 18, 'false_positives': 11, 'false_negatives': 48, 'shd': 49.0, 'num_inferred_edges': 29, 'num_truth_edges': 66}\n",
      "Ours + LLM order {'precision': 0.896551724137931, 'recall': 0.3939393939393939, 'f1_score': 0.5473684210526315, 'true_positives': 26, 'false_positives': 3, 'false_negatives': 40, 'shd': 41.0, 'num_inferred_edges': 29, 'num_truth_edges': 66}\n",
      "Meek2013 + LLM order {'precision': 0.6896551724137931, 'recall': 0.30303030303030304, 'f1_score': 0.4210526315789474, 'true_positives': 20, 'false_positives': 9, 'false_negatives': 46, 'shd': 47.0, 'num_inferred_edges': 29, 'num_truth_edges': 66}\n",
      "Collider + LLM order {'precision': 0.6896551724137931, 'recall': 0.30303030303030304, 'f1_score': 0.4210526315789474, 'true_positives': 20, 'false_positives': 9, 'false_negatives': 46, 'shd': 47.0, 'num_inferred_edges': 29, 'num_truth_edges': 66}\n",
      "Skeleton + LLM order {'precision': 0.9655172413793104, 'recall': 0.42424242424242425, 'f1_score': 0.5894736842105264, 'true_positives': 28, 'false_positives': 1, 'false_negatives': 38, 'shd': 39.0, 'num_inferred_edges': 29, 'num_truth_edges': 66}\n",
      "stable\n",
      "PC {'precision': 0.8297872340425532, 'recall': 0.26, 'f1_score': 0.39593908629441626, 'true_positives': 19.5, 'false_positives': 4, 'false_negatives': 55.5, 'shd': 55.5, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "Ours {'precision': 1.0, 'recall': 0.39705882352941174, 'f1_score': 0.5684210526315789, 'true_positives': 27.0, 'false_positives': 0, 'false_negatives': 41.0, 'shd': 41.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "Meek2013 {'precision': 0.8518518518518519, 'recall': 0.3382352941176471, 'f1_score': 0.48421052631578954, 'true_positives': 23.0, 'false_positives': 4, 'false_negatives': 45.0, 'shd': 45.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "PC + LLM order {'precision': 0.7857142857142857, 'recall': 0.3333333333333333, 'f1_score': 0.4680851063829786, 'true_positives': 22, 'false_positives': 6, 'false_negatives': 44, 'shd': 44.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "Ours + LLM order {'precision': 0.9285714285714286, 'recall': 0.3939393939393939, 'f1_score': 0.5531914893617021, 'true_positives': 26, 'false_positives': 2, 'false_negatives': 40, 'shd': 40.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "Meek2013 + LLM order {'precision': 0.7857142857142857, 'recall': 0.3333333333333333, 'f1_score': 0.4680851063829786, 'true_positives': 22, 'false_positives': 6, 'false_negatives': 44, 'shd': 44.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "Collider + LLM order {'precision': 0.7857142857142857, 'recall': 0.3333333333333333, 'f1_score': 0.4680851063829786, 'true_positives': 22, 'false_positives': 6, 'false_negatives': 44, 'shd': 44.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "Skeleton + LLM order {'precision': 1.0, 'recall': 0.42424242424242425, 'f1_score': 0.5957446808510638, 'true_positives': 28, 'false_positives': 0, 'false_negatives': 38, 'shd': 38.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "conservative\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PC {'precision': 0.8723404255319149, 'recall': 0.2733333333333333, 'f1_score': 0.416243654822335, 'true_positives': 20.5, 'false_positives': 3, 'false_negatives': 54.5, 'shd': 54.5, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "Ours {'precision': 0.9615384615384616, 'recall': 0.35714285714285715, 'f1_score': 0.5208333333333334, 'true_positives': 25.0, 'false_positives': 1, 'false_negatives': 45.0, 'shd': 45.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "Meek2013 {'precision': 0.8846153846153846, 'recall': 0.32857142857142857, 'f1_score': 0.47916666666666663, 'true_positives': 23.0, 'false_positives': 3, 'false_negatives': 47.0, 'shd': 47.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "PC + LLM order {'precision': 0.7857142857142857, 'recall': 0.3333333333333333, 'f1_score': 0.4680851063829786, 'true_positives': 22, 'false_positives': 6, 'false_negatives': 44, 'shd': 44.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "Ours + LLM order {'precision': 0.8928571428571429, 'recall': 0.3787878787878788, 'f1_score': 0.5319148936170213, 'true_positives': 25, 'false_positives': 3, 'false_negatives': 41, 'shd': 41.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "Meek2013 + LLM order {'precision': 0.7857142857142857, 'recall': 0.3333333333333333, 'f1_score': 0.4680851063829786, 'true_positives': 22, 'false_positives': 6, 'false_negatives': 44, 'shd': 44.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "Collider + LLM order {'precision': 0.8571428571428571, 'recall': 0.36363636363636365, 'f1_score': 0.5106382978723404, 'true_positives': 24, 'false_positives': 4, 'false_negatives': 42, 'shd': 42.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "Skeleton + LLM order {'precision': 1.0, 'recall': 0.42424242424242425, 'f1_score': 0.5957446808510638, 'true_positives': 28, 'false_positives': 0, 'false_negatives': 38, 'shd': 38.0, 'num_inferred_edges': 28, 'num_truth_edges': 66}\n",
      "α = 0.01\n",
      "orig\n",
      "PC {'precision': 0.8518518518518519, 'recall': 0.3026315789473684, 'f1_score': 0.44660194174757284, 'true_positives': 23.0, 'false_positives': 4, 'false_negatives': 53.0, 'shd': 53.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Ours {'precision': 0.9047619047619048, 'recall': 0.4253731343283582, 'f1_score': 0.5786802030456853, 'true_positives': 28.5, 'false_positives': 3, 'false_negatives': 38.5, 'shd': 38.5, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Meek2013 {'precision': 0.84375, 'recall': 0.4090909090909091, 'f1_score': 0.5510204081632654, 'true_positives': 27, 'false_positives': 5, 'false_negatives': 39, 'shd': 39.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "PC + LLM order {'precision': 0.8125, 'recall': 0.3939393939393939, 'f1_score': 0.5306122448979591, 'true_positives': 26, 'false_positives': 6, 'false_negatives': 40, 'shd': 40.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Ours + LLM order {'precision': 0.875, 'recall': 0.42424242424242425, 'f1_score': 0.5714285714285714, 'true_positives': 28, 'false_positives': 4, 'false_negatives': 38, 'shd': 38.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Meek2013 + LLM order {'precision': 0.84375, 'recall': 0.4090909090909091, 'f1_score': 0.5510204081632654, 'true_positives': 27, 'false_positives': 5, 'false_negatives': 39, 'shd': 39.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Collider + LLM order {'precision': 0.8125, 'recall': 0.3939393939393939, 'f1_score': 0.5306122448979591, 'true_positives': 26, 'false_positives': 6, 'false_negatives': 40, 'shd': 40.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Skeleton + LLM order {'precision': 1.0, 'recall': 0.48484848484848486, 'f1_score': 0.653061224489796, 'true_positives': 32, 'false_positives': 0, 'false_negatives': 34, 'shd': 34.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "stable\n",
      "PC {'precision': 0.8909090909090909, 'recall': 0.32666666666666666, 'f1_score': 0.4780487804878049, 'true_positives': 24.5, 'false_positives': 3, 'false_negatives': 50.5, 'shd': 50.5, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Ours {'precision': 1.0, 'recall': 0.4701492537313433, 'f1_score': 0.6395939086294417, 'true_positives': 31.5, 'false_positives': 0, 'false_negatives': 35.5, 'shd': 35.5, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Meek2013 {'precision': 0.9047619047619048, 'recall': 0.4253731343283582, 'f1_score': 0.5786802030456853, 'true_positives': 28.5, 'false_positives': 3, 'false_negatives': 38.5, 'shd': 38.5, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "PC + LLM order {'precision': 0.84375, 'recall': 0.4090909090909091, 'f1_score': 0.5510204081632654, 'true_positives': 27, 'false_positives': 5, 'false_negatives': 39, 'shd': 39.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Ours + LLM order {'precision': 0.96875, 'recall': 0.4696969696969697, 'f1_score': 0.6326530612244898, 'true_positives': 31, 'false_positives': 1, 'false_negatives': 35, 'shd': 35.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Meek2013 + LLM order {'precision': 0.875, 'recall': 0.42424242424242425, 'f1_score': 0.5714285714285714, 'true_positives': 28, 'false_positives': 4, 'false_negatives': 38, 'shd': 38.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Collider + LLM order {'precision': 0.84375, 'recall': 0.4090909090909091, 'f1_score': 0.5510204081632654, 'true_positives': 27, 'false_positives': 5, 'false_negatives': 39, 'shd': 39.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Skeleton + LLM order {'precision': 1.0, 'recall': 0.48484848484848486, 'f1_score': 0.653061224489796, 'true_positives': 32, 'false_positives': 0, 'false_negatives': 34, 'shd': 34.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "conservative\n",
      "PC {'precision': 0.8888888888888888, 'recall': 0.3157894736842105, 'f1_score': 0.4660194174757281, 'true_positives': 24.0, 'false_positives': 3, 'false_negatives': 52.0, 'shd': 52.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Ours {'precision': 0.967741935483871, 'recall': 0.4411764705882353, 'f1_score': 0.6060606060606061, 'true_positives': 30.0, 'false_positives': 1, 'false_negatives': 38.0, 'shd': 38.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Meek2013 {'precision': 0.9032258064516129, 'recall': 0.4117647058823529, 'f1_score': 0.5656565656565656, 'true_positives': 28.0, 'false_positives': 3, 'false_negatives': 40.0, 'shd': 40.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "PC + LLM order {'precision': 0.875, 'recall': 0.42424242424242425, 'f1_score': 0.5714285714285714, 'true_positives': 28, 'false_positives': 4, 'false_negatives': 38, 'shd': 38.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Ours + LLM order {'precision': 0.9375, 'recall': 0.45454545454545453, 'f1_score': 0.6122448979591837, 'true_positives': 30, 'false_positives': 2, 'false_negatives': 36, 'shd': 36.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Meek2013 + LLM order {'precision': 0.875, 'recall': 0.42424242424242425, 'f1_score': 0.5714285714285714, 'true_positives': 28, 'false_positives': 4, 'false_negatives': 38, 'shd': 38.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Collider + LLM order {'precision': 0.875, 'recall': 0.42424242424242425, 'f1_score': 0.5714285714285714, 'true_positives': 28, 'false_positives': 4, 'false_negatives': 38, 'shd': 38.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "Skeleton + LLM order {'precision': 1.0, 'recall': 0.48484848484848486, 'f1_score': 0.653061224489796, 'true_positives': 32, 'false_positives': 0, 'false_negatives': 34, 'shd': 34.0, 'num_inferred_edges': 32, 'num_truth_edges': 66}\n",
      "α = 0.5\n",
      "orig\n",
      "PC {'precision': 0.7066666666666667, 'recall': 0.363013698630137, 'f1_score': 0.47963800904977383, 'true_positives': 26.5, 'false_positives': 11, 'false_negatives': 46.5, 'shd': 52.5, 'num_inferred_edges': 41, 'num_truth_edges': 66}\n",
      "Ours {'precision': 0.8048780487804879, 'recall': 0.5, 'f1_score': 0.6168224299065421, 'true_positives': 33, 'false_positives': 8, 'false_negatives': 33, 'shd': 39.0, 'num_inferred_edges': 41, 'num_truth_edges': 66}\n",
      "Meek2013 {'precision': 0.7317073170731707, 'recall': 0.45454545454545453, 'f1_score': 0.5607476635514018, 'true_positives': 30, 'false_positives': 11, 'false_negatives': 36, 'shd': 42.0, 'num_inferred_edges': 41, 'num_truth_edges': 66}\n",
      "PC + LLM order {'precision': 0.7317073170731707, 'recall': 0.45454545454545453, 'f1_score': 0.5607476635514018, 'true_positives': 30, 'false_positives': 11, 'false_negatives': 36, 'shd': 42.0, 'num_inferred_edges': 41, 'num_truth_edges': 66}\n",
      "Ours + LLM order {'precision': 0.8048780487804879, 'recall': 0.5, 'f1_score': 0.6168224299065421, 'true_positives': 33, 'false_positives': 8, 'false_negatives': 33, 'shd': 39.0, 'num_inferred_edges': 41, 'num_truth_edges': 66}\n",
      "Meek2013 + LLM order {'precision': 0.7317073170731707, 'recall': 0.45454545454545453, 'f1_score': 0.5607476635514018, 'true_positives': 30, 'false_positives': 11, 'false_negatives': 36, 'shd': 42.0, 'num_inferred_edges': 41, 'num_truth_edges': 66}\n",
      "Collider + LLM order {'precision': 0.7317073170731707, 'recall': 0.45454545454545453, 'f1_score': 0.5607476635514018, 'true_positives': 30, 'false_positives': 11, 'false_negatives': 36, 'shd': 42.0, 'num_inferred_edges': 41, 'num_truth_edges': 66}\n",
      "Skeleton + LLM order {'precision': 0.8536585365853658, 'recall': 0.5303030303030303, 'f1_score': 0.6542056074766355, 'true_positives': 35, 'false_positives': 6, 'false_negatives': 31, 'shd': 37.0, 'num_inferred_edges': 41, 'num_truth_edges': 66}\n",
      "stable\n",
      "PC {'precision': 0.7352941176470589, 'recall': 0.33783783783783783, 'f1_score': 0.462962962962963, 'true_positives': 25.0, 'false_positives': 9, 'false_negatives': 49.0, 'shd': 52.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "Ours {'precision': 0.868421052631579, 'recall': 0.5, 'f1_score': 0.6346153846153846, 'true_positives': 33, 'false_positives': 5, 'false_negatives': 33, 'shd': 36.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "Meek2013 {'precision': 0.7631578947368421, 'recall': 0.4393939393939394, 'f1_score': 0.5576923076923077, 'true_positives': 29, 'false_positives': 9, 'false_negatives': 37, 'shd': 40.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "PC + LLM order {'precision': 0.7368421052631579, 'recall': 0.42424242424242425, 'f1_score': 0.5384615384615385, 'true_positives': 28, 'false_positives': 10, 'false_negatives': 38, 'shd': 41.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "Ours + LLM order {'precision': 0.868421052631579, 'recall': 0.5, 'f1_score': 0.6346153846153846, 'true_positives': 33, 'false_positives': 5, 'false_negatives': 33, 'shd': 36.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "Meek2013 + LLM order {'precision': 0.7631578947368421, 'recall': 0.4393939393939394, 'f1_score': 0.5576923076923077, 'true_positives': 29, 'false_positives': 9, 'false_negatives': 37, 'shd': 40.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "Collider + LLM order {'precision': 0.7368421052631579, 'recall': 0.42424242424242425, 'f1_score': 0.5384615384615385, 'true_positives': 28, 'false_positives': 10, 'false_negatives': 38, 'shd': 41.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "Skeleton + LLM order {'precision': 0.9210526315789473, 'recall': 0.5303030303030303, 'f1_score': 0.673076923076923, 'true_positives': 35, 'false_positives': 3, 'false_negatives': 31, 'shd': 34.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "conservative\n",
      "PC {'precision': 0.7941176470588235, 'recall': 0.36486486486486486, 'f1_score': 0.5, 'true_positives': 27.0, 'false_positives': 7, 'false_negatives': 47.0, 'shd': 50.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "Ours {'precision': 0.8933333333333333, 'recall': 0.5, 'f1_score': 0.6411483253588517, 'true_positives': 33.5, 'false_positives': 4, 'false_negatives': 33.5, 'shd': 36.5, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "Meek2013 {'precision': 0.8133333333333334, 'recall': 0.4552238805970149, 'f1_score': 0.5837320574162679, 'true_positives': 30.5, 'false_positives': 7, 'false_negatives': 36.5, 'shd': 39.5, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "PC + LLM order {'precision': 0.7894736842105263, 'recall': 0.45454545454545453, 'f1_score': 0.5769230769230769, 'true_positives': 30, 'false_positives': 8, 'false_negatives': 36, 'shd': 39.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "Ours + LLM order {'precision': 0.8947368421052632, 'recall': 0.5151515151515151, 'f1_score': 0.6538461538461537, 'true_positives': 34, 'false_positives': 4, 'false_negatives': 32, 'shd': 35.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "Meek2013 + LLM order {'precision': 0.7894736842105263, 'recall': 0.45454545454545453, 'f1_score': 0.5769230769230769, 'true_positives': 30, 'false_positives': 8, 'false_negatives': 36, 'shd': 39.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "Collider + LLM order {'precision': 0.7631578947368421, 'recall': 0.4393939393939394, 'f1_score': 0.5576923076923077, 'true_positives': 29, 'false_positives': 9, 'false_negatives': 37, 'shd': 40.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n",
      "Skeleton + LLM order {'precision': 0.9210526315789473, 'recall': 0.5303030303030303, 'f1_score': 0.673076923076923, 'true_positives': 35, 'false_positives': 3, 'false_negatives': 31, 'shd': 34.0, 'num_inferred_edges': 38, 'num_truth_edges': 66}\n"
     ]
    }
   ],
   "source": [
    "from data.datasets import fetch_with_metadata\n",
    "dsname = 'water'\n",
    "data = fetch_with_metadata(dsname)\n",
    "df, GroundTruth, pos_data = data['data']\n",
    "variable_description = data['variable_description']\n",
    "dataset_description = data['dataset_description']\n",
    "import pandas as pd\n",
    "import logging\n",
    "from pathlib import Path\n",
    "from utils import *\n",
    "# significance_levels = [1e-5, 1e-3, 1e-2, 5e-2, 1e-1, 5e-1, 8e-1]\n",
    "significance_levels = [1e-5, 1e-2, 5e-1]\n",
    "N_VOTE = 5\n",
    "LLM_MODEL = \"gpt-4o-mini\"\n",
    "TEMP = 0.6\n",
    "# container for results\n",
    "results = {\n",
    "    \"PC\": [],\n",
    "    \"LLM\": [],\n",
    "    \"PC_order\": [],\n",
    "    \"LLM_order\": []\n",
    "}\n",
    "\n",
    "def evaluate_one_level(df, alpha, variant=\"orig\"):\n",
    "    # 1) skeleton & collider orientation\n",
    "    skel, sep_sets, pval = build_skeleton_CI(\n",
    "        data=df,\n",
    "        variant=variant,\n",
    "        ci_test=\"chi_square\",\n",
    "        significance_level=alpha,\n",
    "        max_cond_vars=3,\n",
    "        expert_knowledge=None,\n",
    "        enforce_expert_knowledge=False,\n",
    "        n_jobs=-1,\n",
    "        show_progress=False,\n",
    "        data_description=dataset_description,\n",
    "        node_description=variable_description,\n",
    "    )\n",
    "    pdag_colliders = orient_colliders(skel, sep_sets)\n",
    "    pdag = pdag_colliders.apply_meeks_rules(apply_r4=False)\n",
    "    pdag.add_nodes_from(set(df.columns) - set(pdag.nodes()))\n",
    "\n",
    "    cache_path = f\"vote_dir/{dsname}_{alpha}_{variant}.csv\"\n",
    "    cache_file = Path(cache_path)\n",
    "    if cache_file.exists():\n",
    "        try:\n",
    "            df_vote_dir = pd.read_csv(cache_path)\n",
    "            vote_dir = {\n",
    "                (row[\"Source\"], row[\"Target\"], row[\"Direction\"]): row[\"Votes\"]\n",
    "                for _, row in df_vote_dir.iterrows()\n",
    "            }\n",
    "        except Exception:\n",
    "            print(f'Fail Reading {cache_path}')\n",
    "    else: \n",
    "        # 2) LLM-based pair-wise orientation\n",
    "        vote_dir = vote_edges(\n",
    "            dataset_description, variable_description,\n",
    "            skel, sep_sets, pval, n_votes=N_VOTE, temperature=TEMP, model=LLM_MODEL\n",
    "        )\n",
    "        df_vote_dir = pd.DataFrame(\n",
    "            [(src, tgt, direction, count) for (src, tgt, direction), count in vote_dir.items()],\n",
    "            columns=[\"Source\", \"Target\", \"Direction\", \"Votes\"]\n",
    "        )\n",
    "        df_vote_dir.to_csv(cache_path, index=False)\n",
    "    directed_voted, undirected_voted = edges_from_votes(skel.edges(), vote_dir)\n",
    "    G_oriented = orient_edges(skel, directed_voted, undirected_voted, sep_sets, pval)\n",
    "    \n",
    "    pdag_Meek = pdag.copy()\n",
    "    for (u, v) in directed_voted:\n",
    "        if pdag_Meek.has_undirected_edge(u, v):\n",
    "            pdag_Meek.orient_undirected_edge(u, v, inplace=True)\n",
    "        elif pdag_Meek.has_undirected_edge(v, u):\n",
    "            pdag_Meek.orient_undirected_edge(v, u, inplace=True)\n",
    "\n",
    "    # 3) compute metrics for the raw PDAGs\n",
    "    pc_dir,  pc_und  = split_directed_undirected_edges(pdag)\n",
    "    llm_dir, llm_und = split_directed_undirected_edges(G_oriented)\n",
    "    meek_dir,meek_und  = split_directed_undirected_edges(pdag_Meek)\n",
    "\n",
    "    acc_pc  = accuracy_metrics(GroundTruth, pc_dir,  pc_und)\n",
    "    acc_llm = accuracy_metrics(GroundTruth, llm_dir, llm_und)\n",
    "    acc_meek= accuracy_metrics(GroundTruth, meek_dir, meek_und)\n",
    "    print('PC', acc_pc)\n",
    "    print('Ours', acc_llm)\n",
    "    print('Meek2013', acc_meek)\n",
    "    \n",
    "    # 4) apply the vote-based causal ordering, then metrics again\n",
    "    G_pc_ord  = order_PC(pdag,       vote_dir)\n",
    "    G_llm_ord = order_PC(G_oriented, vote_dir)\n",
    "    G_meek_ord= order_PC(pdag_Meek, vote_dir)\n",
    "    G_v_ord = order_PC(pdag_colliders, vote_dir)\n",
    "    G_skel_ord= order_PC(skel, vote_dir)\n",
    "\n",
    "    pcord_dir,  pcord_und  = split_directed_undirected_edges(G_pc_ord)\n",
    "    llmord_dir, llmord_und = split_directed_undirected_edges(G_llm_ord)\n",
    "    meekord_dir,meekord_und= split_directed_undirected_edges(G_meek_ord)\n",
    "    vord_dir, vord_und = split_directed_undirected_edges(G_v_ord)\n",
    "    skelord_dir,skelord_und= split_directed_undirected_edges(G_skel_ord)\n",
    "\n",
    "    acc_pc_ord  = accuracy_metrics(GroundTruth, pcord_dir,  pcord_und)\n",
    "    acc_llm_ord = accuracy_metrics(GroundTruth, llmord_dir, llmord_und)\n",
    "    acc_meek_ord= accuracy_metrics(GroundTruth, meekord_dir, meekord_und)\n",
    "    acc_v_ord = accuracy_metrics(GroundTruth, vord_dir, vord_und)\n",
    "    acc_skel_ord= accuracy_metrics(GroundTruth, skelord_dir, skelord_und)\n",
    "    print('PC + LLM order', acc_pc_ord)\n",
    "    print('Ours + LLM order', acc_llm_ord)\n",
    "    print('Meek2013 + LLM order', acc_meek_ord)\n",
    "    print('Collider + LLM order', acc_v_ord)\n",
    "    print('Skeleton + LLM order', acc_skel_ord)\n",
    "\n",
    "    # add α so it’s easy to filter later\n",
    "    for d in (acc_pc, acc_llm, acc_pc_ord, acc_llm_ord):\n",
    "        d[\"significance_level\"] = alpha\n",
    "\n",
    "    return acc_pc, acc_llm, acc_pc_ord, acc_llm_ord\n",
    "\n",
    "# --- MAIN LOOP ---------------------------------------------------------------\n",
    "logging.disable(logging.CRITICAL)\n",
    "\n",
    "for alpha in significance_levels:\n",
    "    print(f\"α = {alpha}\")\n",
    "    for variant in ['orig', 'stable', 'conservative']:\n",
    "        print(variant)\n",
    "        pc, llm, pc_ord, llm_ord = evaluate_one_level(df, alpha, variant)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ba293826",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/net/dali/home/mscbio/rul98/miniconda3/envs/ConstraintLLM/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "α = 1e-05\n",
      "orig\n",
      "PC {'precision': 0.8153846153846154, 'recall': 0.5, 'f1_score': 0.6198830409356725, 'true_positives': 26.5, 'false_positives': 6, 'false_negatives': 26.5, 'shd': 26.5, 'num_inferred_edges': 36, 'num_truth_edges': 46}\n",
      "Ours {'precision': 0.9722222222222222, 'recall': 0.7608695652173914, 'f1_score': 0.853658536585366, 'true_positives': 35, 'false_positives': 1, 'false_negatives': 11, 'shd': 11.0, 'num_inferred_edges': 36, 'num_truth_edges': 46}\n",
      "Meek2013 {'precision': 0.8055555555555556, 'recall': 0.6304347826086957, 'f1_score': 0.7073170731707318, 'true_positives': 29, 'false_positives': 7, 'false_negatives': 17, 'shd': 17.0, 'num_inferred_edges': 36, 'num_truth_edges': 46}\n",
      "PC + LLM order {'precision': 0.75, 'recall': 0.5869565217391305, 'f1_score': 0.6585365853658538, 'true_positives': 27, 'false_positives': 9, 'false_negatives': 19, 'shd': 19.0, 'num_inferred_edges': 36, 'num_truth_edges': 46}\n",
      "Ours + LLM order {'precision': 0.9722222222222222, 'recall': 0.7608695652173914, 'f1_score': 0.853658536585366, 'true_positives': 35, 'false_positives': 1, 'false_negatives': 11, 'shd': 11.0, 'num_inferred_edges': 36, 'num_truth_edges': 46}\n",
      "Meek2013 + LLM order {'precision': 0.8055555555555556, 'recall': 0.6304347826086957, 'f1_score': 0.7073170731707318, 'true_positives': 29, 'false_positives': 7, 'false_negatives': 17, 'shd': 17.0, 'num_inferred_edges': 36, 'num_truth_edges': 46}\n",
      "Collider + LLM order {'precision': 0.75, 'recall': 0.5869565217391305, 'f1_score': 0.6585365853658538, 'true_positives': 27, 'false_positives': 9, 'false_negatives': 19, 'shd': 19.0, 'num_inferred_edges': 36, 'num_truth_edges': 46}\n",
      "Skeleton + LLM order {'precision': 0.3611111111111111, 'recall': 0.2826086956521739, 'f1_score': 0.3170731707317073, 'true_positives': 13, 'false_positives': 23, 'false_negatives': 33, 'shd': 33.0, 'num_inferred_edges': 36, 'num_truth_edges': 46}\n",
      "stable\n",
      "PC {'precision': 0.84375, 'recall': 0.5192307692307693, 'f1_score': 0.6428571428571429, 'true_positives': 27.0, 'false_positives': 5, 'false_negatives': 25.0, 'shd': 25.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "Ours {'precision': 1.0, 'recall': 0.7608695652173914, 'f1_score': 0.8641975308641976, 'true_positives': 35, 'false_positives': 0, 'false_negatives': 11, 'shd': 11.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "Meek2013 {'precision': 0.8571428571428571, 'recall': 0.6521739130434783, 'f1_score': 0.7407407407407407, 'true_positives': 30, 'false_positives': 5, 'false_negatives': 16, 'shd': 16.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "PC + LLM order {'precision': 0.8285714285714286, 'recall': 0.6304347826086957, 'f1_score': 0.7160493827160495, 'true_positives': 29, 'false_positives': 6, 'false_negatives': 17, 'shd': 17.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "Ours + LLM order {'precision': 1.0, 'recall': 0.7608695652173914, 'f1_score': 0.8641975308641976, 'true_positives': 35, 'false_positives': 0, 'false_negatives': 11, 'shd': 11.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "Meek2013 + LLM order {'precision': 0.8571428571428571, 'recall': 0.6521739130434783, 'f1_score': 0.7407407407407407, 'true_positives': 30, 'false_positives': 5, 'false_negatives': 16, 'shd': 16.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "Collider + LLM order {'precision': 0.8285714285714286, 'recall': 0.6304347826086957, 'f1_score': 0.7160493827160495, 'true_positives': 29, 'false_positives': 6, 'false_negatives': 17, 'shd': 17.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "Skeleton + LLM order {'precision': 0.37142857142857144, 'recall': 0.2826086956521739, 'f1_score': 0.32098765432098764, 'true_positives': 13, 'false_positives': 22, 'false_negatives': 33, 'shd': 33.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "conservative\n",
      "PC {'precision': 0.78125, 'recall': 0.4807692307692308, 'f1_score': 0.5952380952380952, 'true_positives': 25.0, 'false_positives': 7, 'false_negatives': 27.0, 'shd': 27.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "Ours {'precision': 0.9714285714285714, 'recall': 0.7391304347826086, 'f1_score': 0.8395061728395062, 'true_positives': 34, 'false_positives': 1, 'false_negatives': 12, 'shd': 12.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "Meek2013 {'precision': 0.8, 'recall': 0.6086956521739131, 'f1_score': 0.6913580246913581, 'true_positives': 28, 'false_positives': 7, 'false_negatives': 18, 'shd': 18.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "PC + LLM order {'precision': 0.7428571428571429, 'recall': 0.5652173913043478, 'f1_score': 0.6419753086419753, 'true_positives': 26, 'false_positives': 9, 'false_negatives': 20, 'shd': 20.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "Ours + LLM order {'precision': 0.9714285714285714, 'recall': 0.7391304347826086, 'f1_score': 0.8395061728395062, 'true_positives': 34, 'false_positives': 1, 'false_negatives': 12, 'shd': 12.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "Meek2013 + LLM order {'precision': 0.8, 'recall': 0.6086956521739131, 'f1_score': 0.6913580246913581, 'true_positives': 28, 'false_positives': 7, 'false_negatives': 18, 'shd': 18.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "Collider + LLM order {'precision': 0.7428571428571429, 'recall': 0.5652173913043478, 'f1_score': 0.6419753086419753, 'true_positives': 26, 'false_positives': 9, 'false_negatives': 20, 'shd': 20.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "Skeleton + LLM order {'precision': 0.37142857142857144, 'recall': 0.2826086956521739, 'f1_score': 0.32098765432098764, 'true_positives': 13, 'false_positives': 22, 'false_negatives': 33, 'shd': 33.0, 'num_inferred_edges': 35, 'num_truth_edges': 46}\n",
      "α = 0.01\n",
      "orig\n",
      "PC {'precision': 0.717948717948718, 'recall': 0.56, 'f1_score': 0.6292134831460674, 'true_positives': 28.0, 'false_positives': 11, 'false_negatives': 22.0, 'shd': 23.0, 'num_inferred_edges': 41, 'num_truth_edges': 46}\n",
      "Ours {'precision': 0.9512195121951219, 'recall': 0.8478260869565217, 'f1_score': 0.8965517241379309, 'true_positives': 39, 'false_positives': 2, 'false_negatives': 7, 'shd': 8.0, 'num_inferred_edges': 41, 'num_truth_edges': 46}\n",
      "Meek2013 {'precision': 0.7317073170731707, 'recall': 0.6521739130434783, 'f1_score': 0.6896551724137931, 'true_positives': 30, 'false_positives': 11, 'false_negatives': 16, 'shd': 17.0, 'num_inferred_edges': 41, 'num_truth_edges': 46}\n",
      "PC + LLM order {'precision': 0.7317073170731707, 'recall': 0.6521739130434783, 'f1_score': 0.6896551724137931, 'true_positives': 30, 'false_positives': 11, 'false_negatives': 16, 'shd': 17.0, 'num_inferred_edges': 41, 'num_truth_edges': 46}\n",
      "Ours + LLM order {'precision': 0.9512195121951219, 'recall': 0.8478260869565217, 'f1_score': 0.8965517241379309, 'true_positives': 39, 'false_positives': 2, 'false_negatives': 7, 'shd': 8.0, 'num_inferred_edges': 41, 'num_truth_edges': 46}\n",
      "Meek2013 + LLM order {'precision': 0.7317073170731707, 'recall': 0.6521739130434783, 'f1_score': 0.6896551724137931, 'true_positives': 30, 'false_positives': 11, 'false_negatives': 16, 'shd': 17.0, 'num_inferred_edges': 41, 'num_truth_edges': 46}\n",
      "Collider + LLM order {'precision': 0.7804878048780488, 'recall': 0.6956521739130435, 'f1_score': 0.7356321839080461, 'true_positives': 32, 'false_positives': 9, 'false_negatives': 14, 'shd': 15.0, 'num_inferred_edges': 41, 'num_truth_edges': 46}\n",
      "Skeleton + LLM order {'precision': 0.3170731707317073, 'recall': 0.2826086956521739, 'f1_score': 0.29885057471264365, 'true_positives': 13, 'false_positives': 28, 'false_negatives': 33, 'shd': 34.0, 'num_inferred_edges': 41, 'num_truth_edges': 46}\n",
      "stable\n",
      "PC {'precision': 0.8108108108108109, 'recall': 0.6, 'f1_score': 0.6896551724137931, 'true_positives': 30.0, 'false_positives': 7, 'false_negatives': 20.0, 'shd': 21.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "Ours {'precision': 0.9487179487179487, 'recall': 0.8043478260869565, 'f1_score': 0.8705882352941177, 'true_positives': 37, 'false_positives': 2, 'false_negatives': 9, 'shd': 10.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "Meek2013 {'precision': 0.7948717948717948, 'recall': 0.6739130434782609, 'f1_score': 0.7294117647058824, 'true_positives': 31, 'false_positives': 8, 'false_negatives': 15, 'shd': 16.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "PC + LLM order {'precision': 0.7948717948717948, 'recall': 0.6739130434782609, 'f1_score': 0.7294117647058824, 'true_positives': 31, 'false_positives': 8, 'false_negatives': 15, 'shd': 16.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "Ours + LLM order {'precision': 0.9487179487179487, 'recall': 0.8043478260869565, 'f1_score': 0.8705882352941177, 'true_positives': 37, 'false_positives': 2, 'false_negatives': 9, 'shd': 10.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "Meek2013 + LLM order {'precision': 0.7948717948717948, 'recall': 0.6739130434782609, 'f1_score': 0.7294117647058824, 'true_positives': 31, 'false_positives': 8, 'false_negatives': 15, 'shd': 16.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "Collider + LLM order {'precision': 0.8205128205128205, 'recall': 0.6956521739130435, 'f1_score': 0.7529411764705882, 'true_positives': 32, 'false_positives': 7, 'false_negatives': 14, 'shd': 15.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "Skeleton + LLM order {'precision': 0.3333333333333333, 'recall': 0.2826086956521739, 'f1_score': 0.30588235294117644, 'true_positives': 13, 'false_positives': 26, 'false_negatives': 33, 'shd': 34.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "conservative\n",
      "PC {'precision': 0.7567567567567568, 'recall': 0.56, 'f1_score': 0.6436781609195402, 'true_positives': 28.0, 'false_positives': 9, 'false_negatives': 22.0, 'shd': 23.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "Ours {'precision': 0.9487179487179487, 'recall': 0.8043478260869565, 'f1_score': 0.8705882352941177, 'true_positives': 37, 'false_positives': 2, 'false_negatives': 9, 'shd': 10.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "Meek2013 {'precision': 0.7692307692307693, 'recall': 0.6521739130434783, 'f1_score': 0.7058823529411764, 'true_positives': 30, 'false_positives': 9, 'false_negatives': 16, 'shd': 17.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "PC + LLM order {'precision': 0.7692307692307693, 'recall': 0.6521739130434783, 'f1_score': 0.7058823529411764, 'true_positives': 30, 'false_positives': 9, 'false_negatives': 16, 'shd': 17.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "Ours + LLM order {'precision': 0.9487179487179487, 'recall': 0.8043478260869565, 'f1_score': 0.8705882352941177, 'true_positives': 37, 'false_positives': 2, 'false_negatives': 9, 'shd': 10.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "Meek2013 + LLM order {'precision': 0.7692307692307693, 'recall': 0.6521739130434783, 'f1_score': 0.7058823529411764, 'true_positives': 30, 'false_positives': 9, 'false_negatives': 16, 'shd': 17.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "Collider + LLM order {'precision': 0.7948717948717948, 'recall': 0.6739130434782609, 'f1_score': 0.7294117647058824, 'true_positives': 31, 'false_positives': 8, 'false_negatives': 15, 'shd': 16.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "Skeleton + LLM order {'precision': 0.3333333333333333, 'recall': 0.2826086956521739, 'f1_score': 0.30588235294117644, 'true_positives': 13, 'false_positives': 26, 'false_negatives': 33, 'shd': 34.0, 'num_inferred_edges': 39, 'num_truth_edges': 46}\n",
      "α = 0.5\n",
      "orig\n",
      "PC {'precision': 0.36134453781512604, 'recall': 0.4056603773584906, 'f1_score': 0.38222222222222224, 'true_positives': 21.5, 'false_positives': 38, 'false_negatives': 31.5, 'shd': 54.5, 'num_inferred_edges': 63, 'num_truth_edges': 46}\n",
      "Ours {'precision': 0.6190476190476191, 'recall': 0.8478260869565217, 'f1_score': 0.7155963302752294, 'true_positives': 39, 'false_positives': 24, 'false_negatives': 7, 'shd': 30.0, 'num_inferred_edges': 63, 'num_truth_edges': 46}\n",
      "Meek2013 {'precision': 0.3968253968253968, 'recall': 0.5434782608695652, 'f1_score': 0.45871559633027525, 'true_positives': 25, 'false_positives': 38, 'false_negatives': 21, 'shd': 44.0, 'num_inferred_edges': 63, 'num_truth_edges': 46}\n",
      "PC + LLM order {'precision': 0.38095238095238093, 'recall': 0.5217391304347826, 'f1_score': 0.4403669724770642, 'true_positives': 24, 'false_positives': 39, 'false_negatives': 22, 'shd': 45.0, 'num_inferred_edges': 63, 'num_truth_edges': 46}\n",
      "Ours + LLM order {'precision': 0.6190476190476191, 'recall': 0.8478260869565217, 'f1_score': 0.7155963302752294, 'true_positives': 39, 'false_positives': 24, 'false_negatives': 7, 'shd': 30.0, 'num_inferred_edges': 63, 'num_truth_edges': 46}\n",
      "Meek2013 + LLM order {'precision': 0.3968253968253968, 'recall': 0.5434782608695652, 'f1_score': 0.45871559633027525, 'true_positives': 25, 'false_positives': 38, 'false_negatives': 21, 'shd': 44.0, 'num_inferred_edges': 63, 'num_truth_edges': 46}\n",
      "Collider + LLM order {'precision': 0.3968253968253968, 'recall': 0.5434782608695652, 'f1_score': 0.45871559633027525, 'true_positives': 25, 'false_positives': 38, 'false_negatives': 21, 'shd': 44.0, 'num_inferred_edges': 63, 'num_truth_edges': 46}\n",
      "Skeleton + LLM order {'precision': 0.20634920634920634, 'recall': 0.2826086956521739, 'f1_score': 0.2385321100917431, 'true_positives': 13, 'false_positives': 50, 'false_negatives': 33, 'shd': 56.0, 'num_inferred_edges': 63, 'num_truth_edges': 46}\n",
      "stable\n",
      "PC {'precision': 0.6590909090909091, 'recall': 0.58, 'f1_score': 0.6170212765957446, 'true_positives': 29.0, 'false_positives': 15, 'false_negatives': 21.0, 'shd': 29.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "Ours {'precision': 0.8260869565217391, 'recall': 0.8260869565217391, 'f1_score': 0.8260869565217391, 'true_positives': 38, 'false_positives': 8, 'false_negatives': 8, 'shd': 16.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "Meek2013 {'precision': 0.6739130434782609, 'recall': 0.6739130434782609, 'f1_score': 0.6739130434782609, 'true_positives': 31, 'false_positives': 15, 'false_negatives': 15, 'shd': 23.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "PC + LLM order {'precision': 0.6739130434782609, 'recall': 0.6739130434782609, 'f1_score': 0.6739130434782609, 'true_positives': 31, 'false_positives': 15, 'false_negatives': 15, 'shd': 23.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "Ours + LLM order {'precision': 0.8260869565217391, 'recall': 0.8260869565217391, 'f1_score': 0.8260869565217391, 'true_positives': 38, 'false_positives': 8, 'false_negatives': 8, 'shd': 16.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "Meek2013 + LLM order {'precision': 0.6739130434782609, 'recall': 0.6739130434782609, 'f1_score': 0.6739130434782609, 'true_positives': 31, 'false_positives': 15, 'false_negatives': 15, 'shd': 23.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "Collider + LLM order {'precision': 0.6739130434782609, 'recall': 0.6739130434782609, 'f1_score': 0.6739130434782609, 'true_positives': 31, 'false_positives': 15, 'false_negatives': 15, 'shd': 23.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "Skeleton + LLM order {'precision': 0.2608695652173913, 'recall': 0.2608695652173913, 'f1_score': 0.2608695652173913, 'true_positives': 12, 'false_positives': 34, 'false_negatives': 34, 'shd': 42.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "conservative\n",
      "PC {'precision': 0.6590909090909091, 'recall': 0.58, 'f1_score': 0.6170212765957446, 'true_positives': 29.0, 'false_positives': 15, 'false_negatives': 21.0, 'shd': 29.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "Ours {'precision': 0.8260869565217391, 'recall': 0.8260869565217391, 'f1_score': 0.8260869565217391, 'true_positives': 38, 'false_positives': 8, 'false_negatives': 8, 'shd': 16.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "Meek2013 {'precision': 0.6739130434782609, 'recall': 0.6739130434782609, 'f1_score': 0.6739130434782609, 'true_positives': 31, 'false_positives': 15, 'false_negatives': 15, 'shd': 23.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "PC + LLM order {'precision': 0.6739130434782609, 'recall': 0.6739130434782609, 'f1_score': 0.6739130434782609, 'true_positives': 31, 'false_positives': 15, 'false_negatives': 15, 'shd': 23.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "Ours + LLM order {'precision': 0.8260869565217391, 'recall': 0.8260869565217391, 'f1_score': 0.8260869565217391, 'true_positives': 38, 'false_positives': 8, 'false_negatives': 8, 'shd': 16.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "Meek2013 + LLM order {'precision': 0.6739130434782609, 'recall': 0.6739130434782609, 'f1_score': 0.6739130434782609, 'true_positives': 31, 'false_positives': 15, 'false_negatives': 15, 'shd': 23.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "Collider + LLM order {'precision': 0.6956521739130435, 'recall': 0.6956521739130435, 'f1_score': 0.6956521739130435, 'true_positives': 32, 'false_positives': 14, 'false_negatives': 14, 'shd': 22.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n",
      "Skeleton + LLM order {'precision': 0.2608695652173913, 'recall': 0.2608695652173913, 'f1_score': 0.2608695652173913, 'true_positives': 12, 'false_positives': 34, 'false_negatives': 34, 'shd': 42.0, 'num_inferred_edges': 46, 'num_truth_edges': 46}\n"
     ]
    }
   ],
   "source": [
    "from data.datasets import fetch_with_metadata\n",
    "dsname = 'mildew'\n",
    "data = fetch_with_metadata(dsname)\n",
    "df, GroundTruth, pos_data = data['data']\n",
    "variable_description = data['variable_description']\n",
    "dataset_description = data['dataset_description']\n",
    "import pandas as pd\n",
    "import logging\n",
    "from pathlib import Path\n",
    "from utils import *\n",
    "# significance_levels = [1e-5, 1e-3, 1e-2, 5e-2, 1e-1, 5e-1, 8e-1]\n",
    "significance_levels = [1e-5, 1e-2, 5e-1]\n",
    "N_VOTE = 5\n",
    "LLM_MODEL = \"gpt-4o-mini\"\n",
    "TEMP = 0.6\n",
    "# container for results\n",
    "results = {\n",
    "    \"PC\": [],\n",
    "    \"LLM\": [],\n",
    "    \"PC_order\": [],\n",
    "    \"LLM_order\": []\n",
    "}\n",
    "\n",
    "def evaluate_one_level(df, alpha, variant=\"orig\"):\n",
    "    # 1) skeleton & collider orientation\n",
    "    skel, sep_sets, pval = build_skeleton_CI(\n",
    "        data=df,\n",
    "        variant=variant,\n",
    "        ci_test=\"chi_square\",\n",
    "        significance_level=alpha,\n",
    "        max_cond_vars=3,\n",
    "        expert_knowledge=None,\n",
    "        enforce_expert_knowledge=False,\n",
    "        n_jobs=-1,\n",
    "        show_progress=False,\n",
    "        data_description=dataset_description,\n",
    "        node_description=variable_description,\n",
    "    )\n",
    "    pdag_colliders = orient_colliders(skel, sep_sets)\n",
    "    pdag = pdag_colliders.apply_meeks_rules(apply_r4=False)\n",
    "    pdag.add_nodes_from(set(df.columns) - set(pdag.nodes()))\n",
    "\n",
    "    cache_path = f\"vote_dir/{dsname}_{alpha}_{variant}.csv\"\n",
    "    cache_file = Path(cache_path)\n",
    "    if cache_file.exists():\n",
    "        try:\n",
    "            df_vote_dir = pd.read_csv(cache_path)\n",
    "            vote_dir = {\n",
    "                (row[\"Source\"], row[\"Target\"], row[\"Direction\"]): row[\"Votes\"]\n",
    "                for _, row in df_vote_dir.iterrows()\n",
    "            }\n",
    "        except Exception:\n",
    "            print(f'Fail Reading {cache_path}')\n",
    "    else: \n",
    "        # 2) LLM-based pair-wise orientation\n",
    "        vote_dir = vote_edges(\n",
    "            dataset_description, variable_description,\n",
    "            skel, sep_sets, pval, n_votes=N_VOTE, temperature=TEMP, model=LLM_MODEL\n",
    "        )\n",
    "        df_vote_dir = pd.DataFrame(\n",
    "            [(src, tgt, direction, count) for (src, tgt, direction), count in vote_dir.items()],\n",
    "            columns=[\"Source\", \"Target\", \"Direction\", \"Votes\"]\n",
    "        )\n",
    "        df_vote_dir.to_csv(cache_path, index=False)\n",
    "    directed_voted, undirected_voted = edges_from_votes(skel.edges(), vote_dir)\n",
    "    G_oriented = orient_edges(skel, directed_voted, undirected_voted, sep_sets, pval)\n",
    "    \n",
    "    pdag_Meek = pdag.copy()\n",
    "    for (u, v) in directed_voted:\n",
    "        if pdag_Meek.has_undirected_edge(u, v):\n",
    "            pdag_Meek.orient_undirected_edge(u, v, inplace=True)\n",
    "        elif pdag_Meek.has_undirected_edge(v, u):\n",
    "            pdag_Meek.orient_undirected_edge(v, u, inplace=True)\n",
    "\n",
    "    # 3) compute metrics for the raw PDAGs\n",
    "    pc_dir,  pc_und  = split_directed_undirected_edges(pdag)\n",
    "    llm_dir, llm_und = split_directed_undirected_edges(G_oriented)\n",
    "    meek_dir,meek_und  = split_directed_undirected_edges(pdag_Meek)\n",
    "\n",
    "    acc_pc  = accuracy_metrics(GroundTruth, pc_dir,  pc_und)\n",
    "    acc_llm = accuracy_metrics(GroundTruth, llm_dir, llm_und)\n",
    "    acc_meek= accuracy_metrics(GroundTruth, meek_dir, meek_und)\n",
    "    print('PC', acc_pc)\n",
    "    print('Ours', acc_llm)\n",
    "    print('Meek2013', acc_meek)\n",
    "    \n",
    "    # 4) apply the vote-based causal ordering, then metrics again\n",
    "    G_pc_ord  = order_PC(pdag,       vote_dir)\n",
    "    G_llm_ord = order_PC(G_oriented, vote_dir)\n",
    "    G_meek_ord= order_PC(pdag_Meek, vote_dir)\n",
    "    G_v_ord = order_PC(pdag_colliders, vote_dir)\n",
    "    G_skel_ord= order_PC(skel, vote_dir)\n",
    "\n",
    "    pcord_dir,  pcord_und  = split_directed_undirected_edges(G_pc_ord)\n",
    "    llmord_dir, llmord_und = split_directed_undirected_edges(G_llm_ord)\n",
    "    meekord_dir,meekord_und= split_directed_undirected_edges(G_meek_ord)\n",
    "    vord_dir, vord_und = split_directed_undirected_edges(G_v_ord)\n",
    "    skelord_dir,skelord_und= split_directed_undirected_edges(G_skel_ord)\n",
    "\n",
    "    acc_pc_ord  = accuracy_metrics(GroundTruth, pcord_dir,  pcord_und)\n",
    "    acc_llm_ord = accuracy_metrics(GroundTruth, llmord_dir, llmord_und)\n",
    "    acc_meek_ord= accuracy_metrics(GroundTruth, meekord_dir, meekord_und)\n",
    "    acc_v_ord = accuracy_metrics(GroundTruth, vord_dir, vord_und)\n",
    "    acc_skel_ord= accuracy_metrics(GroundTruth, skelord_dir, skelord_und)\n",
    "    print('PC + LLM order', acc_pc_ord)\n",
    "    print('Ours + LLM order', acc_llm_ord)\n",
    "    print('Meek2013 + LLM order', acc_meek_ord)\n",
    "    print('Collider + LLM order', acc_v_ord)\n",
    "    print('Skeleton + LLM order', acc_skel_ord)\n",
    "\n",
    "    # add α so it’s easy to filter later\n",
    "    for d in (acc_pc, acc_llm, acc_pc_ord, acc_llm_ord):\n",
    "        d[\"significance_level\"] = alpha\n",
    "\n",
    "    return acc_pc, acc_llm, acc_pc_ord, acc_llm_ord\n",
    "\n",
    "# --- MAIN LOOP ---------------------------------------------------------------\n",
    "# Sweep every significance level and PC variant, keeping the metrics of every\n",
    "# run rather than only those of the final iteration.\n",
    "# NOTE: logging stays disabled after this cell; call\n",
    "# logging.disable(logging.NOTSET) to re-enable it.\n",
    "logging.disable(logging.CRITICAL)\n",
    "\n",
    "PC_VARIANTS = ('orig', 'stable', 'conservative')\n",
    "# Labels follow the order of the tuple returned by evaluate_one_level.\n",
    "METHOD_LABELS = ('PC', 'Ours', 'PC + LLM order', 'Ours + LLM order')\n",
    "\n",
    "eval_records = []  # one flat record per (alpha, variant, method)\n",
    "for alpha in significance_levels:\n",
    "    print(f\"α = {alpha}\")\n",
    "    for variant in PC_VARIANTS:\n",
    "        print(variant)\n",
    "        pc, llm, pc_ord, llm_ord = evaluate_one_level(df, alpha, variant)\n",
    "        for label, metrics in zip(METHOD_LABELS, (pc, llm, pc_ord, llm_ord)):\n",
    "            # Build a new dict so the tags do not leak into the returned metrics.\n",
    "            eval_records.append({**metrics, \"variant\": variant, \"method\": label})\n",
    "\n",
    "# Long-format table; filter on significance_level / variant / method.\n",
    "eval_results = pd.DataFrame(eval_records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d038ee61",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ConstraintLLM",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
