{
  "best node": {
    "overall_plan": "The crash arises because timestamps within each case were kept as a pandas Series and indexed with integer labels, which pandas interprets as label-based indexing; for non-RangeIndex groups, accessing ts[k] raises KeyError. I fix this by converting timestamps to a dense numpy array (ts = ... .to_numpy()), ensuring positional indexing. I also make the prefix generation loop explicitly stop at T-1 to guarantee the existence of a next event, preventing off-by-one mistakes. The rest of the pipeline remains intact: time-based split by case start, prefix features, a simple LSTM baseline, and required BPM metrics and saving.",
    "analysis": "",
    "metric": {
      "value": {
        "metric_names": [
          {
            "metric_name": "loss",
            "lower_is_better": true,
            "description": "Cross-entropy loss averaged over samples.",
            "data": [
              {
                "dataset_name": "BPI2012 tr",
                "final_value": 0.5148,
                "best_value": 0.5148
              },
              {
                "dataset_name": "BPI2012 dev",
                "final_value": 0.5073,
                "best_value": 0.5073
              },
              {
                "dataset_name": "BPI2012 ts",
                "final_value": 0.5355,
                "best_value": 0.5355
              },
              {
                "dataset_name": "BPI2017 tr",
                "final_value": 0.3607,
                "best_value": 0.3607
              },
              {
                "dataset_name": "BPI2017 dev",
                "final_value": 0.3756,
                "best_value": 0.3756
              },
              {
                "dataset_name": "BPI2017 ts",
                "final_value": 0.3877,
                "best_value": 0.3877
              },
              {
                "dataset_name": "ROAD tr",
                "final_value": 0.4662,
                "best_value": 0.4662
              },
              {
                "dataset_name": "ROAD dev",
                "final_value": 0.4274,
                "best_value": 0.4274
              },
              {
                "dataset_name": "ROAD ts",
                "final_value": 0.4833,
                "best_value": 0.4833
              }
            ]
          },
          {
            "metric_name": "accuracy",
            "lower_is_better": false,
            "description": "Top-1 classification accuracy.",
            "data": [
              {
                "dataset_name": "BPI2012 tr",
                "final_value": 0.7777,
                "best_value": 0.7777
              },
              {
                "dataset_name": "BPI2012 dev",
                "final_value": 0.7639,
                "best_value": 0.7639
              },
              {
                "dataset_name": "BPI2012 ts",
                "final_value": 0.7569,
                "best_value": 0.7569
              },
              {
                "dataset_name": "BPI2017 tr",
                "final_value": 0.8422,
                "best_value": 0.8422
              },
              {
                "dataset_name": "BPI2017 dev",
                "final_value": 0.8405,
                "best_value": 0.8405
              },
              {
                "dataset_name": "BPI2017 ts",
                "final_value": 0.8332,
                "best_value": 0.8332
              },
              {
                "dataset_name": "ROAD tr",
                "final_value": 0.7894,
                "best_value": 0.7894
              },
              {
                "dataset_name": "ROAD dev",
                "final_value": 0.8122,
                "best_value": 0.8122
              },
              {
                "dataset_name": "ROAD ts",
                "final_value": 0.802,
                "best_value": 0.802
              }
            ]
          },
          {
            "metric_name": "F1 score",
            "lower_is_better": false,
            "description": "Harmonic mean of precision and recall (macro/weighted not specified).",
            "data": [
              {
                "dataset_name": "BPI2012 tr",
                "final_value": 0.5609,
                "best_value": 0.5609
              },
              {
                "dataset_name": "BPI2012 dev",
                "final_value": 0.6007,
                "best_value": 0.6007
              },
              {
                "dataset_name": "BPI2012 ts",
                "final_value": 0.5872,
                "best_value": 0.5872
              },
              {
                "dataset_name": "BPI2017 tr",
                "final_value": 0.5721,
                "best_value": 0.5721
              },
              {
                "dataset_name": "BPI2017 dev",
                "final_value": 0.618,
                "best_value": 0.618
              },
              {
                "dataset_name": "BPI2017 ts",
                "final_value": 0.571,
                "best_value": 0.571
              },
              {
                "dataset_name": "ROAD tr",
                "final_value": 0.5395,
                "best_value": 0.5395
              },
              {
                "dataset_name": "ROAD dev",
                "final_value": 0.6664,
                "best_value": 0.6664
              },
              {
                "dataset_name": "ROAD ts",
                "final_value": 0.474,
                "best_value": 0.474
              }
            ]
          },
          {
            "metric_name": "top-3 accuracy",
            "lower_is_better": false,
            "description": "Fraction of samples where the correct label is among the top-3 predictions.",
            "data": [
              {
                "dataset_name": "BPI2012 tr",
                "final_value": 0.9868,
                "best_value": 0.9868
              },
              {
                "dataset_name": "BPI2012 dev",
                "final_value": 0.9861,
                "best_value": 0.9861
              },
              {
                "dataset_name": "BPI2012 ts",
                "final_value": 0.9874,
                "best_value": 0.9874
              },
              {
                "dataset_name": "BPI2017 tr",
                "final_value": 0.9941,
                "best_value": 0.9941
              },
              {
                "dataset_name": "BPI2017 dev",
                "final_value": 0.9928,
                "best_value": 0.9928
              },
              {
                "dataset_name": "BPI2017 ts",
                "final_value": 0.9906,
                "best_value": 0.9906
              },
              {
                "dataset_name": "ROAD tr",
                "final_value": 0.9986,
                "best_value": 0.9986
              },
              {
                "dataset_name": "ROAD dev",
                "final_value": 0.9969,
                "best_value": 0.9969
              },
              {
                "dataset_name": "ROAD ts",
                "final_value": 0.9936,
                "best_value": 0.9936
              }
            ]
          }
        ]
      },
      "maximize": null,
      "name": null,
      "description": null
    },
    "code": "import os\n\nworking_dir = os.path.join(os.getcwd(), \"working\")\nos.makedirs(working_dir, exist_ok=True)\n\nimport numpy as np\nimport pandas as pd\nimport torch\nfrom torch import nn\nfrom torch.utils.data import Dataset, DataLoader\nfrom sklearn.metrics import f1_score, accuracy_score\nimport random\nimport math\nimport matplotlib.pyplot as plt\n\n# Device handling (required)\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\nprint(f\"Using device: {device}\")\n\n# Experiment data structure\nexperiment_data = {\n    \"BPI2012\": {\n        \"metrics\": {\"train\": [], \"val\": [], \"test\": []},\n        \"losses\": {\"train\": [], \"val\": []},\n        \"predictions\": [],\n        \"ground_truth\": [],\n        \"epochs\": [],\n    },\n    \"BPI2017\": {\n        \"metrics\": {\"train\": [], \"val\": [], \"test\": []},\n        \"losses\": {\"train\": [], \"val\": []},\n        \"predictions\": [],\n        \"ground_truth\": [],\n        \"epochs\": [],\n    },\n    \"ROAD\": {\n        \"metrics\": {\"train\": [], \"val\": [], \"test\": []},\n        \"losses\": {\"train\": [], \"val\": []},\n        \"predictions\": [],\n        \"ground_truth\": [],\n        \"epochs\": [],\n    },\n}\n\n# Data loading utilities (use provided helper)\nfrom ai_scientist.ideas.my_research_topic import load_datasets, pick_default_dataset\n\n\n# Reproducibility\ndef set_seed(seed=42):\n    random.seed(seed)\n    np.random.seed(seed)\n    torch.manual_seed(seed)\n    if torch.cuda.is_available():\n        torch.cuda.manual_seed_all(seed)\n\n\nset_seed(42)\n\n\n# Build prefixes\ndef build_prefix_dataset(df, max_prefix_len=10, min_prefix_len=1):\n    # Keep only 'complete' transitions if lifecycle exists\n    df = df.copy()\n    if \"lifecycle\" in df.columns:\n        df = df[df[\"lifecycle\"].astype(str).str.lower().eq(\"complete\")]\n        if len(df) == 0:\n            df = df.copy()  # fallback if empty\n            df = df.sort_values([\"case_id\", \"timestamp\"])\n    df = df.sort_values([\"case_id\", \"timestamp\"])\n    # Build activity vocab\n    acts = df[\"activity\"].astype(str).unique().tolist()\n    act2id = {a: i + 1 for i, a in enumerate(sorted(acts))}  # 0 for PAD\n    id2act = {i: a for a, i in act2id.items()}\n    pad_id = 0\n\n    samples = []\n    for cid, g in df.groupby(\"case_id\"):\n        g = g.sort_values(\"timestamp\")\n        # Convert to numpy arrays for safe positional indexing\n        ts_ns = (\n            pd.to_datetime(g[\"timestamp\"], utc=True).astype(\"int64\").to_numpy()\n        )  # nanoseconds\n        ts = (ts_ns // 10**9).astype(np.int64)  # seconds as numpy array\n        acts_ids = np.array(\n            [act2id[a] for a in g[\"activity\"].astype(str).tolist()], dtype=np.int64\n        )\n        # simple calendar features\n        g_ts = pd.to_datetime(g[\"timestamp\"], utc=True)\n        hours = (g_ts.dt.hour.to_numpy(dtype=float) / 23.0).astype(np.float32)\n        weekdays = (g_ts.dt.weekday.to_numpy(dtype=float) / 6.0).astype(np.float32)\n        working = (\n            (g_ts.dt.weekday.to_numpy() < 5)\n            & (g_ts.dt.hour.to_numpy() >= 8)\n            & (g_ts.dt.hour.to_numpy() <= 17)\n        ).astype(np.float32)\n        # time deltas and since start in seconds\n        deltas = np.diff(ts, prepend=ts[0]).astype(np.float32)\n        since_start = (ts - ts[0]).astype(np.float32)\n        feats = np.stack(\n            [deltas, since_start, hours, weekdays, working], axis=1\n        ).astype(\n            np.float32\n        )  # [T,5]\n        T = len(acts_ids)\n        if T < 2:\n            continue\n        # Generate prefixes of length k (min_prefix_len..min(max_prefix_len, T-1)); target = activity at position k\n        max_k = min(max_prefix_len, T - 1)\n        for k in range(min_prefix_len, max_k + 1):\n            seq_acts = acts_ids[:k].tolist()\n            seq_feats = feats[:k]\n            target = int(acts_ids[k])\n            samples.append(\n                {\n                    \"case_id\": cid,\n                    \"seq_acts\": seq_acts,\n                    \"seq_feats\": seq_feats.copy(),\n                    \"target\": target,\n                    \"last_ts\": int(ts[k - 1]),\n                    \"next_ts\": int(ts[k]),\n                }\n            )\n\n    if len(samples) == 0:\n        return samples, act2id, id2act, pad_id\n\n    # Collect feature normalization stats over all feats (initial; will be recomputed on train split)\n    all_feats = np.concatenate(\n        [s[\"seq_feats\"] for s in samples if len(s[\"seq_feats\"]) > 0], axis=0\n    )\n    dt_mean, dt_std = all_feats[:, 0].mean(), all_feats[:, 0].std() + 1e-6\n    ss_mean, ss_std = all_feats[:, 1].mean(), all_feats[:, 1].std() + 1e-6\n    for s in samples:\n        if s[\"seq_feats\"].shape[0] > 0:\n            s[\"seq_feats\"][:, 0] = (s[\"seq_feats\"][:, 0] - dt_mean) / dt_std\n            s[\"seq_feats\"][:, 1] = (s[\"seq_feats\"][:, 1] - ss_mean) / ss_std\n    return samples, act2id, id2act, pad_id\n\n\n# Time-based split by case start time\ndef time_based_split(df, train_frac=0.7, val_frac=0.15):\n    starts = (\n        df.sort_values(\"timestamp\").groupby(\"case_id\")[\"timestamp\"].min().reset_index()\n    )\n    starts = starts.sort_values(\"timestamp\").reset_index(drop=True)\n    n = len(starts)\n    n_train = int(n * train_frac)\n    n_val = int(n * val_frac)\n    train_cases = set(starts.iloc[:n_train][\"case_id\"])\n    val_cases = set(starts.iloc[n_train : n_train + n_val][\"case_id\"])\n    test_cases = set(starts.iloc[n_train + n_val :][\"case_id\"])\n    return train_cases, val_cases, test_cases\n\n\nclass PrefixDataset(Dataset):\n    def __init__(self, samples, pad_id, max_len=10, num_cont=5):\n        self.samples = samples\n        self.pad_id = pad_id\n        self.max_len = max_len\n        self.num_cont = num_cont\n\n    def __len__(self):\n        return len(self.samples)\n\n    def __getitem__(self, idx):\n        s = self.samples[idx]\n        seq = s[\"seq_acts\"][-self.max_len :]\n        feats = s[\"seq_feats\"][-self.max_len :]\n        L = len(seq)\n        pad_len = self.max_len - L\n        seq_pad = [self.pad_id] * pad_len + seq\n        feats_pad = np.zeros((pad_len, self.num_cont), dtype=np.float32)\n        feats_pad = np.vstack([feats_pad, feats.astype(np.float32)])\n        attn_mask = np.array([0] * pad_len + [1] * L, dtype=np.float32)\n        return {\n            \"acts\": torch.tensor(seq_pad, dtype=torch.long),\n            \"feats\": torch.tensor(feats_pad, dtype=torch.float32),\n            \"mask\": torch.tensor(attn_mask, dtype=torch.float32),\n            \"y\": torch.tensor(s[\"target\"], dtype=torch.long),\n        }\n\n\nclass LSTMBaseline(nn.Module):\n    def __init__(\n        self, vocab_size, emb_dim=64, cont_dim=5, hidden=128, num_layers=1, pad_idx=0\n    ):\n        super().__init__()\n        self.emb = nn.Embedding(vocab_size + 1, emb_dim, padding_idx=pad_idx)\n        self.lstm = nn.LSTM(\n            input_size=emb_dim + cont_dim,\n            hidden_size=hidden,\n            batch_first=True,\n            num_layers=num_layers,\n        )\n        self.dropout = nn.Dropout(0.2)\n        self.fc = nn.Linear(hidden, vocab_size + 1)  # includes PAD index\n        self.pad_idx = pad_idx\n\n    def forward(self, acts, feats, mask):\n        x = self.emb(acts)  # [B,T,emb]\n        x = torch.cat([x, feats], dim=-1)\n        out, (h, c) = self.lstm(x)\n        h_last = h[-1]\n        h_last = self.dropout(h_last)\n        logits = self.fc(h_last)\n        return logits\n\n\ndef collate_fn(batch):\n    keys = batch[0].keys()\n    out = {k: torch.stack([b[k] for b in batch], dim=0) for k in keys}\n    return out\n\n\ndef evaluate(model, loader, criterion, device, num_classes, pad_idx):\n    model.eval()\n    total_loss = 0.0\n    ys, preds_top1, preds_probs = [], [], []\n    top3_correct = 0\n    n_total = 0\n    with torch.no_grad():\n        for batch in loader:\n            batch = {\n                k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)\n            }\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = criterion(logits, batch[\"y\"])\n            total_loss += loss.item() * logits.size(0)\n            probs = torch.softmax(logits, dim=1)\n            top1 = torch.argmax(probs, dim=1)\n            k_val = min(3, probs.size(1))\n            _, topk_idx = torch.topk(probs, k=k_val, dim=1)\n            ys.extend(batch[\"y\"].detach().cpu().tolist())\n            preds_top1.extend(top1.detach().cpu().tolist())\n            preds_probs.append(probs.detach().cpu().numpy())\n            # top-3 correctness\n            for i in range(batch[\"y\"].size(0)):\n                if batch[\"y\"][i].item() in topk_idx[i].detach().cpu().tolist():\n                    top3_correct += 1\n            n_total += batch[\"y\"].size(0)\n    avg_loss = total_loss / max(1, n_total)\n    y_true = np.array(ys)\n    y_pred = np.array(preds_top1)\n    mask = y_true != pad_idx\n    y_true = y_true[mask]\n    y_pred = y_pred[mask]\n    acc = float(accuracy_score(y_true, y_pred)) if len(y_true) > 0 else 0.0\n    try:\n        f1 = float(f1_score(y_true, y_pred, average=\"macro\"))\n    except Exception:\n        f1 = 0.0\n    top3 = float(top3_correct / max(1, n_total))\n    probs_concat = (\n        np.concatenate(preds_probs, axis=0)\n        if len(preds_probs) > 0\n        else np.zeros((0, num_classes + 1))\n    )\n    return avg_loss, acc, f1, top3, y_true, y_pred, probs_concat\n\n\ndef train_one_dataset(\n    name, df, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n):\n    print(f\"\\n=== Dataset: {name} ===\")\n    # Time-based split\n    train_cases, val_cases, test_cases = time_based_split(df, 0.7, 0.15)\n    # Build samples across all to get vocab; we'll re-normalize with train stats\n    samples_all, act2id, id2act, pad_id = build_prefix_dataset(\n        df, max_prefix_len=max_prefix_len\n    )\n    # Filter per split\n    samples_train = [s for s in samples_all if s[\"case_id\"] in train_cases]\n    samples_val = [s for s in samples_all if s[\"case_id\"] in val_cases]\n    samples_test = [s for s in samples_all if s[\"case_id\"] in test_cases]\n    # Recompute normalization using train samples only\n    if len(samples_train) > 0:\n        concat_feats = [\n            s[\"seq_feats\"] for s in samples_train if s[\"seq_feats\"].shape[0] > 0\n        ]\n        if len(concat_feats) > 0:\n            all_feats = np.concatenate(concat_feats, axis=0)\n            dt_mean, dt_std = all_feats[:, 0].mean(), all_feats[:, 0].std() + 1e-6\n            ss_mean, ss_std = all_feats[:, 1].mean(), all_feats[:, 1].std() + 1e-6\n\n            def norm_samples(samples):\n                for s in samples:\n                    if s[\"seq_feats\"].shape[0] > 0:\n                        s[\"seq_feats\"][:, 0] = (s[\"seq_feats\"][:, 0] - dt_mean) / dt_std\n                        s[\"seq_feats\"][:, 1] = (s[\"seq_feats\"][:, 1] - ss_mean) / ss_std\n\n            norm_samples(samples_train)\n            norm_samples(samples_val)\n            norm_samples(samples_test)\n    print(\n        f\"Samples train/val/test: {len(samples_train)}/{len(samples_val)}/{len(samples_test)}; vocab={len(act2id)}\"\n    )\n    if len(samples_train) == 0 or len(act2id) < 2:\n        print(\"Not enough data to train. Skipping.\")\n        return\n    ds_train = PrefixDataset(\n        samples_train, pad_id=pad_id, max_len=max_prefix_len, num_cont=5\n    )\n    ds_val = PrefixDataset(\n        samples_val, pad_id=pad_id, max_len=max_prefix_len, num_cont=5\n    )\n    ds_test = PrefixDataset(\n        samples_test, pad_id=pad_id, max_len=max_prefix_len, num_cont=5\n    )\n    dl_train = DataLoader(\n        ds_train,\n        batch_size=batch_size,\n        shuffle=True,\n        collate_fn=collate_fn,\n        num_workers=0,\n    )\n    dl_val = DataLoader(\n        ds_val,\n        batch_size=batch_size,\n        shuffle=False,\n        collate_fn=collate_fn,\n        num_workers=0,\n    )\n    dl_test = DataLoader(\n        ds_test,\n        batch_size=batch_size,\n        shuffle=False,\n        collate_fn=collate_fn,\n        num_workers=0,\n    )\n\n    # Model\n    model = LSTMBaseline(\n        vocab_size=len(act2id),\n        emb_dim=64,\n        cont_dim=5,\n        hidden=128,\n        num_layers=1,\n        pad_idx=pad_id,\n    ).to(device)\n    criterion = nn.CrossEntropyLoss().to(device)\n    optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n\n    # Training loop\n    best_val_top3 = -1.0\n    best_state = None\n    hist = {\"train_loss\": [], \"val_loss\": [], \"val_top3\": []}\n    for epoch in range(1, max_epochs + 1):\n        model.train()\n        total = 0\n        running_loss = 0.0\n        for batch in dl_train:\n            batch = {\n                k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)\n            }\n            optimizer.zero_grad()\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = criterion(logits, batch[\"y\"])\n            loss.backward()\n            optimizer.step()\n            running_loss += loss.item() * logits.size(0)\n            total += logits.size(0)\n        train_loss = running_loss / max(1, total)\n        val_loss, val_acc, val_f1, val_top3, _, _, _ = evaluate(\n            model, dl_val, criterion, device, len(act2id), pad_id\n        )\n        print(\n            f\"Epoch {epoch}: validation_loss = {val_loss:.4f} | val_acc={val_acc:.4f} | val_f1={val_f1:.4f} | val_top3={val_top3:.4f}\"\n        )\n        hist[\"train_loss\"].append(train_loss)\n        hist[\"val_loss\"].append(val_loss)\n        hist[\"val_top3\"].append(val_top3)\n        experiment_data[name][\"losses\"][\"train\"].append((epoch, train_loss))\n        experiment_data[name][\"losses\"][\"val\"].append((epoch, val_loss))\n        experiment_data[name][\"metrics\"][\"val\"].append(\n            (epoch, {\"acc\": val_acc, \"macro_f1\": val_f1, \"top3\": val_top3})\n        )\n        experiment_data[name][\"epochs\"].append(epoch)\n        if val_top3 > best_val_top3:\n            best_val_top3 = val_top3\n            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}\n\n    # Load best\n    if best_state is not None:\n        model.load_state_dict(best_state)\n        model.to(device)\n\n    # Final eval on train/val/test\n    train_loss, train_acc, train_f1, train_top3, _, _, _ = evaluate(\n        model, dl_train, criterion, device, len(act2id), pad_id\n    )\n    val_loss, val_acc, val_f1, val_top3, _, _, _ = evaluate(\n        model, dl_val, criterion, device, len(act2id), pad_id\n    )\n    test_loss, test_acc, test_f1, test_top3, y_true_t, y_pred_t, probs_t = evaluate(\n        model, dl_test, criterion, device, len(act2id), pad_id\n    )\n    print(\n        f\"[{name}] Train: loss={train_loss:.4f} acc={train_acc:.4f} f1={train_f1:.4f} top3={train_top3:.4f}\"\n    )\n    print(\n        f\"[{name}] Test:  loss={test_loss:.4f} acc={test_acc:.4f} f1={test_f1:.4f} top3={test_top3:.4f}\"\n    )\n\n    # Save metrics\n    experiment_data[name][\"metrics\"][\"train\"].append(\n        (\n            \"final\",\n            {\n                \"loss\": train_loss,\n                \"acc\": train_acc,\n                \"macro_f1\": train_f1,\n                \"top3\": train_top3,\n            },\n        )\n    )\n    experiment_data[name][\"metrics\"][\"val\"].append(\n        (\n            \"final\",\n            {\"loss\": val_loss, \"acc\": val_acc, \"macro_f1\": val_f1, \"top3\": val_top3},\n        )\n    )\n    experiment_data[name][\"metrics\"][\"test\"].append(\n        (\n            \"final\",\n            {\n                \"loss\": test_loss,\n                \"acc\": test_acc,\n                \"macro_f1\": test_f1,\n                \"top3\": test_top3,\n            },\n        )\n    )\n    experiment_data[name][\"predictions\"] = y_pred_t.tolist()\n    experiment_data[name][\"ground_truth\"] = y_true_t.tolist()\n\n    # Plots\n    try:\n        plt.figure()\n        plt.plot(hist[\"train_loss\"], label=\"train_loss\")\n        plt.plot(hist[\"val_loss\"], label=\"val_loss\")\n        plt.legend()\n        plt.title(f\"Loss Curves - {name}\")\n        plt.xlabel(\"Epoch\")\n        plt.ylabel(\"Loss\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"loss_curves_{name}.png\"))\n        plt.close()\n\n        plt.figure()\n        plt.plot(hist[\"val_top3\"], label=\"val_top3\")\n        plt.legend()\n        plt.title(f\"Val Top-3 Acc - {name}\")\n        plt.xlabel(\"Epoch\")\n        plt.ylabel(\"Top-3 Acc\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"val_top3_{name}.png\"))\n        plt.close()\n    except Exception as e:\n        print(f\"[warn] Plotting failed: {e}\")\n\n    # Save confusion matrix-like data (optional)\n    try:\n        from sklearn.metrics import confusion_matrix\n\n        cm = confusion_matrix(y_true_t, y_pred_t)\n        np.save(os.path.join(working_dir, f\"cm_{name}.npy\"), cm)\n    except Exception as e:\n        print(f\"[warn] Confusion matrix failed: {e}\")\n\n\ndef main():\n    datasets = load_datasets()\n    # Loop through loaded datasets; cap to 5000 earliest cases for speed\n    for key, df in datasets.items():\n        try:\n            starts = (\n                df.sort_values(\"timestamp\")\n                .groupby(\"case_id\")[\"timestamp\"]\n                .min()\n                .reset_index()\n            )\n            if len(starts) > 5000:\n                keep_cases = set(starts.iloc[:5000][\"case_id\"])\n                df_small = df[df[\"case_id\"].isin(keep_cases)].copy()\n            else:\n                df_small = df\n        except Exception:\n            df_small = df\n        train_one_dataset(\n            key, df_small, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n        )\n    # Save experiment data\n    np.save(os.path.join(working_dir, \"experiment_data.npy\"), experiment_data)\n    np.savez_compressed(\n        os.path.join(working_dir, \"experiment_data_compressed.npz\"),\n        data=experiment_data,\n    )\n\n\n# Execute immediately\nmain()\n",
    "plot_code": "import matplotlib.pyplot as plt\nimport numpy as np\nimport os\n\nworking_dir = os.path.join(os.getcwd(), \"working\")\nos.makedirs(working_dir, exist_ok=True)\n\nimport pandas as pd\nimport torch\nfrom torch import nn\nfrom torch.utils.data import Dataset, DataLoader\nfrom sklearn.metrics import (\n    f1_score,\n    accuracy_score,\n    confusion_matrix,\n    precision_recall_curve,\n    average_precision_score,\n)\nfrom sklearn.preprocessing import label_binarize\nimport random\nimport math\nfrom collections import defaultdict\n\n# Reproducibility and device\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\nprint(f\"Using device: {device}\")\n\n\ndef set_seed(seed=42):\n    random.seed(seed)\n    np.random.seed(seed)\n    torch.manual_seed(seed)\n    if torch.cuda.is_available():\n        torch.cuda.manual_seed_all(seed)\n\n\nset_seed(42)\n\n\n# --------- Data loading from local XES via pm4py ----------\ndef load_xes_folder(data_dir=\"data\"):\n    datasets = {}\n    try:\n        import pm4py\n    except Exception as e:\n        print(f\"pm4py not available: {e}\")\n        return datasets\n    if not os.path.isdir(data_dir):\n        print(f\"Data directory not found: {data_dir}\")\n        return datasets\n    for fn in os.listdir(data_dir):\n        if fn.lower().endswith(\".xes\") or fn.lower().endswith(\".xes.gz\"):\n            path = os.path.join(data_dir, fn)\n            try:\n                log = pm4py.read_xes(path)\n                df = pm4py.convert_to_dataframe(log)\n                # Standardize columns\n                # pm4py dataframe typically has case:concept:name, concept:name, time:timestamp, lifecycle:transition\n                cols = df.columns\n                case_col = (\n                    \"case:concept:name\"\n                    if \"case:concept:name\" in cols\n                    else (\"case\" if \"case\" in cols else None)\n                )\n                act_col = (\n                    \"concept:name\"\n                    if \"concept:name\" in cols\n                    else (\"activity\" if \"activity\" in cols else None)\n                )\n                ts_col = (\n                    \"time:timestamp\"\n                    if \"time:timestamp\" in cols\n                    else (\"timestamp\" if \"timestamp\" in cols else None)\n                )\n                life_col = (\n                    \"lifecycle:transition\"\n                    if \"lifecycle:transition\" in cols\n                    else (\"lifecycle\" if \"lifecycle\" in cols else None)\n                )\n                if case_col is None or act_col is None or ts_col is None:\n                    print(f\"Missing required columns in {fn}, skipping.\")\n                    continue\n                out = pd.DataFrame(\n                    {\n                        \"case_id\": df[case_col].astype(str).values,\n                        \"activity\": df[act_col].astype(str).values,\n                        \"timestamp\": pd.to_datetime(df[ts_col], utc=True),\n                    }\n                )\n                if life_col is not None:\n                    out[\"lifecycle\"] = df[life_col].astype(str).values\n                name = os.path.splitext(fn)[0]\n                datasets[name] = out\n                print(\n                    f\"Loaded {name}: {len(out)} events, {out['case_id'].nunique()} cases\"\n                )\n            except Exception as e:\n                print(f\"Failed to load {fn}: {e}\")\n    return datasets\n\n\n# --------- Prefix building and split ----------\ndef build_prefix_dataset(df, max_prefix_len=10, min_prefix_len=1):\n    df = df.copy()\n    if \"lifecycle\" in df.columns:\n        mask = df[\"lifecycle\"].astype(str).str.lower().eq(\"complete\")\n        if mask.any():\n            df = df[mask]\n    df = df.sort_values([\"case_id\", \"timestamp\"])\n    acts = df[\"activity\"].astype(str).unique().tolist()\n    act2id = {a: i + 1 for i, a in enumerate(sorted(acts))}\n    id2act = {i: a for a, i in act2id.items()}\n    pad_id = 0\n    samples = []\n    for cid, g in df.groupby(\"case_id\"):\n        g = g.sort_values(\"timestamp\")\n        if len(g) < 2:\n            continue\n        g_ts = pd.to_datetime(g[\"timestamp\"], utc=True)\n        ts = (g_ts.astype(\"int64\") // 10**9).to_numpy(np.int64)\n        acts_ids = np.array(\n            [act2id[a] for a in g[\"activity\"].astype(str)], dtype=np.int64\n        )\n        hours = (g_ts.dt.hour.to_numpy(dtype=float) / 23.0).astype(np.float32)\n        weekdays = (g_ts.dt.weekday.to_numpy(dtype=float) / 6.0).astype(np.float32)\n        working = (\n            (g_ts.dt.weekday.to_numpy() < 5)\n            & (g_ts.dt.hour.to_numpy() >= 8)\n            & (g_ts.dt.hour.to_numpy() <= 17)\n        ).astype(np.float32)\n        deltas = np.diff(ts, prepend=ts[0]).astype(np.float32)\n        since_start = (ts - ts[0]).astype(np.float32)\n        feats = np.stack(\n            [deltas, since_start, hours, weekdays, working], axis=1\n        ).astype(np.float32)\n        T = len(acts_ids)\n        max_k = min(max_prefix_len, T - 1)\n        for k in range(min_prefix_len, max_k + 1):\n            samples.append(\n                {\n                    \"case_id\": cid,\n                    \"seq_acts\": acts_ids[:k].tolist(),\n                    \"seq_feats\": feats[:k].copy(),\n                    \"target\": int(acts_ids[k]),\n                    \"prefix_len\": k,\n                }\n            )\n    if len(samples) == 0:\n        return samples, act2id, id2act, pad_id\n    all_feats = np.concatenate(\n        [s[\"seq_feats\"] for s in samples if len(s[\"seq_feats\"]) > 0], axis=0\n    )\n    for s in samples:\n        pass  # initial no norm; will norm on train split\n    return samples, act2id, id2act, pad_id\n\n\ndef time_based_split(df, train_frac=0.7, val_frac=0.15):\n    starts = (\n        df.sort_values(\"timestamp\").groupby(\"case_id\")[\"timestamp\"].min().reset_index()\n    )\n    starts = starts.sort_values(\"timestamp\").reset_index(drop=True)\n    n = len(starts)\n    n_train = int(n * train_frac)\n    n_val = int(n * val_frac)\n    train_cases = set(starts.iloc[:n_train][\"case_id\"])\n    val_cases = set(starts.iloc[n_train : n_train + n_val][\"case_id\"])\n    test_cases = set(starts.iloc[n_train + n_val :][\"case_id\"])\n    return train_cases, val_cases, test_cases\n\n\nclass PrefixDataset(Dataset):\n    def __init__(self, samples, pad_id, max_len=10, num_cont=5):\n        self.samples = samples\n        self.pad_id = pad_id\n        self.max_len = max_len\n        self.num_cont = num_cont\n\n    def __len__(self):\n        return len(self.samples)\n\n    def __getitem__(self, idx):\n        s = self.samples[idx]\n        seq = s[\"seq_acts\"][-self.max_len :]\n        feats = s[\"seq_feats\"][-self.max_len :]\n        L = len(seq)\n        pad_len = self.max_len - L\n        seq_pad = [self.pad_id] * pad_len + seq\n        feats_pad = np.vstack(\n            [\n                np.zeros((pad_len, self.num_cont), dtype=np.float32),\n                feats.astype(np.float32),\n            ]\n        )\n        attn = np.array([0] * pad_len + [1] * L, dtype=np.float32)\n        return {\n            \"acts\": torch.tensor(seq_pad).long(),\n            \"feats\": torch.tensor(feats_pad).float(),\n            \"mask\": torch.tensor(attn).float(),\n            \"y\": torch.tensor(s[\"target\"]).long(),\n            \"prefix_len\": L,\n        }\n\n\nclass LSTMBaseline(nn.Module):\n    def __init__(self, vocab_size, emb_dim=64, cont_dim=5, hidden=128, pad_idx=0):\n        super().__init__()\n        self.emb = nn.Embedding(vocab_size + 1, emb_dim, padding_idx=pad_idx)\n        self.lstm = nn.LSTM(\n            input_size=emb_dim + cont_dim, hidden_size=hidden, batch_first=True\n        )\n        self.dropout = nn.Dropout(0.2)\n        self.fc = nn.Linear(hidden, vocab_size + 1)\n\n    def forward(self, acts, feats, mask):\n        x = self.emb(acts)\n        x = torch.cat([x, feats], dim=-1)\n        out, (h, c) = self.lstm(x)\n        h = self.dropout(h[-1])\n        return self.fc(h)\n\n\ndef collate_fn(batch):\n    out = {\n        k: (\n            torch.stack([b[k] for b in batch], 0)\n            if isinstance(batch[0][k], torch.Tensor)\n            else [b[k] for b in batch]\n        )\n        for k in batch[0].keys()\n    }\n    return out\n\n\ndef evaluate(model, loader, criterion, device, num_classes, pad_idx):\n    model.eval()\n    total_loss = 0.0\n    ys = []\n    yhat = []\n    probs_list = []\n    n = 0\n    top3_correct = 0\n    pref_lens = []\n    top3_flags = []\n    with torch.no_grad():\n        for batch in loader:\n            batch = {\n                k: v.to(device) if isinstance(v, torch.Tensor) else v\n                for k, v in batch.items()\n            }\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = criterion(logits, batch[\"y\"])\n            total_loss += loss.item() * logits.size(0)\n            probs = torch.softmax(logits, dim=1)\n            top1 = torch.argmax(probs, dim=1)\n            k_val = min(3, probs.size(1))\n            _, topk = torch.topk(probs, k=k_val, dim=1)\n            y = batch[\"y\"]\n            ys.extend(y.detach().cpu().tolist())\n            yhat.extend(top1.detach().cpu().tolist())\n            probs_list.append(probs.detach().cpu().numpy())\n            for i in range(y.size(0)):\n                flag = int(y[i].item() in topk[i].detach().cpu().tolist())\n                top3_correct += flag\n                top3_flags.append(flag)\n                pref_lens.append(int(batch[\"prefix_len\"][i].item()))\n            n += y.size(0)\n    avg_loss = total_loss / max(1, n)\n    y_true = np.array(ys)\n    y_pred = np.array(yhat)\n    acc = float(accuracy_score(y_true, y_pred)) if len(y_true) > 0 else 0.0\n    try:\n        f1 = float(f1_score(y_true, y_pred, average=\"macro\"))\n    except:\n        f1 = 0.0\n    top3 = float(top3_correct / max(1, n))\n    probs_concat = (\n        np.concatenate(probs_list, axis=0)\n        if len(probs_list) > 0\n        else np.zeros((0, num_classes + 1))\n    )\n    return (\n        avg_loss,\n        acc,\n        f1,\n        top3,\n        y_true,\n        y_pred,\n        probs_concat,\n        np.array(pref_lens),\n        np.array(top3_flags),\n    )\n\n\ndef train_on_dataset(\n    name, df, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n):\n    print(f\"\\n=== Dataset: {name} ===\")\n    train_cases, val_cases, test_cases = time_based_split(df, 0.7, 0.15)\n    samples_all, act2id, id2act, pad_id = build_prefix_dataset(\n        df, max_prefix_len=max_prefix_len\n    )\n    s_train = [s for s in samples_all if s[\"case_id\"] in train_cases]\n    s_val = [s for s in samples_all if s[\"case_id\"] in val_cases]\n    s_test = [s for s in samples_all if s[\"case_id\"] in test_cases]\n    # normalize time features on train\n    if len(s_train) > 0:\n        feats = np.concatenate(\n            [s[\"seq_feats\"] for s in s_train if len(s[\"seq_feats\"]) > 0], axis=0\n        )\n        dt_mean, dt_std = feats[:, 0].mean(), feats[:, 0].std() + 1e-6\n        ss_mean, ss_std = feats[:, 1].mean(), feats[:, 1].std() + 1e-6\n\n        def norm(samples):\n            for s in samples:\n                if s[\"seq_feats\"].shape[0] > 0:\n                    s[\"seq_feats\"][:, 0] = (s[\"seq_feats\"][:, 0] - dt_mean) / dt_std\n                    s[\"seq_feats\"][:, 1] = (s[\"seq_feats\"][:, 1] - ss_mean) / ss_std\n\n        norm(s_train)\n        norm(s_val)\n        norm(s_test)\n    print(\n        f\"Samples train/val/test: {len(s_train)}/{len(s_val)}/{len(s_test)}; vocab={len(act2id)}\"\n    )\n    if len(s_train) == 0 or len(act2id) < 2:\n        print(\"Insufficient data; skipping.\")\n        return None\n    ds_tr = PrefixDataset(s_train, pad_id, max_prefix_len, 5)\n    ds_va = PrefixDataset(s_val, pad_id, max_prefix_len, 5)\n    ds_te = PrefixDataset(s_test, pad_id, max_prefix_len, 5)\n    dl_tr = DataLoader(\n        ds_tr, batch_size=batch_size, shuffle=True, collate_fn=collate_fn\n    )\n    dl_va = DataLoader(\n        ds_va, batch_size=batch_size, shuffle=False, collate_fn=collate_fn\n    )\n    dl_te = DataLoader(\n        ds_te, batch_size=batch_size, shuffle=False, collate_fn=collate_fn\n    )\n    model = LSTMBaseline(\n        vocab_size=len(act2id), emb_dim=64, cont_dim=5, hidden=128, pad_idx=pad_id\n    ).to(device)\n    crit = nn.CrossEntropyLoss().to(device)\n    opt = torch.optim.Adam(model.parameters(), lr=lr)\n    best_top3 = -1.0\n    best_state = None\n    history = {\"train_loss\": [], \"val_loss\": [], \"val_top3\": []}\n    for ep in range(1, max_epochs + 1):\n        model.train()\n        tot = 0\n        run_loss = 0.0\n        for batch in dl_tr:\n            batch = {\n                k: v.to(device) if isinstance(v, torch.Tensor) else v\n                for k, v in batch.items()\n            }\n            opt.zero_grad()\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = crit(logits, batch[\"y\"])\n            loss.backward()\n            opt.step()\n            run_loss += loss.item() * logits.size(0)\n            tot += logits.size(0)\n        tr_loss = run_loss / max(1, tot)\n        va_loss, va_acc, va_f1, va_top3, *_ = evaluate(\n            model, dl_va, crit, device, len(act2id), pad_id\n        )\n        print(\n            f\"Epoch {ep}: val_loss={va_loss:.4f} acc={va_acc:.4f} f1={va_f1:.4f} top3={va_top3:.4f}\"\n        )\n        history[\"train_loss\"].append(tr_loss)\n        history[\"val_loss\"].append(va_loss)\n        history[\"val_top3\"].append(va_top3)\n        if va_top3 > best_top3:\n            best_top3 = va_top3\n            best_state = {\n                k: v.detach().cpu().clone() for k, v in model.state_dict().items()\n            }\n    if best_state is not None:\n        model.load_state_dict(best_state)\n        model.to(device)\n    tr_loss, tr_acc, tr_f1, tr_top3, *_ = evaluate(\n        model, dl_tr, crit, device, len(act2id), pad_id\n    )\n    te_loss, te_acc, te_f1, te_top3, y_true, y_pred, probs, pref_lens, top3_flags = (\n        evaluate(model, dl_te, crit, device, len(act2id), pad_id)\n    )\n    print(\n        f\"[{name}] Test: loss={te_loss:.4f} acc={te_acc:.4f} f1={te_f1:.4f} top3={te_top3:.4f}\"\n    )\n    # package experiment data\n    exp = {\n        \"metrics\": {\n            \"train\": [\n                (\n                    \"final\",\n                    {\n                        \"loss\": tr_loss,\n                        \"acc\": tr_acc,\n                        \"macro_f1\": tr_f1,\n                        \"top3\": tr_top3,\n                    },\n                )\n            ],\n            \"val\": [],\n            \"test\": [\n                (\n                    \"final\",\n                    {\n                        \"loss\": te_loss,\n                        \"acc\": te_acc,\n                        \"macro_f1\": te_f1,\n                        \"top3\": te_top3,\n                    },\n                )\n            ],\n        },\n        \"losses\": {\n            \"train\": list(enumerate(history[\"train_loss\"], start=1)),\n            \"val\": list(enumerate(history[\"val_loss\"], start=1)),\n        },\n        \"predictions\": y_pred.tolist(),\n        \"ground_truth\": y_true.tolist(),\n        \"epochs\": list(range(1, len(history[\"train_loss\"]) + 1)),\n        \"probs\": probs,\n        \"prefix_lens\": pref_lens.tolist(),\n        \"top3_flags\": top3_flags.tolist(),\n        \"act2id\": act2id,\n    }\n    # plots for this dataset\n    try:\n        plt.figure()\n        plt.plot(history[\"train_loss\"], label=\"train\")\n        plt.plot(history[\"val_loss\"], label=\"val\")\n        plt.legend()\n        plt.title(f\"Loss Curves - {name}\")\n        plt.xlabel(\"Epoch\")\n        plt.ylabel(\"Loss\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"{name}_loss_curves.png\"))\n        plt.close()\n    except Exception as e:\n        print(f\"Error creating loss curves for {name}: {e}\")\n        plt.close()\n    try:\n        cm = confusion_matrix(y_true, y_pred)\n        plt.figure(figsize=(5, 4))\n        plt.imshow(cm, aspect=\"auto\", cmap=\"Blues\")\n        plt.colorbar()\n        plt.title(f\"Confusion Matrix (Test) - {name}\\nNext-activity\")\n        plt.xlabel(\"Predicted\")\n        plt.ylabel(\"True\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"{name}_confusion_matrix.png\"))\n        plt.close()\n    except Exception as e:\n        print(f\"Error creating confusion matrix for {name}: {e}\")\n        plt.close()\n    try:\n        # Top-3 accuracy vs prefix length\n        if len(pref_lens) > 0:\n            d = defaultdict(list)\n            for L, flag in zip(pref_lens, top3_flags):\n                d[int(L)].append(int(flag))\n            xs = sorted(d.keys())\n            ys = [np.mean(d[k]) for k in xs]\n            plt.figure()\n            plt.plot(xs, ys, marker=\"o\")\n            plt.title(f\"Top-3 Accuracy vs Prefix Length - {name}\\nNext-activity\")\n            plt.xlabel(\"Prefix Length\")\n            plt.ylabel(\"Top-3 Accuracy\")\n            plt.tight_layout()\n            plt.savefig(os.path.join(working_dir, f\"{name}_top3_vs_prefixlen.png\"))\n            plt.close()\n    except Exception as e:\n        print(f\"Error creating Top-3 vs prefix length for {name}: {e}\")\n        plt.close()\n    try:\n        # Macro PR curve (one-vs-rest); may be coarse due to many classes\n        if probs.shape[0] > 0:\n            classes = np.unique(y_true)\n            Y = label_binarize(y_true, classes=range(probs.shape[1]))\n            # only keep columns present in classes to avoid PAD\n            present = [c for c in classes]\n            if len(present) > 1:\n                precisions = []\n                recalls = []\n                aps = []\n                for c in present:\n                    p, r, _ = precision_recall_curve(Y[:, c], probs[:, c])\n                    ap = average_precision_score(Y[:, c], probs[:, c])\n                    precisions.append(\n                        np.interp(np.linspace(0, 1, 101), r[::-1], p[::-1])\n                    )\n                    recalls.append(np.linspace(0, 1, 101))\n                    aps.append(ap)\n                macro_p = np.mean(np.stack(precisions, 0), 0)\n                macro_r = np.linspace(0, 1, 101)\n                plt.figure()\n                plt.plot(macro_r, macro_p, label=f\"Macro-PR (mAP={np.mean(aps):.3f})\")\n                plt.title(f\"Macro Precision-Recall (Test) - {name}\\nNext-activity\")\n                plt.xlabel(\"Recall\")\n                plt.ylabel(\"Precision\")\n                plt.legend()\n                plt.tight_layout()\n                plt.savefig(os.path.join(working_dir, f\"{name}_macro_pr.png\"))\n                plt.close()\n    except Exception as e:\n        print(f\"Error creating PR curve for {name}: {e}\")\n        plt.close()\n    return name, exp\n\n\ndef main():\n    datasets = load_xes_folder(data_dir=os.path.join(os.getcwd(), \"data\"))\n    experiment_data = {}\n    for name, df in datasets.items():\n        # optional cap earliest 5000 cases\n        try:\n            starts = (\n                df.sort_values(\"timestamp\")\n                .groupby(\"case_id\")[\"timestamp\"]\n                .min()\n                .reset_index()\n            )\n            if len(starts) > 5000:\n                keep = set(starts.iloc[:5000][\"case_id\"])\n                df = df[df[\"case_id\"].isin(keep)].copy()\n        except:\n            pass\n        res = train_on_dataset(\n            name, df, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n        )\n        if res is not None:\n            k, exp = res\n            experiment_data[k] = exp\n    # Save experiment data\n    np.save(os.path.join(working_dir, \"experiment_data.npy\"), experiment_data)\n    # Print evaluation metrics\n    for k, v in experiment_data.items():\n        test_metrics = dict(v[\"metrics\"][\"test\"][0][1])\n        print(\n            f\"{k} | Test acc={test_metrics['acc']:.4f} macro_f1={test_metrics['macro_f1']:.4f} top3={test_metrics['top3']:.4f} loss={test_metrics['loss']:.4f}\"\n        )\n\n    # Secondary plotting pass strictly from experiment_data.npy (as required)\n    try:\n        experiment_data_loaded = np.load(\n            os.path.join(working_dir, \"experiment_data.npy\"), allow_pickle=True\n        ).item()\n    except Exception as e:\n        print(f\"Error loading experiment data: {e}\")\n        experiment_data_loaded = {}\n    for name, ed in experiment_data_loaded.items():\n        try:\n            # re-plot loss curves from saved data\n            plt.figure()\n            tl = [y for (_, y) in ed.get(\"losses\", {}).get(\"train\", [])]\n            vl = [y for (_, y) in ed.get(\"losses\", {}).get(\"val\", [])]\n            if len(tl) > 0:\n                plt.plot(tl, label=\"train\")\n            if len(vl) > 0:\n                plt.plot(vl, label=\"val\")\n            plt.legend()\n            plt.title(f\"Loss Curves - {name}\\nNext-activity\")\n            plt.xlabel(\"Epoch\")\n            plt.ylabel(\"Loss\")\n            plt.tight_layout()\n            plt.savefig(os.path.join(working_dir, f\"{name}_loss_curves_reload.png\"))\n            plt.close()\n        except Exception as e:\n            print(f\"Error creating plot1: {e}\")\n            plt.close()\n        try:\n            # confusion matrix from predictions and ground truth\n            y_true = ed.get(\"ground_truth\", [])\n            y_pred = ed.get(\"predictions\", [])\n            if len(y_true) > 0 and len(y_pred) > 0:\n                cm = confusion_matrix(y_true, y_pred)\n                plt.figure(figsize=(5, 4))\n                plt.imshow(cm, aspect=\"auto\", cmap=\"Blues\")\n                plt.colorbar()\n                plt.title(f\"Confusion Matrix (Test) - {name}\\nNext-activity\")\n                plt.xlabel(\"Predicted\")\n                plt.ylabel(\"True\")\n                plt.tight_layout()\n                plt.savefig(\n                    os.path.join(working_dir, f\"{name}_confusion_matrix_reload.png\")\n                )\n                plt.close()\n        except Exception as e:\n            print(f\"Error creating plot2: {e}\")\n            plt.close()\n        try:\n            # Top-3 vs prefix length if present\n            pref = ed.get(\"prefix_lens\", [])\n            flags = ed.get(\"top3_flags\", [])\n            if len(pref) > 0 and len(flags) > 0:\n                d = defaultdict(list)\n                for L, f in zip(pref, flags):\n                    d[int(L)].append(int(f))\n                xs = sorted(d.keys())\n                ys = [float(np.mean(d[x])) for x in xs]\n                plt.figure()\n                plt.plot(xs, ys, marker=\"o\")\n                plt.title(f\"Top-3 Accuracy vs Prefix Length - {name}\\nNext-activity\")\n                plt.xlabel(\"Prefix Length\")\n                plt.ylabel(\"Top-3 Accuracy\")\n                plt.tight_layout()\n                plt.savefig(\n                    os.path.join(working_dir, f\"{name}_top3_vs_prefixlen_reload.png\")\n                )\n                plt.close()\n        except Exception as e:\n            print(f\"Error creating plot3: {e}\")\n            plt.close()\n        try:\n            # Macro PR curve if probs available\n            probs = np.array(ed.get(\"probs\", []))\n            y_true = ed.get(\"ground_truth\", [])\n            if probs.size > 0 and len(y_true) > 0:\n                classes = sorted(set(y_true))\n                Y = label_binarize(np.array(y_true), classes=range(probs.shape[1]))\n                present = classes\n                if len(present) > 1:\n                    precisions = []\n                    aps = []\n                    grid = np.linspace(0, 1, 101)\n                    for c in present:\n                        p, r, _ = precision_recall_curve(Y[:, c], probs[:, c])\n                        precisions.append(np.interp(grid, r[::-1], p[::-1]))\n                        aps.append(average_precision_score(Y[:, c], probs[:, c]))\n                    macro_p = np.mean(np.stack(precisions, 0), 0)\n                    plt.figure()\n                    plt.plot(grid, macro_p, label=f\"mAP={np.mean(aps):.3f}\")\n                    plt.title(f\"Macro Precision-Recall (Test) - {name}\\nNext-activity\")\n                    plt.xlabel(\"Recall\")\n                    plt.ylabel(\"Precision\")\n                    plt.legend()\n                    plt.tight_layout()\n                    plt.savefig(\n                        os.path.join(working_dir, f\"{name}_macro_pr_reload.png\")\n                    )\n                    plt.close()\n        except Exception as e:\n            print(f\"Error creating plot4: {e}\")\n            plt.close()\n\n\nif __name__ == \"__main__\":\n    main()\n",
    "plot_plan": "We tackle next-activity prediction from local XES logs using pm4py, building time-ordered prefixes and splitting cases by start time into train/val/test. Activities are tokenized and simple temporal/calendar features are added; a small LSTM consumes past activities plus features to predict the next activity. We train for a few epochs, select the best model by validation Top-3 accuracy, then evaluate on test with accuracy, macro-F1, Top-3, and confusion matrix; we also keep per-sample prefix lengths to compute Top-3 vs prefix length. We aggregate results into an experiment_data object and save it. Finally, we generate standard plots only from the saved experiment_data: loss curves, confusion matrix, macro-PR if probabilities exist, and Top-3 vs prefix length. We ensure each plot is guarded with try-except and saved to working_dir, closing figures after saving. The code uses pm4py to load any local .xes files, avoids external datasets, performs time-based splits, and reports BPM metrics.",
    "plot_analyses": [
      {
        "analysis": "The validation top-3 accuracy for the BPI2017 dataset shows a consistent increase over the epochs, indicating that the model is effectively learning to predict the top-3 next activities over time. The accuracy surpasses 0.9925, which suggests strong predictive performance.",
        "plot_path": "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/val_top3_BPI2017.png"
      },
      {
        "analysis": "The validation top-3 accuracy for the BPI2012 dataset starts strong, reaching above 0.9860 early in training, but shows a slight decline towards the end of the epochs. This could indicate overfitting or a plateau in learning efficiency, requiring potential adjustments in model complexity or training duration.",
        "plot_path": "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/val_top3_BPI2012.png"
      },
      {
        "analysis": "The loss curves for the ROAD dataset show a clear and rapid decline in both training and validation loss. The validation loss stabilizes close to the training loss, indicating good generalization without significant overfitting. The model appears to be well-calibrated for this dataset.",
        "plot_path": "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/loss_curves_ROAD.png"
      },
      {
        "analysis": "The loss curves for the BPI2017 dataset demonstrate a sharp decline in both training and validation loss, with the validation loss closely following the training loss. This suggests effective learning and good model generalization with minimal overfitting.",
        "plot_path": "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/loss_curves_BPI2017.png"
      },
      {
        "analysis": "The validation top-3 accuracy for the ROAD dataset reaches a high value early on and remains stable throughout the epochs, indicating strong model performance and consistency in predicting the top-3 next activities. The accuracy remains nearly constant at around 0.997, reflecting excellent predictive capabilities.",
        "plot_path": "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/val_top3_ROAD.png"
      },
      {
        "analysis": "The loss curves for the BPI2012 dataset show a significant decrease in both training and validation loss with epochs, indicating effective learning. The validation loss is slightly higher than the training loss towards the end, suggesting minor overfitting that may be rectified with techniques such as dropout or regularization.",
        "plot_path": "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/loss_curves_BPI2012.png"
      }
    ],
    "plot_paths": [
      "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/val_top3_BPI2017.png",
      "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/val_top3_BPI2012.png",
      "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/loss_curves_ROAD.png",
      "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/loss_curves_BPI2017.png",
      "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/val_top3_ROAD.png",
      "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/loss_curves_BPI2012.png"
    ],
    "vlm_feedback_summary": "The experimental results demonstrate strong predictive performance across different datasets, particularly in top-3 accuracy metrics. While the ROAD dataset shows excellent consistency and high accuracy, the BPI2012 dataset indicates potential overfitting concerns. Loss curves across datasets suggest good generalization capabilities, with minor overfitting observed in the BPI2012 dataset. Further refinement may be needed in model tuning and complexity to address these issues.",
    "exp_results_dir": "experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087",
    "exp_results_npy_files": [
      "experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/cm_BPI2012.npy",
      "experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/cm_ROAD.npy",
      "experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/experiment_data.npy",
      "experiment_results/experiment_726b2721d45c4800b9381c5d265fefcb_proc_332087/cm_BPI2017.npy"
    ]
  },
  "best node with different seeds": [
    {
      "overall_plan": "Establish and stabilize a baseline Predictive Process Monitoring pipeline centered on an LSTM model by fixing critical data handling issues and maintaining the existing experimental design. The parent plan resolves a crash stemming from pandas Series integer indexing in per-case timestamps by converting timestamps to dense numpy arrays for positional indexing and constrains the prefix generation loop to T-1 to avoid off-by-one errors and ensure the existence of a next event. The remainder of the pipeline remains intact: time-based splitting by case start, prefix feature construction, training a simple LSTM baseline, computing required BPM/PPM metrics, and saving results. The current plan is a seed node and does not add new methodological changes, so the overarching objective continues to be a reliable, crash-free baseline suitable for consistent evaluation and future extensions.",
      "analysis": "Overall run completed successfully with valid time-based splits, per-log evaluation on BPI 2012/2017 and ROAD, and required next-activity metrics (acc/macro-F1/top-3). However, there is a latent bug in build_prefix_dataset regarding lifecycle filtering fallback: df is filtered to lifecycle=='complete' and if that results in an empty dataframe, the fallback mistakenly does df = df.copy() (copying the empty df) and then sorts it, thereby losing the original unfiltered events. This would silently produce zero samples on logs lacking explicit 'complete' transitions. Fix: preserve the original dataframe before filtering and revert to it when the filtered result is empty, e.g., df_orig = df.copy(); df_f = df[df['lifecycle'].astype(str).str.lower().eq('complete')]; df = df_f if len(df_f)>0 else df_orig; then sort by ['case_id','timestamp']. Additionally, for stricter no-leakage hygiene, build the activity vocabulary and any feature normalization strictly on the train split (the code already re-normalizes by train stats, but vocab is constructed on all data), although this is not a runtime bug. All other components (loss, target indexing, top-3 computation, saving artifacts) appear consistent with the output logs.",
      "metric": {
        "value": {
          "metric_names": [
            {
              "metric_name": "train loss",
              "lower_is_better": true,
              "description": "Training loss at the end of training",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.5148,
                  "best_value": 0.5148
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.3607,
                  "best_value": 0.3607
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.4662,
                  "best_value": 0.4662
                }
              ]
            },
            {
              "metric_name": "train accuracy",
              "lower_is_better": false,
              "description": "Training accuracy at the end of training",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.7777,
                  "best_value": 0.7777
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.8422,
                  "best_value": 0.8422
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.7894,
                  "best_value": 0.7894
                }
              ]
            },
            {
              "metric_name": "train F1 score",
              "lower_is_better": false,
              "description": "Training F1 score at the end of training",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.5609,
                  "best_value": 0.5609
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.5721,
                  "best_value": 0.5721
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.5395,
                  "best_value": 0.5395
                }
              ]
            },
            {
              "metric_name": "train top-3 accuracy",
              "lower_is_better": false,
              "description": "Training top-3 accuracy at the end of training",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.9868,
                  "best_value": 0.9868
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.9941,
                  "best_value": 0.9941
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.9986,
                  "best_value": 0.9986
                }
              ]
            },
            {
              "metric_name": "validation loss",
              "lower_is_better": true,
              "description": "Validation loss at the end of training",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.5073,
                  "best_value": 0.5073
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.3756,
                  "best_value": 0.3756
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.4274,
                  "best_value": 0.4274
                }
              ]
            },
            {
              "metric_name": "validation accuracy",
              "lower_is_better": false,
              "description": "Validation accuracy at the end of training",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.7639,
                  "best_value": 0.7639
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.8405,
                  "best_value": 0.8405
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.8122,
                  "best_value": 0.8122
                }
              ]
            },
            {
              "metric_name": "validation F1 score",
              "lower_is_better": false,
              "description": "Validation F1 score at the end of training",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.6007,
                  "best_value": 0.6007
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.618,
                  "best_value": 0.618
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.6664,
                  "best_value": 0.6664
                }
              ]
            },
            {
              "metric_name": "validation top-3 accuracy",
              "lower_is_better": false,
              "description": "Validation top-3 accuracy at the end of training",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.9861,
                  "best_value": 0.9861
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.9928,
                  "best_value": 0.9928
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.9969,
                  "best_value": 0.9969
                }
              ]
            },
            {
              "metric_name": "test loss",
              "lower_is_better": true,
              "description": "Test loss at evaluation time",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.5355,
                  "best_value": 0.5355
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.3877,
                  "best_value": 0.3877
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.4833,
                  "best_value": 0.4833
                }
              ]
            },
            {
              "metric_name": "test accuracy",
              "lower_is_better": false,
              "description": "Test accuracy at evaluation time",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.7569,
                  "best_value": 0.7569
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.8332,
                  "best_value": 0.8332
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.802,
                  "best_value": 0.802
                }
              ]
            },
            {
              "metric_name": "test F1 score",
              "lower_is_better": false,
              "description": "Test F1 score at evaluation time",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.5872,
                  "best_value": 0.5872
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.571,
                  "best_value": 0.571
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.474,
                  "best_value": 0.474
                }
              ]
            },
            {
              "metric_name": "test top-3 accuracy",
              "lower_is_better": false,
              "description": "Test top-3 accuracy at evaluation time",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.9874,
                  "best_value": 0.9874
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.9906,
                  "best_value": 0.9906
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.9936,
                  "best_value": 0.9936
                }
              ]
            }
          ]
        },
        "maximize": null,
        "name": null,
        "description": null
      },
      "code": "# Set random seed\nimport random\nimport numpy as np\nimport torch\n\nseed = 0\nrandom.seed(seed)\nnp.random.seed(seed)\ntorch.manual_seed(seed)\nif torch.cuda.is_available():\n    torch.cuda.manual_seed(seed)\n\nimport os\n\nworking_dir = os.path.join(os.getcwd(), \"working\")\nos.makedirs(working_dir, exist_ok=True)\n\nimport numpy as np\nimport pandas as pd\nimport torch\nfrom torch import nn\nfrom torch.utils.data import Dataset, DataLoader\nfrom sklearn.metrics import f1_score, accuracy_score\nimport random\nimport math\nimport matplotlib.pyplot as plt\n\n# Device handling (required)\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\nprint(f\"Using device: {device}\")\n\n# Experiment data structure\nexperiment_data = {\n    \"BPI2012\": {\n        \"metrics\": {\"train\": [], \"val\": [], \"test\": []},\n        \"losses\": {\"train\": [], \"val\": []},\n        \"predictions\": [],\n        \"ground_truth\": [],\n        \"epochs\": [],\n    },\n    \"BPI2017\": {\n        \"metrics\": {\"train\": [], \"val\": [], \"test\": []},\n        \"losses\": {\"train\": [], \"val\": []},\n        \"predictions\": [],\n        \"ground_truth\": [],\n        \"epochs\": [],\n    },\n    \"ROAD\": {\n        \"metrics\": {\"train\": [], \"val\": [], \"test\": []},\n        \"losses\": {\"train\": [], \"val\": []},\n        \"predictions\": [],\n        \"ground_truth\": [],\n        \"epochs\": [],\n    },\n}\n\n# Data loading utilities (use provided helper)\nfrom ai_scientist.ideas.my_research_topic import load_datasets, pick_default_dataset\n\n\n# Reproducibility\ndef set_seed(seed=42):\n    random.seed(seed)\n    np.random.seed(seed)\n    torch.manual_seed(seed)\n    if torch.cuda.is_available():\n        torch.cuda.manual_seed_all(seed)\n\n\nset_seed(42)\n\n\n# Build prefixes\ndef build_prefix_dataset(df, max_prefix_len=10, min_prefix_len=1):\n    # Keep only 'complete' transitions if lifecycle exists\n    df = df.copy()\n    if \"lifecycle\" in df.columns:\n        df = df[df[\"lifecycle\"].astype(str).str.lower().eq(\"complete\")]\n        if len(df) == 0:\n            df = df.copy()  # fallback if empty\n            df = df.sort_values([\"case_id\", \"timestamp\"])\n    df = df.sort_values([\"case_id\", \"timestamp\"])\n    # Build activity vocab\n    acts = df[\"activity\"].astype(str).unique().tolist()\n    act2id = {a: i + 1 for i, a in enumerate(sorted(acts))}  # 0 for PAD\n    id2act = {i: a for a, i in act2id.items()}\n    pad_id = 0\n\n    samples = []\n    for cid, g in df.groupby(\"case_id\"):\n        g = g.sort_values(\"timestamp\")\n        # Convert to numpy arrays for safe positional indexing\n        ts_ns = (\n            pd.to_datetime(g[\"timestamp\"], utc=True).astype(\"int64\").to_numpy()\n        )  # nanoseconds\n        ts = (ts_ns // 10**9).astype(np.int64)  # seconds as numpy array\n        acts_ids = np.array(\n            [act2id[a] for a in g[\"activity\"].astype(str).tolist()], dtype=np.int64\n        )\n        # simple calendar features\n        g_ts = pd.to_datetime(g[\"timestamp\"], utc=True)\n        hours = (g_ts.dt.hour.to_numpy(dtype=float) / 23.0).astype(np.float32)\n        weekdays = (g_ts.dt.weekday.to_numpy(dtype=float) / 6.0).astype(np.float32)\n        working = (\n            (g_ts.dt.weekday.to_numpy() < 5)\n            & (g_ts.dt.hour.to_numpy() >= 8)\n            & (g_ts.dt.hour.to_numpy() <= 17)\n        ).astype(np.float32)\n        # time deltas and since start in seconds\n        deltas = np.diff(ts, prepend=ts[0]).astype(np.float32)\n        since_start = (ts - ts[0]).astype(np.float32)\n        feats = np.stack(\n            [deltas, since_start, hours, weekdays, working], axis=1\n        ).astype(\n            np.float32\n        )  # [T,5]\n        T = len(acts_ids)\n        if T < 2:\n            continue\n        # Generate prefixes of length k (min_prefix_len..min(max_prefix_len, T-1)); target = activity at position k\n        max_k = min(max_prefix_len, T - 1)\n        for k in range(min_prefix_len, max_k + 1):\n            seq_acts = acts_ids[:k].tolist()\n            seq_feats = feats[:k]\n            target = int(acts_ids[k])\n            samples.append(\n                {\n                    \"case_id\": cid,\n                    \"seq_acts\": seq_acts,\n                    \"seq_feats\": seq_feats.copy(),\n                    \"target\": target,\n                    \"last_ts\": int(ts[k - 1]),\n                    \"next_ts\": int(ts[k]),\n                }\n            )\n\n    if len(samples) == 0:\n        return samples, act2id, id2act, pad_id\n\n    # Collect feature normalization stats over all feats (initial; will be recomputed on train split)\n    all_feats = np.concatenate(\n        [s[\"seq_feats\"] for s in samples if len(s[\"seq_feats\"]) > 0], axis=0\n    )\n    dt_mean, dt_std = all_feats[:, 0].mean(), all_feats[:, 0].std() + 1e-6\n    ss_mean, ss_std = all_feats[:, 1].mean(), all_feats[:, 1].std() + 1e-6\n    for s in samples:\n        if s[\"seq_feats\"].shape[0] > 0:\n            s[\"seq_feats\"][:, 0] = (s[\"seq_feats\"][:, 0] - dt_mean) / dt_std\n            s[\"seq_feats\"][:, 1] = (s[\"seq_feats\"][:, 1] - ss_mean) / ss_std\n    return samples, act2id, id2act, pad_id\n\n\n# Time-based split by case start time\ndef time_based_split(df, train_frac=0.7, val_frac=0.15):\n    starts = (\n        df.sort_values(\"timestamp\").groupby(\"case_id\")[\"timestamp\"].min().reset_index()\n    )\n    starts = starts.sort_values(\"timestamp\").reset_index(drop=True)\n    n = len(starts)\n    n_train = int(n * train_frac)\n    n_val = int(n * val_frac)\n    train_cases = set(starts.iloc[:n_train][\"case_id\"])\n    val_cases = set(starts.iloc[n_train : n_train + n_val][\"case_id\"])\n    test_cases = set(starts.iloc[n_train + n_val :][\"case_id\"])\n    return train_cases, val_cases, test_cases\n\n\nclass PrefixDataset(Dataset):\n    def __init__(self, samples, pad_id, max_len=10, num_cont=5):\n        self.samples = samples\n        self.pad_id = pad_id\n        self.max_len = max_len\n        self.num_cont = num_cont\n\n    def __len__(self):\n        return len(self.samples)\n\n    def __getitem__(self, idx):\n        s = self.samples[idx]\n        seq = s[\"seq_acts\"][-self.max_len :]\n        feats = s[\"seq_feats\"][-self.max_len :]\n        L = len(seq)\n        pad_len = self.max_len - L\n        seq_pad = [self.pad_id] * pad_len + seq\n        feats_pad = np.zeros((pad_len, self.num_cont), dtype=np.float32)\n        feats_pad = np.vstack([feats_pad, feats.astype(np.float32)])\n        attn_mask = np.array([0] * pad_len + [1] * L, dtype=np.float32)\n        return {\n            \"acts\": torch.tensor(seq_pad, dtype=torch.long),\n            \"feats\": torch.tensor(feats_pad, dtype=torch.float32),\n            \"mask\": torch.tensor(attn_mask, dtype=torch.float32),\n            \"y\": torch.tensor(s[\"target\"], dtype=torch.long),\n        }\n\n\nclass LSTMBaseline(nn.Module):\n    def __init__(\n        self, vocab_size, emb_dim=64, cont_dim=5, hidden=128, num_layers=1, pad_idx=0\n    ):\n        super().__init__()\n        self.emb = nn.Embedding(vocab_size + 1, emb_dim, padding_idx=pad_idx)\n        self.lstm = nn.LSTM(\n            input_size=emb_dim + cont_dim,\n            hidden_size=hidden,\n            batch_first=True,\n            num_layers=num_layers,\n        )\n        self.dropout = nn.Dropout(0.2)\n        self.fc = nn.Linear(hidden, vocab_size + 1)  # includes PAD index\n        self.pad_idx = pad_idx\n\n    def forward(self, acts, feats, mask):\n        x = self.emb(acts)  # [B,T,emb]\n        x = torch.cat([x, feats], dim=-1)\n        out, (h, c) = self.lstm(x)\n        h_last = h[-1]\n        h_last = self.dropout(h_last)\n        logits = self.fc(h_last)\n        return logits\n\n\ndef collate_fn(batch):\n    keys = batch[0].keys()\n    out = {k: torch.stack([b[k] for b in batch], dim=0) for k in keys}\n    return out\n\n\ndef evaluate(model, loader, criterion, device, num_classes, pad_idx):\n    model.eval()\n    total_loss = 0.0\n    ys, preds_top1, preds_probs = [], [], []\n    top3_correct = 0\n    n_total = 0\n    with torch.no_grad():\n        for batch in loader:\n            batch = {\n                k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)\n            }\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = criterion(logits, batch[\"y\"])\n            total_loss += loss.item() * logits.size(0)\n            probs = torch.softmax(logits, dim=1)\n            top1 = torch.argmax(probs, dim=1)\n            k_val = min(3, probs.size(1))\n            _, topk_idx = torch.topk(probs, k=k_val, dim=1)\n            ys.extend(batch[\"y\"].detach().cpu().tolist())\n            preds_top1.extend(top1.detach().cpu().tolist())\n            preds_probs.append(probs.detach().cpu().numpy())\n            # top-3 correctness\n            for i in range(batch[\"y\"].size(0)):\n                if batch[\"y\"][i].item() in topk_idx[i].detach().cpu().tolist():\n                    top3_correct += 1\n            n_total += batch[\"y\"].size(0)\n    avg_loss = total_loss / max(1, n_total)\n    y_true = np.array(ys)\n    y_pred = np.array(preds_top1)\n    mask = y_true != pad_idx\n    y_true = y_true[mask]\n    y_pred = y_pred[mask]\n    acc = float(accuracy_score(y_true, y_pred)) if len(y_true) > 0 else 0.0\n    try:\n        f1 = float(f1_score(y_true, y_pred, average=\"macro\"))\n    except Exception:\n        f1 = 0.0\n    top3 = float(top3_correct / max(1, n_total))\n    probs_concat = (\n        np.concatenate(preds_probs, axis=0)\n        if len(preds_probs) > 0\n        else np.zeros((0, num_classes + 1))\n    )\n    return avg_loss, acc, f1, top3, y_true, y_pred, probs_concat\n\n\ndef train_one_dataset(\n    name, df, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n):\n    print(f\"\\n=== Dataset: {name} ===\")\n    # Time-based split\n    train_cases, val_cases, test_cases = time_based_split(df, 0.7, 0.15)\n    # Build samples across all to get vocab; we'll re-normalize with train stats\n    samples_all, act2id, id2act, pad_id = build_prefix_dataset(\n        df, max_prefix_len=max_prefix_len\n    )\n    # Filter per split\n    samples_train = [s for s in samples_all if s[\"case_id\"] in train_cases]\n    samples_val = [s for s in samples_all if s[\"case_id\"] in val_cases]\n    samples_test = [s for s in samples_all if s[\"case_id\"] in test_cases]\n    # Recompute normalization using train samples only\n    if len(samples_train) > 0:\n        concat_feats = [\n            s[\"seq_feats\"] for s in samples_train if s[\"seq_feats\"].shape[0] > 0\n        ]\n        if len(concat_feats) > 0:\n            all_feats = np.concatenate(concat_feats, axis=0)\n            dt_mean, dt_std = all_feats[:, 0].mean(), all_feats[:, 0].std() + 1e-6\n            ss_mean, ss_std = all_feats[:, 1].mean(), all_feats[:, 1].std() + 1e-6\n\n            def norm_samples(samples):\n                for s in samples:\n                    if s[\"seq_feats\"].shape[0] > 0:\n                        s[\"seq_feats\"][:, 0] = (s[\"seq_feats\"][:, 0] - dt_mean) / dt_std\n                        s[\"seq_feats\"][:, 1] = (s[\"seq_feats\"][:, 1] - ss_mean) / ss_std\n\n            norm_samples(samples_train)\n            norm_samples(samples_val)\n            norm_samples(samples_test)\n    print(\n        f\"Samples train/val/test: {len(samples_train)}/{len(samples_val)}/{len(samples_test)}; vocab={len(act2id)}\"\n    )\n    if len(samples_train) == 0 or len(act2id) < 2:\n        print(\"Not enough data to train. Skipping.\")\n        return\n    ds_train = PrefixDataset(\n        samples_train, pad_id=pad_id, max_len=max_prefix_len, num_cont=5\n    )\n    ds_val = PrefixDataset(\n        samples_val, pad_id=pad_id, max_len=max_prefix_len, num_cont=5\n    )\n    ds_test = PrefixDataset(\n        samples_test, pad_id=pad_id, max_len=max_prefix_len, num_cont=5\n    )\n    dl_train = DataLoader(\n        ds_train,\n        batch_size=batch_size,\n        shuffle=True,\n        collate_fn=collate_fn,\n        num_workers=0,\n    )\n    dl_val = DataLoader(\n        ds_val,\n        batch_size=batch_size,\n        shuffle=False,\n        collate_fn=collate_fn,\n        num_workers=0,\n    )\n    dl_test = DataLoader(\n        ds_test,\n        batch_size=batch_size,\n        shuffle=False,\n        collate_fn=collate_fn,\n        num_workers=0,\n    )\n\n    # Model\n    model = LSTMBaseline(\n        vocab_size=len(act2id),\n        emb_dim=64,\n        cont_dim=5,\n        hidden=128,\n        num_layers=1,\n        pad_idx=pad_id,\n    ).to(device)\n    criterion = nn.CrossEntropyLoss().to(device)\n    optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n\n    # Training loop\n    best_val_top3 = -1.0\n    best_state = None\n    hist = {\"train_loss\": [], \"val_loss\": [], \"val_top3\": []}\n    for epoch in range(1, max_epochs + 1):\n        model.train()\n        total = 0\n        running_loss = 0.0\n        for batch in dl_train:\n            batch = {\n                k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)\n            }\n            optimizer.zero_grad()\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = criterion(logits, batch[\"y\"])\n            loss.backward()\n            optimizer.step()\n            running_loss += loss.item() * logits.size(0)\n            total += logits.size(0)\n        train_loss = running_loss / max(1, total)\n        val_loss, val_acc, val_f1, val_top3, _, _, _ = evaluate(\n            model, dl_val, criterion, device, len(act2id), pad_id\n        )\n        print(\n            f\"Epoch {epoch}: validation_loss = {val_loss:.4f} | val_acc={val_acc:.4f} | val_f1={val_f1:.4f} | val_top3={val_top3:.4f}\"\n        )\n        hist[\"train_loss\"].append(train_loss)\n        hist[\"val_loss\"].append(val_loss)\n        hist[\"val_top3\"].append(val_top3)\n        experiment_data[name][\"losses\"][\"train\"].append((epoch, train_loss))\n        experiment_data[name][\"losses\"][\"val\"].append((epoch, val_loss))\n        experiment_data[name][\"metrics\"][\"val\"].append(\n            (epoch, {\"acc\": val_acc, \"macro_f1\": val_f1, \"top3\": val_top3})\n        )\n        experiment_data[name][\"epochs\"].append(epoch)\n        if val_top3 > best_val_top3:\n            best_val_top3 = val_top3\n            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}\n\n    # Load best\n    if best_state is not None:\n        model.load_state_dict(best_state)\n        model.to(device)\n\n    # Final eval on train/val/test\n    train_loss, train_acc, train_f1, train_top3, _, _, _ = evaluate(\n        model, dl_train, criterion, device, len(act2id), pad_id\n    )\n    val_loss, val_acc, val_f1, val_top3, _, _, _ = evaluate(\n        model, dl_val, criterion, device, len(act2id), pad_id\n    )\n    test_loss, test_acc, test_f1, test_top3, y_true_t, y_pred_t, probs_t = evaluate(\n        model, dl_test, criterion, device, len(act2id), pad_id\n    )\n    print(\n        f\"[{name}] Train: loss={train_loss:.4f} acc={train_acc:.4f} f1={train_f1:.4f} top3={train_top3:.4f}\"\n    )\n    print(\n        f\"[{name}] Test:  loss={test_loss:.4f} acc={test_acc:.4f} f1={test_f1:.4f} top3={test_top3:.4f}\"\n    )\n\n    # Save metrics\n    experiment_data[name][\"metrics\"][\"train\"].append(\n        (\n            \"final\",\n            {\n                \"loss\": train_loss,\n                \"acc\": train_acc,\n                \"macro_f1\": train_f1,\n                \"top3\": train_top3,\n            },\n        )\n    )\n    experiment_data[name][\"metrics\"][\"val\"].append(\n        (\n            \"final\",\n            {\"loss\": val_loss, \"acc\": val_acc, \"macro_f1\": val_f1, \"top3\": val_top3},\n        )\n    )\n    experiment_data[name][\"metrics\"][\"test\"].append(\n        (\n            \"final\",\n            {\n                \"loss\": test_loss,\n                \"acc\": test_acc,\n                \"macro_f1\": test_f1,\n                \"top3\": test_top3,\n            },\n        )\n    )\n    experiment_data[name][\"predictions\"] = y_pred_t.tolist()\n    experiment_data[name][\"ground_truth\"] = y_true_t.tolist()\n\n    # Plots\n    try:\n        plt.figure()\n        plt.plot(hist[\"train_loss\"], label=\"train_loss\")\n        plt.plot(hist[\"val_loss\"], label=\"val_loss\")\n        plt.legend()\n        plt.title(f\"Loss Curves - {name}\")\n        plt.xlabel(\"Epoch\")\n        plt.ylabel(\"Loss\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"loss_curves_{name}.png\"))\n        plt.close()\n\n        plt.figure()\n        plt.plot(hist[\"val_top3\"], label=\"val_top3\")\n        plt.legend()\n        plt.title(f\"Val Top-3 Acc - {name}\")\n        plt.xlabel(\"Epoch\")\n        plt.ylabel(\"Top-3 Acc\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"val_top3_{name}.png\"))\n        plt.close()\n    except Exception as e:\n        print(f\"[warn] Plotting failed: {e}\")\n\n    # Save confusion matrix-like data (optional)\n    try:\n        from sklearn.metrics import confusion_matrix\n\n        cm = confusion_matrix(y_true_t, y_pred_t)\n        np.save(os.path.join(working_dir, f\"cm_{name}.npy\"), cm)\n    except Exception as e:\n        print(f\"[warn] Confusion matrix failed: {e}\")\n\n\ndef main():\n    datasets = load_datasets()\n    # Loop through loaded datasets; cap to 5000 earliest cases for speed\n    for key, df in datasets.items():\n        try:\n            starts = (\n                df.sort_values(\"timestamp\")\n                .groupby(\"case_id\")[\"timestamp\"]\n                .min()\n                .reset_index()\n            )\n            if len(starts) > 5000:\n                keep_cases = set(starts.iloc[:5000][\"case_id\"])\n                df_small = df[df[\"case_id\"].isin(keep_cases)].copy()\n            else:\n                df_small = df\n        except Exception:\n            df_small = df\n        train_one_dataset(\n            key, df_small, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n        )\n    # Save experiment data\n    np.save(os.path.join(working_dir, \"experiment_data.npy\"), experiment_data)\n    np.savez_compressed(\n        os.path.join(working_dir, \"experiment_data_compressed.npz\"),\n        data=experiment_data,\n    )\n\n\n# Execute immediately\nmain()\n",
      "plot_code": "import matplotlib.pyplot as plt\nimport numpy as np\nimport os\n\nworking_dir = os.path.join(os.getcwd(), \"working\")\nos.makedirs(working_dir, exist_ok=True)\n\nimport pandas as pd\nimport torch\nfrom torch import nn\nfrom torch.utils.data import Dataset, DataLoader\nfrom sklearn.metrics import (\n    f1_score,\n    accuracy_score,\n    confusion_matrix,\n    precision_recall_curve,\n    average_precision_score,\n)\nfrom sklearn.preprocessing import label_binarize\nimport random\nimport math\nfrom collections import defaultdict\n\n# Reproducibility and device\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\nprint(f\"Using device: {device}\")\n\n\ndef set_seed(seed=42):\n    random.seed(seed)\n    np.random.seed(seed)\n    torch.manual_seed(seed)\n    if torch.cuda.is_available():\n        torch.cuda.manual_seed_all(seed)\n\n\nset_seed(42)\n\n\n# --------- Data loading from local XES via pm4py ----------\ndef load_xes_folder(data_dir=\"data\"):\n    datasets = {}\n    try:\n        import pm4py\n    except Exception as e:\n        print(f\"pm4py not available: {e}\")\n        return datasets\n    if not os.path.isdir(data_dir):\n        print(f\"Data directory not found: {data_dir}\")\n        return datasets\n    for fn in os.listdir(data_dir):\n        if fn.lower().endswith(\".xes\") or fn.lower().endswith(\".xes.gz\"):\n            path = os.path.join(data_dir, fn)\n            try:\n                log = pm4py.read_xes(path)\n                df = pm4py.convert_to_dataframe(log)\n                # Standardize columns\n                # pm4py dataframe typically has case:concept:name, concept:name, time:timestamp, lifecycle:transition\n                cols = df.columns\n                case_col = (\n                    \"case:concept:name\"\n                    if \"case:concept:name\" in cols\n                    else (\"case\" if \"case\" in cols else None)\n                )\n                act_col = (\n                    \"concept:name\"\n                    if \"concept:name\" in cols\n                    else (\"activity\" if \"activity\" in cols else None)\n                )\n                ts_col = (\n                    \"time:timestamp\"\n                    if \"time:timestamp\" in cols\n                    else (\"timestamp\" if \"timestamp\" in cols else None)\n                )\n                life_col = (\n                    \"lifecycle:transition\"\n                    if \"lifecycle:transition\" in cols\n                    else (\"lifecycle\" if \"lifecycle\" in cols else None)\n                )\n                if case_col is None or act_col is None or ts_col is None:\n                    print(f\"Missing required columns in {fn}, skipping.\")\n                    continue\n                out = pd.DataFrame(\n                    {\n                        \"case_id\": df[case_col].astype(str).values,\n                        \"activity\": df[act_col].astype(str).values,\n                        \"timestamp\": pd.to_datetime(df[ts_col], utc=True),\n                    }\n                )\n                if life_col is not None:\n                    out[\"lifecycle\"] = df[life_col].astype(str).values\n                name = os.path.splitext(fn)[0]\n                datasets[name] = out\n                print(\n                    f\"Loaded {name}: {len(out)} events, {out['case_id'].nunique()} cases\"\n                )\n            except Exception as e:\n                print(f\"Failed to load {fn}: {e}\")\n    return datasets\n\n\n# --------- Prefix building and split ----------\ndef build_prefix_dataset(df, max_prefix_len=10, min_prefix_len=1):\n    df = df.copy()\n    if \"lifecycle\" in df.columns:\n        mask = df[\"lifecycle\"].astype(str).str.lower().eq(\"complete\")\n        if mask.any():\n            df = df[mask]\n    df = df.sort_values([\"case_id\", \"timestamp\"])\n    acts = df[\"activity\"].astype(str).unique().tolist()\n    act2id = {a: i + 1 for i, a in enumerate(sorted(acts))}\n    id2act = {i: a for a, i in act2id.items()}\n    pad_id = 0\n    samples = []\n    for cid, g in df.groupby(\"case_id\"):\n        g = g.sort_values(\"timestamp\")\n        if len(g) < 2:\n            continue\n        g_ts = pd.to_datetime(g[\"timestamp\"], utc=True)\n        ts = (g_ts.astype(\"int64\") // 10**9).to_numpy(np.int64)\n        acts_ids = np.array(\n            [act2id[a] for a in g[\"activity\"].astype(str)], dtype=np.int64\n        )\n        hours = (g_ts.dt.hour.to_numpy(dtype=float) / 23.0).astype(np.float32)\n        weekdays = (g_ts.dt.weekday.to_numpy(dtype=float) / 6.0).astype(np.float32)\n        working = (\n            (g_ts.dt.weekday.to_numpy() < 5)\n            & (g_ts.dt.hour.to_numpy() >= 8)\n            & (g_ts.dt.hour.to_numpy() <= 17)\n        ).astype(np.float32)\n        deltas = np.diff(ts, prepend=ts[0]).astype(np.float32)\n        since_start = (ts - ts[0]).astype(np.float32)\n        feats = np.stack(\n            [deltas, since_start, hours, weekdays, working], axis=1\n        ).astype(np.float32)\n        T = len(acts_ids)\n        max_k = min(max_prefix_len, T - 1)\n        for k in range(min_prefix_len, max_k + 1):\n            samples.append(\n                {\n                    \"case_id\": cid,\n                    \"seq_acts\": acts_ids[:k].tolist(),\n                    \"seq_feats\": feats[:k].copy(),\n                    \"target\": int(acts_ids[k]),\n                    \"prefix_len\": k,\n                }\n            )\n    if len(samples) == 0:\n        return samples, act2id, id2act, pad_id\n    all_feats = np.concatenate(\n        [s[\"seq_feats\"] for s in samples if len(s[\"seq_feats\"]) > 0], axis=0\n    )\n    for s in samples:\n        pass  # initial no norm; will norm on train split\n    return samples, act2id, id2act, pad_id\n\n\ndef time_based_split(df, train_frac=0.7, val_frac=0.15):\n    starts = (\n        df.sort_values(\"timestamp\").groupby(\"case_id\")[\"timestamp\"].min().reset_index()\n    )\n    starts = starts.sort_values(\"timestamp\").reset_index(drop=True)\n    n = len(starts)\n    n_train = int(n * train_frac)\n    n_val = int(n * val_frac)\n    train_cases = set(starts.iloc[:n_train][\"case_id\"])\n    val_cases = set(starts.iloc[n_train : n_train + n_val][\"case_id\"])\n    test_cases = set(starts.iloc[n_train + n_val :][\"case_id\"])\n    return train_cases, val_cases, test_cases\n\n\nclass PrefixDataset(Dataset):\n    def __init__(self, samples, pad_id, max_len=10, num_cont=5):\n        self.samples = samples\n        self.pad_id = pad_id\n        self.max_len = max_len\n        self.num_cont = num_cont\n\n    def __len__(self):\n        return len(self.samples)\n\n    def __getitem__(self, idx):\n        s = self.samples[idx]\n        seq = s[\"seq_acts\"][-self.max_len :]\n        feats = s[\"seq_feats\"][-self.max_len :]\n        L = len(seq)\n        pad_len = self.max_len - L\n        seq_pad = [self.pad_id] * pad_len + seq\n        feats_pad = np.vstack(\n            [\n                np.zeros((pad_len, self.num_cont), dtype=np.float32),\n                feats.astype(np.float32),\n            ]\n        )\n        attn = np.array([0] * pad_len + [1] * L, dtype=np.float32)\n        return {\n            \"acts\": torch.tensor(seq_pad).long(),\n            \"feats\": torch.tensor(feats_pad).float(),\n            \"mask\": torch.tensor(attn).float(),\n            \"y\": torch.tensor(s[\"target\"]).long(),\n            \"prefix_len\": L,\n        }\n\n\nclass LSTMBaseline(nn.Module):\n    def __init__(self, vocab_size, emb_dim=64, cont_dim=5, hidden=128, pad_idx=0):\n        super().__init__()\n        self.emb = nn.Embedding(vocab_size + 1, emb_dim, padding_idx=pad_idx)\n        self.lstm = nn.LSTM(\n            input_size=emb_dim + cont_dim, hidden_size=hidden, batch_first=True\n        )\n        self.dropout = nn.Dropout(0.2)\n        self.fc = nn.Linear(hidden, vocab_size + 1)\n\n    def forward(self, acts, feats, mask):\n        x = self.emb(acts)\n        x = torch.cat([x, feats], dim=-1)\n        out, (h, c) = self.lstm(x)\n        h = self.dropout(h[-1])\n        return self.fc(h)\n\n\ndef collate_fn(batch):\n    out = {\n        k: (\n            torch.stack([b[k] for b in batch], 0)\n            if isinstance(batch[0][k], torch.Tensor)\n            else [b[k] for b in batch]\n        )\n        for k in batch[0].keys()\n    }\n    return out\n\n\ndef evaluate(model, loader, criterion, device, num_classes, pad_idx):\n    model.eval()\n    total_loss = 0.0\n    ys = []\n    yhat = []\n    probs_list = []\n    n = 0\n    top3_correct = 0\n    pref_lens = []\n    top3_flags = []\n    with torch.no_grad():\n        for batch in loader:\n            batch = {\n                k: v.to(device) if isinstance(v, torch.Tensor) else v\n                for k, v in batch.items()\n            }\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = criterion(logits, batch[\"y\"])\n            total_loss += loss.item() * logits.size(0)\n            probs = torch.softmax(logits, dim=1)\n            top1 = torch.argmax(probs, dim=1)\n            k_val = min(3, probs.size(1))\n            _, topk = torch.topk(probs, k=k_val, dim=1)\n            y = batch[\"y\"]\n            ys.extend(y.detach().cpu().tolist())\n            yhat.extend(top1.detach().cpu().tolist())\n            probs_list.append(probs.detach().cpu().numpy())\n            for i in range(y.size(0)):\n                flag = int(y[i].item() in topk[i].detach().cpu().tolist())\n                top3_correct += flag\n                top3_flags.append(flag)\n                pref_lens.append(int(batch[\"prefix_len\"][i].item()))\n            n += y.size(0)\n    avg_loss = total_loss / max(1, n)\n    y_true = np.array(ys)\n    y_pred = np.array(yhat)\n    acc = float(accuracy_score(y_true, y_pred)) if len(y_true) > 0 else 0.0\n    try:\n        f1 = float(f1_score(y_true, y_pred, average=\"macro\"))\n    except:\n        f1 = 0.0\n    top3 = float(top3_correct / max(1, n))\n    probs_concat = (\n        np.concatenate(probs_list, axis=0)\n        if len(probs_list) > 0\n        else np.zeros((0, num_classes + 1))\n    )\n    return (\n        avg_loss,\n        acc,\n        f1,\n        top3,\n        y_true,\n        y_pred,\n        probs_concat,\n        np.array(pref_lens),\n        np.array(top3_flags),\n    )\n\n\ndef train_on_dataset(\n    name, df, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n):\n    print(f\"\\n=== Dataset: {name} ===\")\n    train_cases, val_cases, test_cases = time_based_split(df, 0.7, 0.15)\n    samples_all, act2id, id2act, pad_id = build_prefix_dataset(\n        df, max_prefix_len=max_prefix_len\n    )\n    s_train = [s for s in samples_all if s[\"case_id\"] in train_cases]\n    s_val = [s for s in samples_all if s[\"case_id\"] in val_cases]\n    s_test = [s for s in samples_all if s[\"case_id\"] in test_cases]\n    # normalize time features on train\n    if len(s_train) > 0:\n        feats = np.concatenate(\n            [s[\"seq_feats\"] for s in s_train if len(s[\"seq_feats\"]) > 0], axis=0\n        )\n        dt_mean, dt_std = feats[:, 0].mean(), feats[:, 0].std() + 1e-6\n        ss_mean, ss_std = feats[:, 1].mean(), feats[:, 1].std() + 1e-6\n\n        def norm(samples):\n            for s in samples:\n                if s[\"seq_feats\"].shape[0] > 0:\n                    s[\"seq_feats\"][:, 0] = (s[\"seq_feats\"][:, 0] - dt_mean) / dt_std\n                    s[\"seq_feats\"][:, 1] = (s[\"seq_feats\"][:, 1] - ss_mean) / ss_std\n\n        norm(s_train)\n        norm(s_val)\n        norm(s_test)\n    print(\n        f\"Samples train/val/test: {len(s_train)}/{len(s_val)}/{len(s_test)}; vocab={len(act2id)}\"\n    )\n    if len(s_train) == 0 or len(act2id) < 2:\n        print(\"Insufficient data; skipping.\")\n        return None\n    ds_tr = PrefixDataset(s_train, pad_id, max_prefix_len, 5)\n    ds_va = PrefixDataset(s_val, pad_id, max_prefix_len, 5)\n    ds_te = PrefixDataset(s_test, pad_id, max_prefix_len, 5)\n    dl_tr = DataLoader(\n        ds_tr, batch_size=batch_size, shuffle=True, collate_fn=collate_fn\n    )\n    dl_va = DataLoader(\n        ds_va, batch_size=batch_size, shuffle=False, collate_fn=collate_fn\n    )\n    dl_te = DataLoader(\n        ds_te, batch_size=batch_size, shuffle=False, collate_fn=collate_fn\n    )\n    model = LSTMBaseline(\n        vocab_size=len(act2id), emb_dim=64, cont_dim=5, hidden=128, pad_idx=pad_id\n    ).to(device)\n    crit = nn.CrossEntropyLoss().to(device)\n    opt = torch.optim.Adam(model.parameters(), lr=lr)\n    best_top3 = -1.0\n    best_state = None\n    history = {\"train_loss\": [], \"val_loss\": [], \"val_top3\": []}\n    for ep in range(1, max_epochs + 1):\n        model.train()\n        tot = 0\n        run_loss = 0.0\n        for batch in dl_tr:\n            batch = {\n                k: v.to(device) if isinstance(v, torch.Tensor) else v\n                for k, v in batch.items()\n            }\n            opt.zero_grad()\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = crit(logits, batch[\"y\"])\n            loss.backward()\n            opt.step()\n            run_loss += loss.item() * logits.size(0)\n            tot += logits.size(0)\n        tr_loss = run_loss / max(1, tot)\n        va_loss, va_acc, va_f1, va_top3, *_ = evaluate(\n            model, dl_va, crit, device, len(act2id), pad_id\n        )\n        print(\n            f\"Epoch {ep}: val_loss={va_loss:.4f} acc={va_acc:.4f} f1={va_f1:.4f} top3={va_top3:.4f}\"\n        )\n        history[\"train_loss\"].append(tr_loss)\n        history[\"val_loss\"].append(va_loss)\n        history[\"val_top3\"].append(va_top3)\n        if va_top3 > best_top3:\n            best_top3 = va_top3\n            best_state = {\n                k: v.detach().cpu().clone() for k, v in model.state_dict().items()\n            }\n    if best_state is not None:\n        model.load_state_dict(best_state)\n        model.to(device)\n    tr_loss, tr_acc, tr_f1, tr_top3, *_ = evaluate(\n        model, dl_tr, crit, device, len(act2id), pad_id\n    )\n    te_loss, te_acc, te_f1, te_top3, y_true, y_pred, probs, pref_lens, top3_flags = (\n        evaluate(model, dl_te, crit, device, len(act2id), pad_id)\n    )\n    print(\n        f\"[{name}] Test: loss={te_loss:.4f} acc={te_acc:.4f} f1={te_f1:.4f} top3={te_top3:.4f}\"\n    )\n    # package experiment data\n    exp = {\n        \"metrics\": {\n            \"train\": [\n                (\n                    \"final\",\n                    {\n                        \"loss\": tr_loss,\n                        \"acc\": tr_acc,\n                        \"macro_f1\": tr_f1,\n                        \"top3\": tr_top3,\n                    },\n                )\n            ],\n            \"val\": [],\n            \"test\": [\n                (\n                    \"final\",\n                    {\n                        \"loss\": te_loss,\n                        \"acc\": te_acc,\n                        \"macro_f1\": te_f1,\n                        \"top3\": te_top3,\n                    },\n                )\n            ],\n        },\n        \"losses\": {\n            \"train\": list(enumerate(history[\"train_loss\"], start=1)),\n            \"val\": list(enumerate(history[\"val_loss\"], start=1)),\n        },\n        \"predictions\": y_pred.tolist(),\n        \"ground_truth\": y_true.tolist(),\n        \"epochs\": list(range(1, len(history[\"train_loss\"]) + 1)),\n        \"probs\": probs,\n        \"prefix_lens\": pref_lens.tolist(),\n        \"top3_flags\": top3_flags.tolist(),\n        \"act2id\": act2id,\n    }\n    # plots for this dataset\n    try:\n        plt.figure()\n        plt.plot(history[\"train_loss\"], label=\"train\")\n        plt.plot(history[\"val_loss\"], label=\"val\")\n        plt.legend()\n        plt.title(f\"Loss Curves - {name}\")\n        plt.xlabel(\"Epoch\")\n        plt.ylabel(\"Loss\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"{name}_loss_curves.png\"))\n        plt.close()\n    except Exception as e:\n        print(f\"Error creating loss curves for {name}: {e}\")\n        plt.close()\n    try:\n        cm = confusion_matrix(y_true, y_pred)\n        plt.figure(figsize=(5, 4))\n        plt.imshow(cm, aspect=\"auto\", cmap=\"Blues\")\n        plt.colorbar()\n        plt.title(f\"Confusion Matrix (Test) - {name}\\nNext-activity\")\n        plt.xlabel(\"Predicted\")\n        plt.ylabel(\"True\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"{name}_confusion_matrix.png\"))\n        plt.close()\n    except Exception as e:\n        print(f\"Error creating confusion matrix for {name}: {e}\")\n        plt.close()\n    try:\n        # Top-3 accuracy vs prefix length\n        if len(pref_lens) > 0:\n            d = defaultdict(list)\n            for L, flag in zip(pref_lens, top3_flags):\n                d[int(L)].append(int(flag))\n            xs = sorted(d.keys())\n            ys = [np.mean(d[k]) for k in xs]\n            plt.figure()\n            plt.plot(xs, ys, marker=\"o\")\n            plt.title(f\"Top-3 Accuracy vs Prefix Length - {name}\\nNext-activity\")\n            plt.xlabel(\"Prefix Length\")\n            plt.ylabel(\"Top-3 Accuracy\")\n            plt.tight_layout()\n            plt.savefig(os.path.join(working_dir, f\"{name}_top3_vs_prefixlen.png\"))\n            plt.close()\n    except Exception as e:\n        print(f\"Error creating Top-3 vs prefix length for {name}: {e}\")\n        plt.close()\n    try:\n        # Macro PR curve (one-vs-rest); may be coarse due to many classes\n        if probs.shape[0] > 0:\n            classes = np.unique(y_true)\n            Y = label_binarize(y_true, classes=range(probs.shape[1]))\n            # only keep columns present in classes to avoid PAD\n            present = [c for c in classes]\n            if len(present) > 1:\n                precisions = []\n                recalls = []\n                aps = []\n                for c in present:\n                    p, r, _ = precision_recall_curve(Y[:, c], probs[:, c])\n                    ap = average_precision_score(Y[:, c], probs[:, c])\n                    precisions.append(\n                        np.interp(np.linspace(0, 1, 101), r[::-1], p[::-1])\n                    )\n                    recalls.append(np.linspace(0, 1, 101))\n                    aps.append(ap)\n                macro_p = np.mean(np.stack(precisions, 0), 0)\n                macro_r = np.linspace(0, 1, 101)\n                plt.figure()\n                plt.plot(macro_r, macro_p, label=f\"Macro-PR (mAP={np.mean(aps):.3f})\")\n                plt.title(f\"Macro Precision-Recall (Test) - {name}\\nNext-activity\")\n                plt.xlabel(\"Recall\")\n                plt.ylabel(\"Precision\")\n                plt.legend()\n                plt.tight_layout()\n                plt.savefig(os.path.join(working_dir, f\"{name}_macro_pr.png\"))\n                plt.close()\n    except Exception as e:\n        print(f\"Error creating PR curve for {name}: {e}\")\n        plt.close()\n    return name, exp\n\n\ndef main():\n    datasets = load_xes_folder(data_dir=os.path.join(os.getcwd(), \"data\"))\n    experiment_data = {}\n    for name, df in datasets.items():\n        # optional cap earliest 5000 cases\n        try:\n            starts = (\n                df.sort_values(\"timestamp\")\n                .groupby(\"case_id\")[\"timestamp\"]\n                .min()\n                .reset_index()\n            )\n            if len(starts) > 5000:\n                keep = set(starts.iloc[:5000][\"case_id\"])\n                df = df[df[\"case_id\"].isin(keep)].copy()\n        except:\n            pass\n        res = train_on_dataset(\n            name, df, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n        )\n        if res is not None:\n            k, exp = res\n            experiment_data[k] = exp\n    # Save experiment data\n    np.save(os.path.join(working_dir, \"experiment_data.npy\"), experiment_data)\n    # Print evaluation metrics\n    for k, v in experiment_data.items():\n        test_metrics = dict(v[\"metrics\"][\"test\"][0][1])\n        print(\n            f\"{k} | Test acc={test_metrics['acc']:.4f} macro_f1={test_metrics['macro_f1']:.4f} top3={test_metrics['top3']:.4f} loss={test_metrics['loss']:.4f}\"\n        )\n\n    # Secondary plotting pass strictly from experiment_data.npy (as required)\n    try:\n        experiment_data_loaded = np.load(\n            os.path.join(working_dir, \"experiment_data.npy\"), allow_pickle=True\n        ).item()\n    except Exception as e:\n        print(f\"Error loading experiment data: {e}\")\n        experiment_data_loaded = {}\n    for name, ed in experiment_data_loaded.items():\n        try:\n            # re-plot loss curves from saved data\n            plt.figure()\n            tl = [y for (_, y) in ed.get(\"losses\", {}).get(\"train\", [])]\n            vl = [y for (_, y) in ed.get(\"losses\", {}).get(\"val\", [])]\n            if len(tl) > 0:\n                plt.plot(tl, label=\"train\")\n            if len(vl) > 0:\n                plt.plot(vl, label=\"val\")\n            plt.legend()\n            plt.title(f\"Loss Curves - {name}\\nNext-activity\")\n            plt.xlabel(\"Epoch\")\n            plt.ylabel(\"Loss\")\n            plt.tight_layout()\n            plt.savefig(os.path.join(working_dir, f\"{name}_loss_curves_reload.png\"))\n            plt.close()\n        except Exception as e:\n            print(f\"Error creating plot1: {e}\")\n            plt.close()\n        try:\n            # confusion matrix from predictions and ground truth\n            y_true = ed.get(\"ground_truth\", [])\n            y_pred = ed.get(\"predictions\", [])\n            if len(y_true) > 0 and len(y_pred) > 0:\n                cm = confusion_matrix(y_true, y_pred)\n                plt.figure(figsize=(5, 4))\n                plt.imshow(cm, aspect=\"auto\", cmap=\"Blues\")\n                plt.colorbar()\n                plt.title(f\"Confusion Matrix (Test) - {name}\\nNext-activity\")\n                plt.xlabel(\"Predicted\")\n                plt.ylabel(\"True\")\n                plt.tight_layout()\n                plt.savefig(\n                    os.path.join(working_dir, f\"{name}_confusion_matrix_reload.png\")\n                )\n                plt.close()\n        except Exception as e:\n            print(f\"Error creating plot2: {e}\")\n            plt.close()\n        try:\n            # Top-3 vs prefix length if present\n            pref = ed.get(\"prefix_lens\", [])\n            flags = ed.get(\"top3_flags\", [])\n            if len(pref) > 0 and len(flags) > 0:\n                d = defaultdict(list)\n                for L, f in zip(pref, flags):\n                    d[int(L)].append(int(f))\n                xs = sorted(d.keys())\n                ys = [float(np.mean(d[x])) for x in xs]\n                plt.figure()\n                plt.plot(xs, ys, marker=\"o\")\n                plt.title(f\"Top-3 Accuracy vs Prefix Length - {name}\\nNext-activity\")\n                plt.xlabel(\"Prefix Length\")\n                plt.ylabel(\"Top-3 Accuracy\")\n                plt.tight_layout()\n                plt.savefig(\n                    os.path.join(working_dir, f\"{name}_top3_vs_prefixlen_reload.png\")\n                )\n                plt.close()\n        except Exception as e:\n            print(f\"Error creating plot3: {e}\")\n            plt.close()\n        try:\n            # Macro PR curve if probs available\n            probs = np.array(ed.get(\"probs\", []))\n            y_true = ed.get(\"ground_truth\", [])\n            if probs.size > 0 and len(y_true) > 0:\n                classes = sorted(set(y_true))\n                Y = label_binarize(np.array(y_true), classes=range(probs.shape[1]))\n                present = classes\n                if len(present) > 1:\n                    precisions = []\n                    aps = []\n                    grid = np.linspace(0, 1, 101)\n                    for c in present:\n                        p, r, _ = precision_recall_curve(Y[:, c], probs[:, c])\n                        precisions.append(np.interp(grid, r[::-1], p[::-1]))\n                        aps.append(average_precision_score(Y[:, c], probs[:, c]))\n                    macro_p = np.mean(np.stack(precisions, 0), 0)\n                    plt.figure()\n                    plt.plot(grid, macro_p, label=f\"mAP={np.mean(aps):.3f}\")\n                    plt.title(f\"Macro Precision-Recall (Test) - {name}\\nNext-activity\")\n                    plt.xlabel(\"Recall\")\n                    plt.ylabel(\"Precision\")\n                    plt.legend()\n                    plt.tight_layout()\n                    plt.savefig(\n                        os.path.join(working_dir, f\"{name}_macro_pr_reload.png\")\n                    )\n                    plt.close()\n        except Exception as e:\n            print(f\"Error creating plot4: {e}\")\n            plt.close()\n\n\nif __name__ == \"__main__\":\n    main()\n",
      "plot_analyses": [],
      "plot_paths": [],
      "vlm_feedback_summary": []
    },
    {
      "overall_plan": "Establish a stable baseline pipeline by fixing prefix-generation and indexing issues and then proceed with the originally intended end-to-end predictive process monitoring workflow. Specifically, convert per-case timestamps from pandas Series to dense numpy arrays to enforce positional indexing and avoid KeyError due to label-based integer indexing. Constrain the prefix generation loop to stop at T-1 so every prefix has a valid next event, eliminating off-by-one errors. Keep the rest of the pipeline unchanged: perform a time-based split by case start time, build prefix features, train and evaluate a simple LSTM baseline, and compute/save the required BPM/PPM metrics. The current node functions as a seed node that codifies this corrected baseline without introducing new methodological changes, serving as the foundation for subsequent experimentation.",
      "analysis": "",
      "metric": {
        "value": {
          "metric_names": [
            {
              "metric_name": "train loss",
              "lower_is_better": true,
              "description": "Cross-entropy loss on the training split; lower indicates better fit.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.5148,
                  "best_value": 0.5148
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.3607,
                  "best_value": 0.3607
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.4662,
                  "best_value": 0.4662
                }
              ]
            },
            {
              "metric_name": "train accuracy",
              "lower_is_better": false,
              "description": "Classification accuracy on the training split; higher is better.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.7777,
                  "best_value": 0.7777
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.8422,
                  "best_value": 0.8422
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.7894,
                  "best_value": 0.7894
                }
              ]
            },
            {
              "metric_name": "train F1 score",
              "lower_is_better": false,
              "description": "F1 score on the training split; harmonic mean of precision and recall.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.5609,
                  "best_value": 0.5609
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.5721,
                  "best_value": 0.5721
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.5395,
                  "best_value": 0.5395
                }
              ]
            },
            {
              "metric_name": "train top-3 accuracy",
              "lower_is_better": false,
              "description": "Top-3 accuracy on the training split; prediction is correct if the true label is among the top 3 predictions.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.9868,
                  "best_value": 0.9868
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.9941,
                  "best_value": 0.9941
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.9986,
                  "best_value": 0.9986
                }
              ]
            },
            {
              "metric_name": "validation loss",
              "lower_is_better": true,
              "description": "Cross-entropy loss on the validation split; lower indicates better generalization.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.5073,
                  "best_value": 0.5073
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.3756,
                  "best_value": 0.3756
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.4274,
                  "best_value": 0.4274
                }
              ]
            },
            {
              "metric_name": "validation accuracy",
              "lower_is_better": false,
              "description": "Classification accuracy on the validation split; higher is better.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.7639,
                  "best_value": 0.7639
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.8405,
                  "best_value": 0.8405
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.8122,
                  "best_value": 0.8122
                }
              ]
            },
            {
              "metric_name": "validation F1 score",
              "lower_is_better": false,
              "description": "F1 score on the validation split; harmonic mean of precision and recall.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.6007,
                  "best_value": 0.6007
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.618,
                  "best_value": 0.618
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.6664,
                  "best_value": 0.6664
                }
              ]
            },
            {
              "metric_name": "validation top-3 accuracy",
              "lower_is_better": false,
              "description": "Top-3 accuracy on the validation split; prediction is correct if the true label is among the top 3 predictions.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.9861,
                  "best_value": 0.9861
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.9928,
                  "best_value": 0.9928
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.9969,
                  "best_value": 0.9969
                }
              ]
            },
            {
              "metric_name": "test loss",
              "lower_is_better": true,
              "description": "Cross-entropy loss on the test split; lower indicates better generalization.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.5355,
                  "best_value": 0.5355
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.3877,
                  "best_value": 0.3877
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.4833,
                  "best_value": 0.4833
                }
              ]
            },
            {
              "metric_name": "test accuracy",
              "lower_is_better": false,
              "description": "Classification accuracy on the test split; higher is better.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.7569,
                  "best_value": 0.7569
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.8332,
                  "best_value": 0.8332
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.802,
                  "best_value": 0.802
                }
              ]
            },
            {
              "metric_name": "test F1 score",
              "lower_is_better": false,
              "description": "F1 score on the test split; harmonic mean of precision and recall.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.5872,
                  "best_value": 0.5872
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.571,
                  "best_value": 0.571
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.474,
                  "best_value": 0.474
                }
              ]
            },
            {
              "metric_name": "test top-3 accuracy",
              "lower_is_better": false,
              "description": "Top-3 accuracy on the test split; prediction is correct if the true label is among the top 3 predictions.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.9874,
                  "best_value": 0.9874
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.9906,
                  "best_value": 0.9906
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.9936,
                  "best_value": 0.9936
                }
              ]
            }
          ]
        },
        "maximize": null,
        "name": null,
        "description": null
      },
      "code": "# Set random seed\nimport random\nimport numpy as np\nimport torch\n\nseed = 1\nrandom.seed(seed)\nnp.random.seed(seed)\ntorch.manual_seed(seed)\nif torch.cuda.is_available():\n    torch.cuda.manual_seed(seed)\n\nimport os\n\nworking_dir = os.path.join(os.getcwd(), \"working\")\nos.makedirs(working_dir, exist_ok=True)\n\nimport numpy as np\nimport pandas as pd\nimport torch\nfrom torch import nn\nfrom torch.utils.data import Dataset, DataLoader\nfrom sklearn.metrics import f1_score, accuracy_score\nimport random\nimport math\nimport matplotlib.pyplot as plt\n\n# Device handling (required)\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\nprint(f\"Using device: {device}\")\n\n# Experiment data structure\nexperiment_data = {\n    \"BPI2012\": {\n        \"metrics\": {\"train\": [], \"val\": [], \"test\": []},\n        \"losses\": {\"train\": [], \"val\": []},\n        \"predictions\": [],\n        \"ground_truth\": [],\n        \"epochs\": [],\n    },\n    \"BPI2017\": {\n        \"metrics\": {\"train\": [], \"val\": [], \"test\": []},\n        \"losses\": {\"train\": [], \"val\": []},\n        \"predictions\": [],\n        \"ground_truth\": [],\n        \"epochs\": [],\n    },\n    \"ROAD\": {\n        \"metrics\": {\"train\": [], \"val\": [], \"test\": []},\n        \"losses\": {\"train\": [], \"val\": []},\n        \"predictions\": [],\n        \"ground_truth\": [],\n        \"epochs\": [],\n    },\n}\n\n# Data loading utilities (use provided helper)\nfrom ai_scientist.ideas.my_research_topic import load_datasets, pick_default_dataset\n\n\n# Reproducibility\ndef set_seed(seed=42):\n    random.seed(seed)\n    np.random.seed(seed)\n    torch.manual_seed(seed)\n    if torch.cuda.is_available():\n        torch.cuda.manual_seed_all(seed)\n\n\nset_seed(42)\n\n\n# Build prefixes\ndef build_prefix_dataset(df, max_prefix_len=10, min_prefix_len=1):\n    # Keep only 'complete' transitions if lifecycle exists\n    df = df.copy()\n    if \"lifecycle\" in df.columns:\n        df = df[df[\"lifecycle\"].astype(str).str.lower().eq(\"complete\")]\n        if len(df) == 0:\n            df = df.copy()  # fallback if empty\n            df = df.sort_values([\"case_id\", \"timestamp\"])\n    df = df.sort_values([\"case_id\", \"timestamp\"])\n    # Build activity vocab\n    acts = df[\"activity\"].astype(str).unique().tolist()\n    act2id = {a: i + 1 for i, a in enumerate(sorted(acts))}  # 0 for PAD\n    id2act = {i: a for a, i in act2id.items()}\n    pad_id = 0\n\n    samples = []\n    for cid, g in df.groupby(\"case_id\"):\n        g = g.sort_values(\"timestamp\")\n        # Convert to numpy arrays for safe positional indexing\n        ts_ns = (\n            pd.to_datetime(g[\"timestamp\"], utc=True).astype(\"int64\").to_numpy()\n        )  # nanoseconds\n        ts = (ts_ns // 10**9).astype(np.int64)  # seconds as numpy array\n        acts_ids = np.array(\n            [act2id[a] for a in g[\"activity\"].astype(str).tolist()], dtype=np.int64\n        )\n        # simple calendar features\n        g_ts = pd.to_datetime(g[\"timestamp\"], utc=True)\n        hours = (g_ts.dt.hour.to_numpy(dtype=float) / 23.0).astype(np.float32)\n        weekdays = (g_ts.dt.weekday.to_numpy(dtype=float) / 6.0).astype(np.float32)\n        working = (\n            (g_ts.dt.weekday.to_numpy() < 5)\n            & (g_ts.dt.hour.to_numpy() >= 8)\n            & (g_ts.dt.hour.to_numpy() <= 17)\n        ).astype(np.float32)\n        # time deltas and since start in seconds\n        deltas = np.diff(ts, prepend=ts[0]).astype(np.float32)\n        since_start = (ts - ts[0]).astype(np.float32)\n        feats = np.stack(\n            [deltas, since_start, hours, weekdays, working], axis=1\n        ).astype(\n            np.float32\n        )  # [T,5]\n        T = len(acts_ids)\n        if T < 2:\n            continue\n        # Generate prefixes of length k (min_prefix_len..min(max_prefix_len, T-1)); target = activity at position k\n        max_k = min(max_prefix_len, T - 1)\n        for k in range(min_prefix_len, max_k + 1):\n            seq_acts = acts_ids[:k].tolist()\n            seq_feats = feats[:k]\n            target = int(acts_ids[k])\n            samples.append(\n                {\n                    \"case_id\": cid,\n                    \"seq_acts\": seq_acts,\n                    \"seq_feats\": seq_feats.copy(),\n                    \"target\": target,\n                    \"last_ts\": int(ts[k - 1]),\n                    \"next_ts\": int(ts[k]),\n                }\n            )\n\n    if len(samples) == 0:\n        return samples, act2id, id2act, pad_id\n\n    # Collect feature normalization stats over all feats (initial; will be recomputed on train split)\n    all_feats = np.concatenate(\n        [s[\"seq_feats\"] for s in samples if len(s[\"seq_feats\"]) > 0], axis=0\n    )\n    dt_mean, dt_std = all_feats[:, 0].mean(), all_feats[:, 0].std() + 1e-6\n    ss_mean, ss_std = all_feats[:, 1].mean(), all_feats[:, 1].std() + 1e-6\n    for s in samples:\n        if s[\"seq_feats\"].shape[0] > 0:\n            s[\"seq_feats\"][:, 0] = (s[\"seq_feats\"][:, 0] - dt_mean) / dt_std\n            s[\"seq_feats\"][:, 1] = (s[\"seq_feats\"][:, 1] - ss_mean) / ss_std\n    return samples, act2id, id2act, pad_id\n\n\n# Time-based split by case start time\ndef time_based_split(df, train_frac=0.7, val_frac=0.15):\n    starts = (\n        df.sort_values(\"timestamp\").groupby(\"case_id\")[\"timestamp\"].min().reset_index()\n    )\n    starts = starts.sort_values(\"timestamp\").reset_index(drop=True)\n    n = len(starts)\n    n_train = int(n * train_frac)\n    n_val = int(n * val_frac)\n    train_cases = set(starts.iloc[:n_train][\"case_id\"])\n    val_cases = set(starts.iloc[n_train : n_train + n_val][\"case_id\"])\n    test_cases = set(starts.iloc[n_train + n_val :][\"case_id\"])\n    return train_cases, val_cases, test_cases\n\n\nclass PrefixDataset(Dataset):\n    def __init__(self, samples, pad_id, max_len=10, num_cont=5):\n        self.samples = samples\n        self.pad_id = pad_id\n        self.max_len = max_len\n        self.num_cont = num_cont\n\n    def __len__(self):\n        return len(self.samples)\n\n    def __getitem__(self, idx):\n        s = self.samples[idx]\n        seq = s[\"seq_acts\"][-self.max_len :]\n        feats = s[\"seq_feats\"][-self.max_len :]\n        L = len(seq)\n        pad_len = self.max_len - L\n        seq_pad = [self.pad_id] * pad_len + seq\n        feats_pad = np.zeros((pad_len, self.num_cont), dtype=np.float32)\n        feats_pad = np.vstack([feats_pad, feats.astype(np.float32)])\n        attn_mask = np.array([0] * pad_len + [1] * L, dtype=np.float32)\n        return {\n            \"acts\": torch.tensor(seq_pad, dtype=torch.long),\n            \"feats\": torch.tensor(feats_pad, dtype=torch.float32),\n            \"mask\": torch.tensor(attn_mask, dtype=torch.float32),\n            \"y\": torch.tensor(s[\"target\"], dtype=torch.long),\n        }\n\n\nclass LSTMBaseline(nn.Module):\n    def __init__(\n        self, vocab_size, emb_dim=64, cont_dim=5, hidden=128, num_layers=1, pad_idx=0\n    ):\n        super().__init__()\n        self.emb = nn.Embedding(vocab_size + 1, emb_dim, padding_idx=pad_idx)\n        self.lstm = nn.LSTM(\n            input_size=emb_dim + cont_dim,\n            hidden_size=hidden,\n            batch_first=True,\n            num_layers=num_layers,\n        )\n        self.dropout = nn.Dropout(0.2)\n        self.fc = nn.Linear(hidden, vocab_size + 1)  # includes PAD index\n        self.pad_idx = pad_idx\n\n    def forward(self, acts, feats, mask):\n        x = self.emb(acts)  # [B,T,emb]\n        x = torch.cat([x, feats], dim=-1)\n        out, (h, c) = self.lstm(x)\n        h_last = h[-1]\n        h_last = self.dropout(h_last)\n        logits = self.fc(h_last)\n        return logits\n\n\ndef collate_fn(batch):\n    keys = batch[0].keys()\n    out = {k: torch.stack([b[k] for b in batch], dim=0) for k in keys}\n    return out\n\n\ndef evaluate(model, loader, criterion, device, num_classes, pad_idx):\n    model.eval()\n    total_loss = 0.0\n    ys, preds_top1, preds_probs = [], [], []\n    top3_correct = 0\n    n_total = 0\n    with torch.no_grad():\n        for batch in loader:\n            batch = {\n                k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)\n            }\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = criterion(logits, batch[\"y\"])\n            total_loss += loss.item() * logits.size(0)\n            probs = torch.softmax(logits, dim=1)\n            top1 = torch.argmax(probs, dim=1)\n            k_val = min(3, probs.size(1))\n            _, topk_idx = torch.topk(probs, k=k_val, dim=1)\n            ys.extend(batch[\"y\"].detach().cpu().tolist())\n            preds_top1.extend(top1.detach().cpu().tolist())\n            preds_probs.append(probs.detach().cpu().numpy())\n            # top-3 correctness\n            for i in range(batch[\"y\"].size(0)):\n                if batch[\"y\"][i].item() in topk_idx[i].detach().cpu().tolist():\n                    top3_correct += 1\n            n_total += batch[\"y\"].size(0)\n    avg_loss = total_loss / max(1, n_total)\n    y_true = np.array(ys)\n    y_pred = np.array(preds_top1)\n    mask = y_true != pad_idx\n    y_true = y_true[mask]\n    y_pred = y_pred[mask]\n    acc = float(accuracy_score(y_true, y_pred)) if len(y_true) > 0 else 0.0\n    try:\n        f1 = float(f1_score(y_true, y_pred, average=\"macro\"))\n    except Exception:\n        f1 = 0.0\n    top3 = float(top3_correct / max(1, n_total))\n    probs_concat = (\n        np.concatenate(preds_probs, axis=0)\n        if len(preds_probs) > 0\n        else np.zeros((0, num_classes + 1))\n    )\n    return avg_loss, acc, f1, top3, y_true, y_pred, probs_concat\n\n\ndef train_one_dataset(\n    name, df, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n):\n    print(f\"\\n=== Dataset: {name} ===\")\n    # Time-based split\n    train_cases, val_cases, test_cases = time_based_split(df, 0.7, 0.15)\n    # Build samples across all to get vocab; we'll re-normalize with train stats\n    samples_all, act2id, id2act, pad_id = build_prefix_dataset(\n        df, max_prefix_len=max_prefix_len\n    )\n    # Filter per split\n    samples_train = [s for s in samples_all if s[\"case_id\"] in train_cases]\n    samples_val = [s for s in samples_all if s[\"case_id\"] in val_cases]\n    samples_test = [s for s in samples_all if s[\"case_id\"] in test_cases]\n    # Recompute normalization using train samples only\n    if len(samples_train) > 0:\n        concat_feats = [\n            s[\"seq_feats\"] for s in samples_train if s[\"seq_feats\"].shape[0] > 0\n        ]\n        if len(concat_feats) > 0:\n            all_feats = np.concatenate(concat_feats, axis=0)\n            dt_mean, dt_std = all_feats[:, 0].mean(), all_feats[:, 0].std() + 1e-6\n            ss_mean, ss_std = all_feats[:, 1].mean(), all_feats[:, 1].std() + 1e-6\n\n            def norm_samples(samples):\n                for s in samples:\n                    if s[\"seq_feats\"].shape[0] > 0:\n                        s[\"seq_feats\"][:, 0] = (s[\"seq_feats\"][:, 0] - dt_mean) / dt_std\n                        s[\"seq_feats\"][:, 1] = (s[\"seq_feats\"][:, 1] - ss_mean) / ss_std\n\n            norm_samples(samples_train)\n            norm_samples(samples_val)\n            norm_samples(samples_test)\n    print(\n        f\"Samples train/val/test: {len(samples_train)}/{len(samples_val)}/{len(samples_test)}; vocab={len(act2id)}\"\n    )\n    if len(samples_train) == 0 or len(act2id) < 2:\n        print(\"Not enough data to train. Skipping.\")\n        return\n    ds_train = PrefixDataset(\n        samples_train, pad_id=pad_id, max_len=max_prefix_len, num_cont=5\n    )\n    ds_val = PrefixDataset(\n        samples_val, pad_id=pad_id, max_len=max_prefix_len, num_cont=5\n    )\n    ds_test = PrefixDataset(\n        samples_test, pad_id=pad_id, max_len=max_prefix_len, num_cont=5\n    )\n    dl_train = DataLoader(\n        ds_train,\n        batch_size=batch_size,\n        shuffle=True,\n        collate_fn=collate_fn,\n        num_workers=0,\n    )\n    dl_val = DataLoader(\n        ds_val,\n        batch_size=batch_size,\n        shuffle=False,\n        collate_fn=collate_fn,\n        num_workers=0,\n    )\n    dl_test = DataLoader(\n        ds_test,\n        batch_size=batch_size,\n        shuffle=False,\n        collate_fn=collate_fn,\n        num_workers=0,\n    )\n\n    # Model\n    model = LSTMBaseline(\n        vocab_size=len(act2id),\n        emb_dim=64,\n        cont_dim=5,\n        hidden=128,\n        num_layers=1,\n        pad_idx=pad_id,\n    ).to(device)\n    criterion = nn.CrossEntropyLoss().to(device)\n    optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n\n    # Training loop\n    best_val_top3 = -1.0\n    best_state = None\n    hist = {\"train_loss\": [], \"val_loss\": [], \"val_top3\": []}\n    for epoch in range(1, max_epochs + 1):\n        model.train()\n        total = 0\n        running_loss = 0.0\n        for batch in dl_train:\n            batch = {\n                k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)\n            }\n            optimizer.zero_grad()\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = criterion(logits, batch[\"y\"])\n            loss.backward()\n            optimizer.step()\n            running_loss += loss.item() * logits.size(0)\n            total += logits.size(0)\n        train_loss = running_loss / max(1, total)\n        val_loss, val_acc, val_f1, val_top3, _, _, _ = evaluate(\n            model, dl_val, criterion, device, len(act2id), pad_id\n        )\n        print(\n            f\"Epoch {epoch}: validation_loss = {val_loss:.4f} | val_acc={val_acc:.4f} | val_f1={val_f1:.4f} | val_top3={val_top3:.4f}\"\n        )\n        hist[\"train_loss\"].append(train_loss)\n        hist[\"val_loss\"].append(val_loss)\n        hist[\"val_top3\"].append(val_top3)\n        experiment_data[name][\"losses\"][\"train\"].append((epoch, train_loss))\n        experiment_data[name][\"losses\"][\"val\"].append((epoch, val_loss))\n        experiment_data[name][\"metrics\"][\"val\"].append(\n            (epoch, {\"acc\": val_acc, \"macro_f1\": val_f1, \"top3\": val_top3})\n        )\n        experiment_data[name][\"epochs\"].append(epoch)\n        if val_top3 > best_val_top3:\n            best_val_top3 = val_top3\n            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}\n\n    # Load best\n    if best_state is not None:\n        model.load_state_dict(best_state)\n        model.to(device)\n\n    # Final eval on train/val/test\n    train_loss, train_acc, train_f1, train_top3, _, _, _ = evaluate(\n        model, dl_train, criterion, device, len(act2id), pad_id\n    )\n    val_loss, val_acc, val_f1, val_top3, _, _, _ = evaluate(\n        model, dl_val, criterion, device, len(act2id), pad_id\n    )\n    test_loss, test_acc, test_f1, test_top3, y_true_t, y_pred_t, probs_t = evaluate(\n        model, dl_test, criterion, device, len(act2id), pad_id\n    )\n    print(\n        f\"[{name}] Train: loss={train_loss:.4f} acc={train_acc:.4f} f1={train_f1:.4f} top3={train_top3:.4f}\"\n    )\n    print(\n        f\"[{name}] Test:  loss={test_loss:.4f} acc={test_acc:.4f} f1={test_f1:.4f} top3={test_top3:.4f}\"\n    )\n\n    # Save metrics\n    experiment_data[name][\"metrics\"][\"train\"].append(\n        (\n            \"final\",\n            {\n                \"loss\": train_loss,\n                \"acc\": train_acc,\n                \"macro_f1\": train_f1,\n                \"top3\": train_top3,\n            },\n        )\n    )\n    experiment_data[name][\"metrics\"][\"val\"].append(\n        (\n            \"final\",\n            {\"loss\": val_loss, \"acc\": val_acc, \"macro_f1\": val_f1, \"top3\": val_top3},\n        )\n    )\n    experiment_data[name][\"metrics\"][\"test\"].append(\n        (\n            \"final\",\n            {\n                \"loss\": test_loss,\n                \"acc\": test_acc,\n                \"macro_f1\": test_f1,\n                \"top3\": test_top3,\n            },\n        )\n    )\n    experiment_data[name][\"predictions\"] = y_pred_t.tolist()\n    experiment_data[name][\"ground_truth\"] = y_true_t.tolist()\n\n    # Plots\n    try:\n        plt.figure()\n        plt.plot(hist[\"train_loss\"], label=\"train_loss\")\n        plt.plot(hist[\"val_loss\"], label=\"val_loss\")\n        plt.legend()\n        plt.title(f\"Loss Curves - {name}\")\n        plt.xlabel(\"Epoch\")\n        plt.ylabel(\"Loss\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"loss_curves_{name}.png\"))\n        plt.close()\n\n        plt.figure()\n        plt.plot(hist[\"val_top3\"], label=\"val_top3\")\n        plt.legend()\n        plt.title(f\"Val Top-3 Acc - {name}\")\n        plt.xlabel(\"Epoch\")\n        plt.ylabel(\"Top-3 Acc\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"val_top3_{name}.png\"))\n        plt.close()\n    except Exception as e:\n        print(f\"[warn] Plotting failed: {e}\")\n\n    # Save confusion matrix-like data (optional)\n    try:\n        from sklearn.metrics import confusion_matrix\n\n        cm = confusion_matrix(y_true_t, y_pred_t)\n        np.save(os.path.join(working_dir, f\"cm_{name}.npy\"), cm)\n    except Exception as e:\n        print(f\"[warn] Confusion matrix failed: {e}\")\n\n\ndef main():\n    datasets = load_datasets()\n    # Loop through loaded datasets; cap to 5000 earliest cases for speed\n    for key, df in datasets.items():\n        try:\n            starts = (\n                df.sort_values(\"timestamp\")\n                .groupby(\"case_id\")[\"timestamp\"]\n                .min()\n                .reset_index()\n            )\n            if len(starts) > 5000:\n                keep_cases = set(starts.iloc[:5000][\"case_id\"])\n                df_small = df[df[\"case_id\"].isin(keep_cases)].copy()\n            else:\n                df_small = df\n        except Exception:\n            df_small = df\n        train_one_dataset(\n            key, df_small, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n        )\n    # Save experiment data\n    np.save(os.path.join(working_dir, \"experiment_data.npy\"), experiment_data)\n    np.savez_compressed(\n        os.path.join(working_dir, \"experiment_data_compressed.npz\"),\n        data=experiment_data,\n    )\n\n\n# Execute immediately\nmain()\n",
      "plot_code": "import matplotlib.pyplot as plt\nimport numpy as np\nimport os\n\nworking_dir = os.path.join(os.getcwd(), \"working\")\nos.makedirs(working_dir, exist_ok=True)\n\nimport pandas as pd\nimport torch\nfrom torch import nn\nfrom torch.utils.data import Dataset, DataLoader\nfrom sklearn.metrics import (\n    f1_score,\n    accuracy_score,\n    confusion_matrix,\n    precision_recall_curve,\n    average_precision_score,\n)\nfrom sklearn.preprocessing import label_binarize\nimport random\nimport math\nfrom collections import defaultdict\n\n# Reproducibility and device\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\nprint(f\"Using device: {device}\")\n\n\ndef set_seed(seed=42):\n    random.seed(seed)\n    np.random.seed(seed)\n    torch.manual_seed(seed)\n    if torch.cuda.is_available():\n        torch.cuda.manual_seed_all(seed)\n\n\nset_seed(42)\n\n\n# --------- Data loading from local XES via pm4py ----------\ndef load_xes_folder(data_dir=\"data\"):\n    datasets = {}\n    try:\n        import pm4py\n    except Exception as e:\n        print(f\"pm4py not available: {e}\")\n        return datasets\n    if not os.path.isdir(data_dir):\n        print(f\"Data directory not found: {data_dir}\")\n        return datasets\n    for fn in os.listdir(data_dir):\n        if fn.lower().endswith(\".xes\") or fn.lower().endswith(\".xes.gz\"):\n            path = os.path.join(data_dir, fn)\n            try:\n                log = pm4py.read_xes(path)\n                df = pm4py.convert_to_dataframe(log)\n                # Standardize columns\n                # pm4py dataframe typically has case:concept:name, concept:name, time:timestamp, lifecycle:transition\n                cols = df.columns\n                case_col = (\n                    \"case:concept:name\"\n                    if \"case:concept:name\" in cols\n                    else (\"case\" if \"case\" in cols else None)\n                )\n                act_col = (\n                    \"concept:name\"\n                    if \"concept:name\" in cols\n                    else (\"activity\" if \"activity\" in cols else None)\n                )\n                ts_col = (\n                    \"time:timestamp\"\n                    if \"time:timestamp\" in cols\n                    else (\"timestamp\" if \"timestamp\" in cols else None)\n                )\n                life_col = (\n                    \"lifecycle:transition\"\n                    if \"lifecycle:transition\" in cols\n                    else (\"lifecycle\" if \"lifecycle\" in cols else None)\n                )\n                if case_col is None or act_col is None or ts_col is None:\n                    print(f\"Missing required columns in {fn}, skipping.\")\n                    continue\n                out = pd.DataFrame(\n                    {\n                        \"case_id\": df[case_col].astype(str).values,\n                        \"activity\": df[act_col].astype(str).values,\n                        \"timestamp\": pd.to_datetime(df[ts_col], utc=True),\n                    }\n                )\n                if life_col is not None:\n                    out[\"lifecycle\"] = df[life_col].astype(str).values\n                name = os.path.splitext(fn)[0]\n                datasets[name] = out\n                print(\n                    f\"Loaded {name}: {len(out)} events, {out['case_id'].nunique()} cases\"\n                )\n            except Exception as e:\n                print(f\"Failed to load {fn}: {e}\")\n    return datasets\n\n\n# --------- Prefix building and split ----------\ndef build_prefix_dataset(df, max_prefix_len=10, min_prefix_len=1):\n    df = df.copy()\n    if \"lifecycle\" in df.columns:\n        mask = df[\"lifecycle\"].astype(str).str.lower().eq(\"complete\")\n        if mask.any():\n            df = df[mask]\n    df = df.sort_values([\"case_id\", \"timestamp\"])\n    acts = df[\"activity\"].astype(str).unique().tolist()\n    act2id = {a: i + 1 for i, a in enumerate(sorted(acts))}\n    id2act = {i: a for a, i in act2id.items()}\n    pad_id = 0\n    samples = []\n    for cid, g in df.groupby(\"case_id\"):\n        g = g.sort_values(\"timestamp\")\n        if len(g) < 2:\n            continue\n        g_ts = pd.to_datetime(g[\"timestamp\"], utc=True)\n        ts = (g_ts.astype(\"int64\") // 10**9).to_numpy(np.int64)\n        acts_ids = np.array(\n            [act2id[a] for a in g[\"activity\"].astype(str)], dtype=np.int64\n        )\n        hours = (g_ts.dt.hour.to_numpy(dtype=float) / 23.0).astype(np.float32)\n        weekdays = (g_ts.dt.weekday.to_numpy(dtype=float) / 6.0).astype(np.float32)\n        working = (\n            (g_ts.dt.weekday.to_numpy() < 5)\n            & (g_ts.dt.hour.to_numpy() >= 8)\n            & (g_ts.dt.hour.to_numpy() <= 17)\n        ).astype(np.float32)\n        deltas = np.diff(ts, prepend=ts[0]).astype(np.float32)\n        since_start = (ts - ts[0]).astype(np.float32)\n        feats = np.stack(\n            [deltas, since_start, hours, weekdays, working], axis=1\n        ).astype(np.float32)\n        T = len(acts_ids)\n        max_k = min(max_prefix_len, T - 1)\n        for k in range(min_prefix_len, max_k + 1):\n            samples.append(\n                {\n                    \"case_id\": cid,\n                    \"seq_acts\": acts_ids[:k].tolist(),\n                    \"seq_feats\": feats[:k].copy(),\n                    \"target\": int(acts_ids[k]),\n                    \"prefix_len\": k,\n                }\n            )\n    if len(samples) == 0:\n        return samples, act2id, id2act, pad_id\n    all_feats = np.concatenate(\n        [s[\"seq_feats\"] for s in samples if len(s[\"seq_feats\"]) > 0], axis=0\n    )\n    for s in samples:\n        pass  # initial no norm; will norm on train split\n    return samples, act2id, id2act, pad_id\n\n\ndef time_based_split(df, train_frac=0.7, val_frac=0.15):\n    starts = (\n        df.sort_values(\"timestamp\").groupby(\"case_id\")[\"timestamp\"].min().reset_index()\n    )\n    starts = starts.sort_values(\"timestamp\").reset_index(drop=True)\n    n = len(starts)\n    n_train = int(n * train_frac)\n    n_val = int(n * val_frac)\n    train_cases = set(starts.iloc[:n_train][\"case_id\"])\n    val_cases = set(starts.iloc[n_train : n_train + n_val][\"case_id\"])\n    test_cases = set(starts.iloc[n_train + n_val :][\"case_id\"])\n    return train_cases, val_cases, test_cases\n\n\nclass PrefixDataset(Dataset):\n    def __init__(self, samples, pad_id, max_len=10, num_cont=5):\n        self.samples = samples\n        self.pad_id = pad_id\n        self.max_len = max_len\n        self.num_cont = num_cont\n\n    def __len__(self):\n        return len(self.samples)\n\n    def __getitem__(self, idx):\n        s = self.samples[idx]\n        seq = s[\"seq_acts\"][-self.max_len :]\n        feats = s[\"seq_feats\"][-self.max_len :]\n        L = len(seq)\n        pad_len = self.max_len - L\n        seq_pad = [self.pad_id] * pad_len + seq\n        feats_pad = np.vstack(\n            [\n                np.zeros((pad_len, self.num_cont), dtype=np.float32),\n                feats.astype(np.float32),\n            ]\n        )\n        attn = np.array([0] * pad_len + [1] * L, dtype=np.float32)\n        return {\n            \"acts\": torch.tensor(seq_pad).long(),\n            \"feats\": torch.tensor(feats_pad).float(),\n            \"mask\": torch.tensor(attn).float(),\n            \"y\": torch.tensor(s[\"target\"]).long(),\n            \"prefix_len\": L,\n        }\n\n\nclass LSTMBaseline(nn.Module):\n    def __init__(self, vocab_size, emb_dim=64, cont_dim=5, hidden=128, pad_idx=0):\n        super().__init__()\n        self.emb = nn.Embedding(vocab_size + 1, emb_dim, padding_idx=pad_idx)\n        self.lstm = nn.LSTM(\n            input_size=emb_dim + cont_dim, hidden_size=hidden, batch_first=True\n        )\n        self.dropout = nn.Dropout(0.2)\n        self.fc = nn.Linear(hidden, vocab_size + 1)\n\n    def forward(self, acts, feats, mask):\n        x = self.emb(acts)\n        x = torch.cat([x, feats], dim=-1)\n        out, (h, c) = self.lstm(x)\n        h = self.dropout(h[-1])\n        return self.fc(h)\n\n\ndef collate_fn(batch):\n    out = {\n        k: (\n            torch.stack([b[k] for b in batch], 0)\n            if isinstance(batch[0][k], torch.Tensor)\n            else [b[k] for b in batch]\n        )\n        for k in batch[0].keys()\n    }\n    return out\n\n\ndef evaluate(model, loader, criterion, device, num_classes, pad_idx):\n    model.eval()\n    total_loss = 0.0\n    ys = []\n    yhat = []\n    probs_list = []\n    n = 0\n    top3_correct = 0\n    pref_lens = []\n    top3_flags = []\n    with torch.no_grad():\n        for batch in loader:\n            batch = {\n                k: v.to(device) if isinstance(v, torch.Tensor) else v\n                for k, v in batch.items()\n            }\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = criterion(logits, batch[\"y\"])\n            total_loss += loss.item() * logits.size(0)\n            probs = torch.softmax(logits, dim=1)\n            top1 = torch.argmax(probs, dim=1)\n            k_val = min(3, probs.size(1))\n            _, topk = torch.topk(probs, k=k_val, dim=1)\n            y = batch[\"y\"]\n            ys.extend(y.detach().cpu().tolist())\n            yhat.extend(top1.detach().cpu().tolist())\n            probs_list.append(probs.detach().cpu().numpy())\n            for i in range(y.size(0)):\n                flag = int(y[i].item() in topk[i].detach().cpu().tolist())\n                top3_correct += flag\n                top3_flags.append(flag)\n                pref_lens.append(int(batch[\"prefix_len\"][i].item()))\n            n += y.size(0)\n    avg_loss = total_loss / max(1, n)\n    y_true = np.array(ys)\n    y_pred = np.array(yhat)\n    acc = float(accuracy_score(y_true, y_pred)) if len(y_true) > 0 else 0.0\n    try:\n        f1 = float(f1_score(y_true, y_pred, average=\"macro\"))\n    except:\n        f1 = 0.0\n    top3 = float(top3_correct / max(1, n))\n    probs_concat = (\n        np.concatenate(probs_list, axis=0)\n        if len(probs_list) > 0\n        else np.zeros((0, num_classes + 1))\n    )\n    return (\n        avg_loss,\n        acc,\n        f1,\n        top3,\n        y_true,\n        y_pred,\n        probs_concat,\n        np.array(pref_lens),\n        np.array(top3_flags),\n    )\n\n\ndef train_on_dataset(\n    name, df, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n):\n    print(f\"\\n=== Dataset: {name} ===\")\n    train_cases, val_cases, test_cases = time_based_split(df, 0.7, 0.15)\n    samples_all, act2id, id2act, pad_id = build_prefix_dataset(\n        df, max_prefix_len=max_prefix_len\n    )\n    s_train = [s for s in samples_all if s[\"case_id\"] in train_cases]\n    s_val = [s for s in samples_all if s[\"case_id\"] in val_cases]\n    s_test = [s for s in samples_all if s[\"case_id\"] in test_cases]\n    # normalize time features on train\n    if len(s_train) > 0:\n        feats = np.concatenate(\n            [s[\"seq_feats\"] for s in s_train if len(s[\"seq_feats\"]) > 0], axis=0\n        )\n        dt_mean, dt_std = feats[:, 0].mean(), feats[:, 0].std() + 1e-6\n        ss_mean, ss_std = feats[:, 1].mean(), feats[:, 1].std() + 1e-6\n\n        def norm(samples):\n            for s in samples:\n                if s[\"seq_feats\"].shape[0] > 0:\n                    s[\"seq_feats\"][:, 0] = (s[\"seq_feats\"][:, 0] - dt_mean) / dt_std\n                    s[\"seq_feats\"][:, 1] = (s[\"seq_feats\"][:, 1] - ss_mean) / ss_std\n\n        norm(s_train)\n        norm(s_val)\n        norm(s_test)\n    print(\n        f\"Samples train/val/test: {len(s_train)}/{len(s_val)}/{len(s_test)}; vocab={len(act2id)}\"\n    )\n    if len(s_train) == 0 or len(act2id) < 2:\n        print(\"Insufficient data; skipping.\")\n        return None\n    ds_tr = PrefixDataset(s_train, pad_id, max_prefix_len, 5)\n    ds_va = PrefixDataset(s_val, pad_id, max_prefix_len, 5)\n    ds_te = PrefixDataset(s_test, pad_id, max_prefix_len, 5)\n    dl_tr = DataLoader(\n        ds_tr, batch_size=batch_size, shuffle=True, collate_fn=collate_fn\n    )\n    dl_va = DataLoader(\n        ds_va, batch_size=batch_size, shuffle=False, collate_fn=collate_fn\n    )\n    dl_te = DataLoader(\n        ds_te, batch_size=batch_size, shuffle=False, collate_fn=collate_fn\n    )\n    model = LSTMBaseline(\n        vocab_size=len(act2id), emb_dim=64, cont_dim=5, hidden=128, pad_idx=pad_id\n    ).to(device)\n    crit = nn.CrossEntropyLoss().to(device)\n    opt = torch.optim.Adam(model.parameters(), lr=lr)\n    best_top3 = -1.0\n    best_state = None\n    history = {\"train_loss\": [], \"val_loss\": [], \"val_top3\": []}\n    for ep in range(1, max_epochs + 1):\n        model.train()\n        tot = 0\n        run_loss = 0.0\n        for batch in dl_tr:\n            batch = {\n                k: v.to(device) if isinstance(v, torch.Tensor) else v\n                for k, v in batch.items()\n            }\n            opt.zero_grad()\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = crit(logits, batch[\"y\"])\n            loss.backward()\n            opt.step()\n            run_loss += loss.item() * logits.size(0)\n            tot += logits.size(0)\n        tr_loss = run_loss / max(1, tot)\n        va_loss, va_acc, va_f1, va_top3, *_ = evaluate(\n            model, dl_va, crit, device, len(act2id), pad_id\n        )\n        print(\n            f\"Epoch {ep}: val_loss={va_loss:.4f} acc={va_acc:.4f} f1={va_f1:.4f} top3={va_top3:.4f}\"\n        )\n        history[\"train_loss\"].append(tr_loss)\n        history[\"val_loss\"].append(va_loss)\n        history[\"val_top3\"].append(va_top3)\n        if va_top3 > best_top3:\n            best_top3 = va_top3\n            best_state = {\n                k: v.detach().cpu().clone() for k, v in model.state_dict().items()\n            }\n    if best_state is not None:\n        model.load_state_dict(best_state)\n        model.to(device)\n    tr_loss, tr_acc, tr_f1, tr_top3, *_ = evaluate(\n        model, dl_tr, crit, device, len(act2id), pad_id\n    )\n    te_loss, te_acc, te_f1, te_top3, y_true, y_pred, probs, pref_lens, top3_flags = (\n        evaluate(model, dl_te, crit, device, len(act2id), pad_id)\n    )\n    print(\n        f\"[{name}] Test: loss={te_loss:.4f} acc={te_acc:.4f} f1={te_f1:.4f} top3={te_top3:.4f}\"\n    )\n    # package experiment data\n    exp = {\n        \"metrics\": {\n            \"train\": [\n                (\n                    \"final\",\n                    {\n                        \"loss\": tr_loss,\n                        \"acc\": tr_acc,\n                        \"macro_f1\": tr_f1,\n                        \"top3\": tr_top3,\n                    },\n                )\n            ],\n            \"val\": [],\n            \"test\": [\n                (\n                    \"final\",\n                    {\n                        \"loss\": te_loss,\n                        \"acc\": te_acc,\n                        \"macro_f1\": te_f1,\n                        \"top3\": te_top3,\n                    },\n                )\n            ],\n        },\n        \"losses\": {\n            \"train\": list(enumerate(history[\"train_loss\"], start=1)),\n            \"val\": list(enumerate(history[\"val_loss\"], start=1)),\n        },\n        \"predictions\": y_pred.tolist(),\n        \"ground_truth\": y_true.tolist(),\n        \"epochs\": list(range(1, len(history[\"train_loss\"]) + 1)),\n        \"probs\": probs,\n        \"prefix_lens\": pref_lens.tolist(),\n        \"top3_flags\": top3_flags.tolist(),\n        \"act2id\": act2id,\n    }\n    # plots for this dataset\n    try:\n        plt.figure()\n        plt.plot(history[\"train_loss\"], label=\"train\")\n        plt.plot(history[\"val_loss\"], label=\"val\")\n        plt.legend()\n        plt.title(f\"Loss Curves - {name}\")\n        plt.xlabel(\"Epoch\")\n        plt.ylabel(\"Loss\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"{name}_loss_curves.png\"))\n        plt.close()\n    except Exception as e:\n        print(f\"Error creating loss curves for {name}: {e}\")\n        plt.close()\n    try:\n        cm = confusion_matrix(y_true, y_pred)\n        plt.figure(figsize=(5, 4))\n        plt.imshow(cm, aspect=\"auto\", cmap=\"Blues\")\n        plt.colorbar()\n        plt.title(f\"Confusion Matrix (Test) - {name}\\nNext-activity\")\n        plt.xlabel(\"Predicted\")\n        plt.ylabel(\"True\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"{name}_confusion_matrix.png\"))\n        plt.close()\n    except Exception as e:\n        print(f\"Error creating confusion matrix for {name}: {e}\")\n        plt.close()\n    try:\n        # Top-3 accuracy vs prefix length\n        if len(pref_lens) > 0:\n            d = defaultdict(list)\n            for L, flag in zip(pref_lens, top3_flags):\n                d[int(L)].append(int(flag))\n            xs = sorted(d.keys())\n            ys = [np.mean(d[k]) for k in xs]\n            plt.figure()\n            plt.plot(xs, ys, marker=\"o\")\n            plt.title(f\"Top-3 Accuracy vs Prefix Length - {name}\\nNext-activity\")\n            plt.xlabel(\"Prefix Length\")\n            plt.ylabel(\"Top-3 Accuracy\")\n            plt.tight_layout()\n            plt.savefig(os.path.join(working_dir, f\"{name}_top3_vs_prefixlen.png\"))\n            plt.close()\n    except Exception as e:\n        print(f\"Error creating Top-3 vs prefix length for {name}: {e}\")\n        plt.close()\n    try:\n        # Macro PR curve (one-vs-rest); may be coarse due to many classes\n        if probs.shape[0] > 0:\n            classes = np.unique(y_true)\n            Y = label_binarize(y_true, classes=range(probs.shape[1]))\n            # only keep columns present in classes to avoid PAD\n            present = [c for c in classes]\n            if len(present) > 1:\n                precisions = []\n                recalls = []\n                aps = []\n                for c in present:\n                    p, r, _ = precision_recall_curve(Y[:, c], probs[:, c])\n                    ap = average_precision_score(Y[:, c], probs[:, c])\n                    precisions.append(\n                        np.interp(np.linspace(0, 1, 101), r[::-1], p[::-1])\n                    )\n                    recalls.append(np.linspace(0, 1, 101))\n                    aps.append(ap)\n                macro_p = np.mean(np.stack(precisions, 0), 0)\n                macro_r = np.linspace(0, 1, 101)\n                plt.figure()\n                plt.plot(macro_r, macro_p, label=f\"Macro-PR (mAP={np.mean(aps):.3f})\")\n                plt.title(f\"Macro Precision-Recall (Test) - {name}\\nNext-activity\")\n                plt.xlabel(\"Recall\")\n                plt.ylabel(\"Precision\")\n                plt.legend()\n                plt.tight_layout()\n                plt.savefig(os.path.join(working_dir, f\"{name}_macro_pr.png\"))\n                plt.close()\n    except Exception as e:\n        print(f\"Error creating PR curve for {name}: {e}\")\n        plt.close()\n    return name, exp\n\n\ndef main():\n    datasets = load_xes_folder(data_dir=os.path.join(os.getcwd(), \"data\"))\n    experiment_data = {}\n    for name, df in datasets.items():\n        # optional cap earliest 5000 cases\n        try:\n            starts = (\n                df.sort_values(\"timestamp\")\n                .groupby(\"case_id\")[\"timestamp\"]\n                .min()\n                .reset_index()\n            )\n            if len(starts) > 5000:\n                keep = set(starts.iloc[:5000][\"case_id\"])\n                df = df[df[\"case_id\"].isin(keep)].copy()\n        except:\n            pass\n        res = train_on_dataset(\n            name, df, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n        )\n        if res is not None:\n            k, exp = res\n            experiment_data[k] = exp\n    # Save experiment data\n    np.save(os.path.join(working_dir, \"experiment_data.npy\"), experiment_data)\n    # Print evaluation metrics\n    for k, v in experiment_data.items():\n        test_metrics = dict(v[\"metrics\"][\"test\"][0][1])\n        print(\n            f\"{k} | Test acc={test_metrics['acc']:.4f} macro_f1={test_metrics['macro_f1']:.4f} top3={test_metrics['top3']:.4f} loss={test_metrics['loss']:.4f}\"\n        )\n\n    # Secondary plotting pass strictly from experiment_data.npy (as required)\n    try:\n        experiment_data_loaded = np.load(\n            os.path.join(working_dir, \"experiment_data.npy\"), allow_pickle=True\n        ).item()\n    except Exception as e:\n        print(f\"Error loading experiment data: {e}\")\n        experiment_data_loaded = {}\n    for name, ed in experiment_data_loaded.items():\n        try:\n            # re-plot loss curves from saved data\n            plt.figure()\n            tl = [y for (_, y) in ed.get(\"losses\", {}).get(\"train\", [])]\n            vl = [y for (_, y) in ed.get(\"losses\", {}).get(\"val\", [])]\n            if len(tl) > 0:\n                plt.plot(tl, label=\"train\")\n            if len(vl) > 0:\n                plt.plot(vl, label=\"val\")\n            plt.legend()\n            plt.title(f\"Loss Curves - {name}\\nNext-activity\")\n            plt.xlabel(\"Epoch\")\n            plt.ylabel(\"Loss\")\n            plt.tight_layout()\n            plt.savefig(os.path.join(working_dir, f\"{name}_loss_curves_reload.png\"))\n            plt.close()\n        except Exception as e:\n            print(f\"Error creating plot1: {e}\")\n            plt.close()\n        try:\n            # confusion matrix from predictions and ground truth\n            y_true = ed.get(\"ground_truth\", [])\n            y_pred = ed.get(\"predictions\", [])\n            if len(y_true) > 0 and len(y_pred) > 0:\n                cm = confusion_matrix(y_true, y_pred)\n                plt.figure(figsize=(5, 4))\n                plt.imshow(cm, aspect=\"auto\", cmap=\"Blues\")\n                plt.colorbar()\n                plt.title(f\"Confusion Matrix (Test) - {name}\\nNext-activity\")\n                plt.xlabel(\"Predicted\")\n                plt.ylabel(\"True\")\n                plt.tight_layout()\n                plt.savefig(\n                    os.path.join(working_dir, f\"{name}_confusion_matrix_reload.png\")\n                )\n                plt.close()\n        except Exception as e:\n            print(f\"Error creating plot2: {e}\")\n            plt.close()\n        try:\n            # Top-3 vs prefix length if present\n            pref = ed.get(\"prefix_lens\", [])\n            flags = ed.get(\"top3_flags\", [])\n            if len(pref) > 0 and len(flags) > 0:\n                d = defaultdict(list)\n                for L, f in zip(pref, flags):\n                    d[int(L)].append(int(f))\n                xs = sorted(d.keys())\n                ys = [float(np.mean(d[x])) for x in xs]\n                plt.figure()\n                plt.plot(xs, ys, marker=\"o\")\n                plt.title(f\"Top-3 Accuracy vs Prefix Length - {name}\\nNext-activity\")\n                plt.xlabel(\"Prefix Length\")\n                plt.ylabel(\"Top-3 Accuracy\")\n                plt.tight_layout()\n                plt.savefig(\n                    os.path.join(working_dir, f\"{name}_top3_vs_prefixlen_reload.png\")\n                )\n                plt.close()\n        except Exception as e:\n            print(f\"Error creating plot3: {e}\")\n            plt.close()\n        try:\n            # Macro PR curve if probs available\n            probs = np.array(ed.get(\"probs\", []))\n            y_true = ed.get(\"ground_truth\", [])\n            if probs.size > 0 and len(y_true) > 0:\n                classes = sorted(set(y_true))\n                Y = label_binarize(np.array(y_true), classes=range(probs.shape[1]))\n                present = classes\n                if len(present) > 1:\n                    precisions = []\n                    aps = []\n                    grid = np.linspace(0, 1, 101)\n                    for c in present:\n                        p, r, _ = precision_recall_curve(Y[:, c], probs[:, c])\n                        precisions.append(np.interp(grid, r[::-1], p[::-1]))\n                        aps.append(average_precision_score(Y[:, c], probs[:, c]))\n                    macro_p = np.mean(np.stack(precisions, 0), 0)\n                    plt.figure()\n                    plt.plot(grid, macro_p, label=f\"mAP={np.mean(aps):.3f}\")\n                    plt.title(f\"Macro Precision-Recall (Test) - {name}\\nNext-activity\")\n                    plt.xlabel(\"Recall\")\n                    plt.ylabel(\"Precision\")\n                    plt.legend()\n                    plt.tight_layout()\n                    plt.savefig(\n                        os.path.join(working_dir, f\"{name}_macro_pr_reload.png\")\n                    )\n                    plt.close()\n        except Exception as e:\n            print(f\"Error creating plot4: {e}\")\n            plt.close()\n\n\nif __name__ == \"__main__\":\n    main()\n",
      "plot_analyses": [
        {
          "analysis": "The validation Top-3 Accuracy for BPI2017 shows a consistent improvement over the epochs, indicating that the model is learning effectively and improving its prediction capability for the next activities in the process. This suggests that the tuned hyperparameters are effective for this dataset, as the accuracy continues to increase without signs of overfitting.",
          "plot_path": "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/val_top3_BPI2017.png"
        },
        {
          "analysis": "For BPI2012, the validation Top-3 Accuracy initially increases rapidly but then shows fluctuations. The accuracy peaks around epoch 3-4, followed by a slight decrease. This could indicate that the model has reached its learning capacity for the given hyperparameters or that it may benefit from further tuning or regularization to stabilize the performance.",
          "plot_path": "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/val_top3_BPI2012.png"
        },
        {
          "analysis": "The loss curves for ROAD demonstrate a significant drop in both training and validation loss within the initial epochs, stabilizing towards the end. This suggests that the model is converging well with minimal overfitting, as the training and validation losses are closely aligned. The rapid decrease in loss is promising for the model's predictive performance.",
          "plot_path": "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/loss_curves_ROAD.png"
        },
        {
          "analysis": "In BPI2017, the loss curves show a similar pattern to ROAD, with a sharp decline initially followed by stabilization. This indicates effective convergence and suggests that the hyperparameters are well-suited for this dataset, with the model learning efficiently without significant overfitting.",
          "plot_path": "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/loss_curves_BPI2017.png"
        },
        {
          "analysis": "Val Top-3 Accuracy for ROAD shows rapid improvement and stabilization, maintaining a high accuracy from epoch 3 onwards. This suggests that the model is very effective for this dataset, achieving strong predictive performance early in the training process. The stability of the accuracy indicates good hyperparameter tuning.",
          "plot_path": "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/val_top3_ROAD.png"
        },
        {
          "analysis": "The loss curves for BPI2012 exhibit a sharp decrease initially, with both training and validation loss aligning closely, which is indicative of good convergence. However, towards the end, there is a slight increase in validation loss, which may be a sign of slight overfitting. Additional regularization or further hyperparameter tuning could be explored to address this.",
          "plot_path": "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/loss_curves_BPI2012.png"
        }
      ],
      "plot_paths": [
        "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/val_top3_BPI2017.png",
        "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/val_top3_BPI2012.png",
        "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/loss_curves_ROAD.png",
        "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/loss_curves_BPI2017.png",
        "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/val_top3_ROAD.png",
        "experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/loss_curves_BPI2012.png"
      ],
      "vlm_feedback_summary": "The results indicate that the model's hyperparameter tuning is generally effective across the different datasets, with strong predictive performance and convergence demonstrated. However, some datasets show signs of fluctuation or potential overfitting, suggesting room for further optimization or regularization. Overall, the experiments provide valuable insights into the model's capabilities and areas for improvement.",
      "exp_results_dir": "experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839",
      "exp_results_npy_files": [
        "experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/cm_BPI2012.npy",
        "experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/cm_ROAD.npy",
        "experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/experiment_data.npy",
        "experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/cm_BPI2017.npy"
      ]
    },
    {
      "overall_plan": "Stabilize and execute a robust PPM pipeline focused on next-event prediction using a simple LSTM baseline. The primary fixes address a crash caused by pandas Series integer label indexing: convert per-case timestamps to NumPy arrays to guarantee positional indexing and eliminate KeyErrors, and make the prefix generation loop stop at T-1 to ensure a valid next event and avoid off-by-one errors. Retain the established experimental setup: time-based split by case start to prevent leakage, prefix feature construction, LSTM modeling, and proper BPM/PPM metric reporting and saving. The current node serves as a seed/initialization step that reaffirms this stabilized plan without introducing new methodological changes.",
      "analysis": "Primary issue: Feature normalization is applied twice and leaks information across splits. In build_prefix_dataset(), continuous features (deltas and since_start) are normalized using statistics computed over all samples (including validation/test). Later in train_one_dataset(), you attempt to \u201cre-normalize\u201d using train-only stats by applying a z-score again to the already-normalized values. This both introduces leakage (initial global stats influenced test/val) and yields incorrect scaling (z-scoring an already standardized variable). Fix: Remove any normalization from build_prefix_dataset and return raw feature values. After performing the time-based case split, compute mean/std on train samples only and then normalize train/val/test features from their raw values. Implementation options:\n- Easiest: Add a flag normalize=False in build_prefix_dataset (default False) and skip the initial normalization. In train_one_dataset, after splitting, compute dt_mean,dt_std,ss_mean,ss_std from train samples and apply to all splits.\n- Alternatively, have build_prefix_dataset return both raw and normalized features or just raw, and do all normalization post-split.\nSecondary improvements (not hard bugs but recommended):\n- Exclude the PAD class from evaluation/top-k by zeroing its logit or masking it before softmax/topk, and compute macro-F1 over actual activity classes only. Also set ignore_index=pad_idx in CrossEntropyLoss if ever labels could be PAD.\n- Minor: when lifecycle filter removes all events, your fallback keeps all events; consider explicitly handling lifecycle values rather than silently reverting.\nDespite the normalization issue, the run completes and reports per-log next-activity metrics (acc, macro-F1, top-3) with time-based splits on BPI2012, BPI2017, and ROAD as required.",
      "metric": {
        "value": {
          "metric_names": [
            {
              "metric_name": "loss",
              "lower_is_better": true,
              "description": "Cross-entropy loss on the test split; lower values indicate better performance.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.5355,
                  "best_value": 0.5355
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.3877,
                  "best_value": 0.3877
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.4833,
                  "best_value": 0.4833
                }
              ]
            },
            {
              "metric_name": "accuracy",
              "lower_is_better": false,
              "description": "Classification accuracy on the test split; proportion of correct predictions.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.7569,
                  "best_value": 0.7569
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.8332,
                  "best_value": 0.8332
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.802,
                  "best_value": 0.802
                }
              ]
            },
            {
              "metric_name": "F1 score",
              "lower_is_better": false,
              "description": "F1 score on the test split; harmonic mean of precision and recall.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.5872,
                  "best_value": 0.5872
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.571,
                  "best_value": 0.571
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.474,
                  "best_value": 0.474
                }
              ]
            },
            {
              "metric_name": "top-3 accuracy",
              "lower_is_better": false,
              "description": "Top-3 accuracy on the test split; proportion where the correct label is among the top 3 predictions.",
              "data": [
                {
                  "dataset_name": "BPI2012",
                  "final_value": 0.9874,
                  "best_value": 0.9874
                },
                {
                  "dataset_name": "BPI2017",
                  "final_value": 0.9906,
                  "best_value": 0.9906
                },
                {
                  "dataset_name": "ROAD",
                  "final_value": 0.9936,
                  "best_value": 0.9936
                }
              ]
            }
          ]
        },
        "maximize": null,
        "name": null,
        "description": null
      },
      "code": "# Set random seed\nimport random\nimport numpy as np\nimport torch\n\nseed = 2\nrandom.seed(seed)\nnp.random.seed(seed)\ntorch.manual_seed(seed)\nif torch.cuda.is_available():\n    torch.cuda.manual_seed(seed)\n\nimport os\n\nworking_dir = os.path.join(os.getcwd(), \"working\")\nos.makedirs(working_dir, exist_ok=True)\n\nimport numpy as np\nimport pandas as pd\nimport torch\nfrom torch import nn\nfrom torch.utils.data import Dataset, DataLoader\nfrom sklearn.metrics import f1_score, accuracy_score\nimport random\nimport math\nimport matplotlib.pyplot as plt\n\n# Device handling (required)\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\nprint(f\"Using device: {device}\")\n\n# Experiment data structure\nexperiment_data = {\n    \"BPI2012\": {\n        \"metrics\": {\"train\": [], \"val\": [], \"test\": []},\n        \"losses\": {\"train\": [], \"val\": []},\n        \"predictions\": [],\n        \"ground_truth\": [],\n        \"epochs\": [],\n    },\n    \"BPI2017\": {\n        \"metrics\": {\"train\": [], \"val\": [], \"test\": []},\n        \"losses\": {\"train\": [], \"val\": []},\n        \"predictions\": [],\n        \"ground_truth\": [],\n        \"epochs\": [],\n    },\n    \"ROAD\": {\n        \"metrics\": {\"train\": [], \"val\": [], \"test\": []},\n        \"losses\": {\"train\": [], \"val\": []},\n        \"predictions\": [],\n        \"ground_truth\": [],\n        \"epochs\": [],\n    },\n}\n\n# Data loading utilities (use provided helper)\nfrom ai_scientist.ideas.my_research_topic import load_datasets, pick_default_dataset\n\n\n# Reproducibility\ndef set_seed(seed=42):\n    random.seed(seed)\n    np.random.seed(seed)\n    torch.manual_seed(seed)\n    if torch.cuda.is_available():\n        torch.cuda.manual_seed_all(seed)\n\n\nset_seed(42)\n\n\n# Build prefixes\ndef build_prefix_dataset(df, max_prefix_len=10, min_prefix_len=1):\n    # Keep only 'complete' transitions if lifecycle exists\n    df = df.copy()\n    if \"lifecycle\" in df.columns:\n        df = df[df[\"lifecycle\"].astype(str).str.lower().eq(\"complete\")]\n        if len(df) == 0:\n            df = df.copy()  # fallback if empty\n            df = df.sort_values([\"case_id\", \"timestamp\"])\n    df = df.sort_values([\"case_id\", \"timestamp\"])\n    # Build activity vocab\n    acts = df[\"activity\"].astype(str).unique().tolist()\n    act2id = {a: i + 1 for i, a in enumerate(sorted(acts))}  # 0 for PAD\n    id2act = {i: a for a, i in act2id.items()}\n    pad_id = 0\n\n    samples = []\n    for cid, g in df.groupby(\"case_id\"):\n        g = g.sort_values(\"timestamp\")\n        # Convert to numpy arrays for safe positional indexing\n        ts_ns = (\n            pd.to_datetime(g[\"timestamp\"], utc=True).astype(\"int64\").to_numpy()\n        )  # nanoseconds\n        ts = (ts_ns // 10**9).astype(np.int64)  # seconds as numpy array\n        acts_ids = np.array(\n            [act2id[a] for a in g[\"activity\"].astype(str).tolist()], dtype=np.int64\n        )\n        # simple calendar features\n        g_ts = pd.to_datetime(g[\"timestamp\"], utc=True)\n        hours = (g_ts.dt.hour.to_numpy(dtype=float) / 23.0).astype(np.float32)\n        weekdays = (g_ts.dt.weekday.to_numpy(dtype=float) / 6.0).astype(np.float32)\n        working = (\n            (g_ts.dt.weekday.to_numpy() < 5)\n            & (g_ts.dt.hour.to_numpy() >= 8)\n            & (g_ts.dt.hour.to_numpy() <= 17)\n        ).astype(np.float32)\n        # time deltas and since start in seconds\n        deltas = np.diff(ts, prepend=ts[0]).astype(np.float32)\n        since_start = (ts - ts[0]).astype(np.float32)\n        feats = np.stack(\n            [deltas, since_start, hours, weekdays, working], axis=1\n        ).astype(\n            np.float32\n        )  # [T,5]\n        T = len(acts_ids)\n        if T < 2:\n            continue\n        # Generate prefixes of length k (min_prefix_len..min(max_prefix_len, T-1)); target = activity at position k\n        max_k = min(max_prefix_len, T - 1)\n        for k in range(min_prefix_len, max_k + 1):\n            seq_acts = acts_ids[:k].tolist()\n            seq_feats = feats[:k]\n            target = int(acts_ids[k])\n            samples.append(\n                {\n                    \"case_id\": cid,\n                    \"seq_acts\": seq_acts,\n                    \"seq_feats\": seq_feats.copy(),\n                    \"target\": target,\n                    \"last_ts\": int(ts[k - 1]),\n                    \"next_ts\": int(ts[k]),\n                }\n            )\n\n    if len(samples) == 0:\n        return samples, act2id, id2act, pad_id\n\n    # Collect feature normalization stats over all feats (initial; will be recomputed on train split)\n    all_feats = np.concatenate(\n        [s[\"seq_feats\"] for s in samples if len(s[\"seq_feats\"]) > 0], axis=0\n    )\n    dt_mean, dt_std = all_feats[:, 0].mean(), all_feats[:, 0].std() + 1e-6\n    ss_mean, ss_std = all_feats[:, 1].mean(), all_feats[:, 1].std() + 1e-6\n    for s in samples:\n        if s[\"seq_feats\"].shape[0] > 0:\n            s[\"seq_feats\"][:, 0] = (s[\"seq_feats\"][:, 0] - dt_mean) / dt_std\n            s[\"seq_feats\"][:, 1] = (s[\"seq_feats\"][:, 1] - ss_mean) / ss_std\n    return samples, act2id, id2act, pad_id\n\n\n# Time-based split by case start time\ndef time_based_split(df, train_frac=0.7, val_frac=0.15):\n    starts = (\n        df.sort_values(\"timestamp\").groupby(\"case_id\")[\"timestamp\"].min().reset_index()\n    )\n    starts = starts.sort_values(\"timestamp\").reset_index(drop=True)\n    n = len(starts)\n    n_train = int(n * train_frac)\n    n_val = int(n * val_frac)\n    train_cases = set(starts.iloc[:n_train][\"case_id\"])\n    val_cases = set(starts.iloc[n_train : n_train + n_val][\"case_id\"])\n    test_cases = set(starts.iloc[n_train + n_val :][\"case_id\"])\n    return train_cases, val_cases, test_cases\n\n\nclass PrefixDataset(Dataset):\n    def __init__(self, samples, pad_id, max_len=10, num_cont=5):\n        self.samples = samples\n        self.pad_id = pad_id\n        self.max_len = max_len\n        self.num_cont = num_cont\n\n    def __len__(self):\n        return len(self.samples)\n\n    def __getitem__(self, idx):\n        s = self.samples[idx]\n        seq = s[\"seq_acts\"][-self.max_len :]\n        feats = s[\"seq_feats\"][-self.max_len :]\n        L = len(seq)\n        pad_len = self.max_len - L\n        seq_pad = [self.pad_id] * pad_len + seq\n        feats_pad = np.zeros((pad_len, self.num_cont), dtype=np.float32)\n        feats_pad = np.vstack([feats_pad, feats.astype(np.float32)])\n        attn_mask = np.array([0] * pad_len + [1] * L, dtype=np.float32)\n        return {\n            \"acts\": torch.tensor(seq_pad, dtype=torch.long),\n            \"feats\": torch.tensor(feats_pad, dtype=torch.float32),\n            \"mask\": torch.tensor(attn_mask, dtype=torch.float32),\n            \"y\": torch.tensor(s[\"target\"], dtype=torch.long),\n        }\n\n\nclass LSTMBaseline(nn.Module):\n    def __init__(\n        self, vocab_size, emb_dim=64, cont_dim=5, hidden=128, num_layers=1, pad_idx=0\n    ):\n        super().__init__()\n        self.emb = nn.Embedding(vocab_size + 1, emb_dim, padding_idx=pad_idx)\n        self.lstm = nn.LSTM(\n            input_size=emb_dim + cont_dim,\n            hidden_size=hidden,\n            batch_first=True,\n            num_layers=num_layers,\n        )\n        self.dropout = nn.Dropout(0.2)\n        self.fc = nn.Linear(hidden, vocab_size + 1)  # includes PAD index\n        self.pad_idx = pad_idx\n\n    def forward(self, acts, feats, mask):\n        x = self.emb(acts)  # [B,T,emb]\n        x = torch.cat([x, feats], dim=-1)\n        out, (h, c) = self.lstm(x)\n        h_last = h[-1]\n        h_last = self.dropout(h_last)\n        logits = self.fc(h_last)\n        return logits\n\n\ndef collate_fn(batch):\n    keys = batch[0].keys()\n    out = {k: torch.stack([b[k] for b in batch], dim=0) for k in keys}\n    return out\n\n\ndef evaluate(model, loader, criterion, device, num_classes, pad_idx):\n    model.eval()\n    total_loss = 0.0\n    ys, preds_top1, preds_probs = [], [], []\n    top3_correct = 0\n    n_total = 0\n    with torch.no_grad():\n        for batch in loader:\n            batch = {\n                k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)\n            }\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = criterion(logits, batch[\"y\"])\n            total_loss += loss.item() * logits.size(0)\n            probs = torch.softmax(logits, dim=1)\n            top1 = torch.argmax(probs, dim=1)\n            k_val = min(3, probs.size(1))\n            _, topk_idx = torch.topk(probs, k=k_val, dim=1)\n            ys.extend(batch[\"y\"].detach().cpu().tolist())\n            preds_top1.extend(top1.detach().cpu().tolist())\n            preds_probs.append(probs.detach().cpu().numpy())\n            # top-3 correctness\n            for i in range(batch[\"y\"].size(0)):\n                if batch[\"y\"][i].item() in topk_idx[i].detach().cpu().tolist():\n                    top3_correct += 1\n            n_total += batch[\"y\"].size(0)\n    avg_loss = total_loss / max(1, n_total)\n    y_true = np.array(ys)\n    y_pred = np.array(preds_top1)\n    mask = y_true != pad_idx\n    y_true = y_true[mask]\n    y_pred = y_pred[mask]\n    acc = float(accuracy_score(y_true, y_pred)) if len(y_true) > 0 else 0.0\n    try:\n        f1 = float(f1_score(y_true, y_pred, average=\"macro\"))\n    except Exception:\n        f1 = 0.0\n    top3 = float(top3_correct / max(1, n_total))\n    probs_concat = (\n        np.concatenate(preds_probs, axis=0)\n        if len(preds_probs) > 0\n        else np.zeros((0, num_classes + 1))\n    )\n    return avg_loss, acc, f1, top3, y_true, y_pred, probs_concat\n\n\ndef train_one_dataset(\n    name, df, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n):\n    print(f\"\\n=== Dataset: {name} ===\")\n    # Time-based split\n    train_cases, val_cases, test_cases = time_based_split(df, 0.7, 0.15)\n    # Build samples across all to get vocab; we'll re-normalize with train stats\n    samples_all, act2id, id2act, pad_id = build_prefix_dataset(\n        df, max_prefix_len=max_prefix_len\n    )\n    # Filter per split\n    samples_train = [s for s in samples_all if s[\"case_id\"] in train_cases]\n    samples_val = [s for s in samples_all if s[\"case_id\"] in val_cases]\n    samples_test = [s for s in samples_all if s[\"case_id\"] in test_cases]\n    # Recompute normalization using train samples only\n    if len(samples_train) > 0:\n        concat_feats = [\n            s[\"seq_feats\"] for s in samples_train if s[\"seq_feats\"].shape[0] > 0\n        ]\n        if len(concat_feats) > 0:\n            all_feats = np.concatenate(concat_feats, axis=0)\n            dt_mean, dt_std = all_feats[:, 0].mean(), all_feats[:, 0].std() + 1e-6\n            ss_mean, ss_std = all_feats[:, 1].mean(), all_feats[:, 1].std() + 1e-6\n\n            def norm_samples(samples):\n                for s in samples:\n                    if s[\"seq_feats\"].shape[0] > 0:\n                        s[\"seq_feats\"][:, 0] = (s[\"seq_feats\"][:, 0] - dt_mean) / dt_std\n                        s[\"seq_feats\"][:, 1] = (s[\"seq_feats\"][:, 1] - ss_mean) / ss_std\n\n            norm_samples(samples_train)\n            norm_samples(samples_val)\n            norm_samples(samples_test)\n    print(\n        f\"Samples train/val/test: {len(samples_train)}/{len(samples_val)}/{len(samples_test)}; vocab={len(act2id)}\"\n    )\n    if len(samples_train) == 0 or len(act2id) < 2:\n        print(\"Not enough data to train. Skipping.\")\n        return\n    ds_train = PrefixDataset(\n        samples_train, pad_id=pad_id, max_len=max_prefix_len, num_cont=5\n    )\n    ds_val = PrefixDataset(\n        samples_val, pad_id=pad_id, max_len=max_prefix_len, num_cont=5\n    )\n    ds_test = PrefixDataset(\n        samples_test, pad_id=pad_id, max_len=max_prefix_len, num_cont=5\n    )\n    dl_train = DataLoader(\n        ds_train,\n        batch_size=batch_size,\n        shuffle=True,\n        collate_fn=collate_fn,\n        num_workers=0,\n    )\n    dl_val = DataLoader(\n        ds_val,\n        batch_size=batch_size,\n        shuffle=False,\n        collate_fn=collate_fn,\n        num_workers=0,\n    )\n    dl_test = DataLoader(\n        ds_test,\n        batch_size=batch_size,\n        shuffle=False,\n        collate_fn=collate_fn,\n        num_workers=0,\n    )\n\n    # Model\n    model = LSTMBaseline(\n        vocab_size=len(act2id),\n        emb_dim=64,\n        cont_dim=5,\n        hidden=128,\n        num_layers=1,\n        pad_idx=pad_id,\n    ).to(device)\n    criterion = nn.CrossEntropyLoss().to(device)\n    optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n\n    # Training loop\n    best_val_top3 = -1.0\n    best_state = None\n    hist = {\"train_loss\": [], \"val_loss\": [], \"val_top3\": []}\n    for epoch in range(1, max_epochs + 1):\n        model.train()\n        total = 0\n        running_loss = 0.0\n        for batch in dl_train:\n            batch = {\n                k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)\n            }\n            optimizer.zero_grad()\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = criterion(logits, batch[\"y\"])\n            loss.backward()\n            optimizer.step()\n            running_loss += loss.item() * logits.size(0)\n            total += logits.size(0)\n        train_loss = running_loss / max(1, total)\n        val_loss, val_acc, val_f1, val_top3, _, _, _ = evaluate(\n            model, dl_val, criterion, device, len(act2id), pad_id\n        )\n        print(\n            f\"Epoch {epoch}: validation_loss = {val_loss:.4f} | val_acc={val_acc:.4f} | val_f1={val_f1:.4f} | val_top3={val_top3:.4f}\"\n        )\n        hist[\"train_loss\"].append(train_loss)\n        hist[\"val_loss\"].append(val_loss)\n        hist[\"val_top3\"].append(val_top3)\n        experiment_data[name][\"losses\"][\"train\"].append((epoch, train_loss))\n        experiment_data[name][\"losses\"][\"val\"].append((epoch, val_loss))\n        experiment_data[name][\"metrics\"][\"val\"].append(\n            (epoch, {\"acc\": val_acc, \"macro_f1\": val_f1, \"top3\": val_top3})\n        )\n        experiment_data[name][\"epochs\"].append(epoch)\n        if val_top3 > best_val_top3:\n            best_val_top3 = val_top3\n            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}\n\n    # Load best\n    if best_state is not None:\n        model.load_state_dict(best_state)\n        model.to(device)\n\n    # Final eval on train/val/test\n    train_loss, train_acc, train_f1, train_top3, _, _, _ = evaluate(\n        model, dl_train, criterion, device, len(act2id), pad_id\n    )\n    val_loss, val_acc, val_f1, val_top3, _, _, _ = evaluate(\n        model, dl_val, criterion, device, len(act2id), pad_id\n    )\n    test_loss, test_acc, test_f1, test_top3, y_true_t, y_pred_t, probs_t = evaluate(\n        model, dl_test, criterion, device, len(act2id), pad_id\n    )\n    print(\n        f\"[{name}] Train: loss={train_loss:.4f} acc={train_acc:.4f} f1={train_f1:.4f} top3={train_top3:.4f}\"\n    )\n    print(\n        f\"[{name}] Test:  loss={test_loss:.4f} acc={test_acc:.4f} f1={test_f1:.4f} top3={test_top3:.4f}\"\n    )\n\n    # Save metrics\n    experiment_data[name][\"metrics\"][\"train\"].append(\n        (\n            \"final\",\n            {\n                \"loss\": train_loss,\n                \"acc\": train_acc,\n                \"macro_f1\": train_f1,\n                \"top3\": train_top3,\n            },\n        )\n    )\n    experiment_data[name][\"metrics\"][\"val\"].append(\n        (\n            \"final\",\n            {\"loss\": val_loss, \"acc\": val_acc, \"macro_f1\": val_f1, \"top3\": val_top3},\n        )\n    )\n    experiment_data[name][\"metrics\"][\"test\"].append(\n        (\n            \"final\",\n            {\n                \"loss\": test_loss,\n                \"acc\": test_acc,\n                \"macro_f1\": test_f1,\n                \"top3\": test_top3,\n            },\n        )\n    )\n    experiment_data[name][\"predictions\"] = y_pred_t.tolist()\n    experiment_data[name][\"ground_truth\"] = y_true_t.tolist()\n\n    # Plots\n    try:\n        plt.figure()\n        plt.plot(hist[\"train_loss\"], label=\"train_loss\")\n        plt.plot(hist[\"val_loss\"], label=\"val_loss\")\n        plt.legend()\n        plt.title(f\"Loss Curves - {name}\")\n        plt.xlabel(\"Epoch\")\n        plt.ylabel(\"Loss\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"loss_curves_{name}.png\"))\n        plt.close()\n\n        plt.figure()\n        plt.plot(hist[\"val_top3\"], label=\"val_top3\")\n        plt.legend()\n        plt.title(f\"Val Top-3 Acc - {name}\")\n        plt.xlabel(\"Epoch\")\n        plt.ylabel(\"Top-3 Acc\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"val_top3_{name}.png\"))\n        plt.close()\n    except Exception as e:\n        print(f\"[warn] Plotting failed: {e}\")\n\n    # Save confusion matrix-like data (optional)\n    try:\n        from sklearn.metrics import confusion_matrix\n\n        cm = confusion_matrix(y_true_t, y_pred_t)\n        np.save(os.path.join(working_dir, f\"cm_{name}.npy\"), cm)\n    except Exception as e:\n        print(f\"[warn] Confusion matrix failed: {e}\")\n\n\ndef main():\n    datasets = load_datasets()\n    # Loop through loaded datasets; cap to 5000 earliest cases for speed\n    for key, df in datasets.items():\n        try:\n            starts = (\n                df.sort_values(\"timestamp\")\n                .groupby(\"case_id\")[\"timestamp\"]\n                .min()\n                .reset_index()\n            )\n            if len(starts) > 5000:\n                keep_cases = set(starts.iloc[:5000][\"case_id\"])\n                df_small = df[df[\"case_id\"].isin(keep_cases)].copy()\n            else:\n                df_small = df\n        except Exception:\n            df_small = df\n        train_one_dataset(\n            key, df_small, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n        )\n    # Save experiment data\n    np.save(os.path.join(working_dir, \"experiment_data.npy\"), experiment_data)\n    np.savez_compressed(\n        os.path.join(working_dir, \"experiment_data_compressed.npz\"),\n        data=experiment_data,\n    )\n\n\n# Execute immediately\nmain()\n",
      "plot_code": "import matplotlib.pyplot as plt\nimport numpy as np\nimport os\n\nworking_dir = os.path.join(os.getcwd(), \"working\")\nos.makedirs(working_dir, exist_ok=True)\n\nimport pandas as pd\nimport torch\nfrom torch import nn\nfrom torch.utils.data import Dataset, DataLoader\nfrom sklearn.metrics import (\n    f1_score,\n    accuracy_score,\n    confusion_matrix,\n    precision_recall_curve,\n    average_precision_score,\n)\nfrom sklearn.preprocessing import label_binarize\nimport random\nimport math\nfrom collections import defaultdict\n\n# Reproducibility and device\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\nprint(f\"Using device: {device}\")\n\n\ndef set_seed(seed=42):\n    random.seed(seed)\n    np.random.seed(seed)\n    torch.manual_seed(seed)\n    if torch.cuda.is_available():\n        torch.cuda.manual_seed_all(seed)\n\n\nset_seed(42)\n\n\n# --------- Data loading from local XES via pm4py ----------\ndef load_xes_folder(data_dir=\"data\"):\n    datasets = {}\n    try:\n        import pm4py\n    except Exception as e:\n        print(f\"pm4py not available: {e}\")\n        return datasets\n    if not os.path.isdir(data_dir):\n        print(f\"Data directory not found: {data_dir}\")\n        return datasets\n    for fn in os.listdir(data_dir):\n        if fn.lower().endswith(\".xes\") or fn.lower().endswith(\".xes.gz\"):\n            path = os.path.join(data_dir, fn)\n            try:\n                log = pm4py.read_xes(path)\n                df = pm4py.convert_to_dataframe(log)\n                # Standardize columns\n                # pm4py dataframe typically has case:concept:name, concept:name, time:timestamp, lifecycle:transition\n                cols = df.columns\n                case_col = (\n                    \"case:concept:name\"\n                    if \"case:concept:name\" in cols\n                    else (\"case\" if \"case\" in cols else None)\n                )\n                act_col = (\n                    \"concept:name\"\n                    if \"concept:name\" in cols\n                    else (\"activity\" if \"activity\" in cols else None)\n                )\n                ts_col = (\n                    \"time:timestamp\"\n                    if \"time:timestamp\" in cols\n                    else (\"timestamp\" if \"timestamp\" in cols else None)\n                )\n                life_col = (\n                    \"lifecycle:transition\"\n                    if \"lifecycle:transition\" in cols\n                    else (\"lifecycle\" if \"lifecycle\" in cols else None)\n                )\n                if case_col is None or act_col is None or ts_col is None:\n                    print(f\"Missing required columns in {fn}, skipping.\")\n                    continue\n                out = pd.DataFrame(\n                    {\n                        \"case_id\": df[case_col].astype(str).values,\n                        \"activity\": df[act_col].astype(str).values,\n                        \"timestamp\": pd.to_datetime(df[ts_col], utc=True),\n                    }\n                )\n                if life_col is not None:\n                    out[\"lifecycle\"] = df[life_col].astype(str).values\n                name = os.path.splitext(fn)[0]\n                datasets[name] = out\n                print(\n                    f\"Loaded {name}: {len(out)} events, {out['case_id'].nunique()} cases\"\n                )\n            except Exception as e:\n                print(f\"Failed to load {fn}: {e}\")\n    return datasets\n\n\n# --------- Prefix building and split ----------\ndef build_prefix_dataset(df, max_prefix_len=10, min_prefix_len=1):\n    df = df.copy()\n    if \"lifecycle\" in df.columns:\n        mask = df[\"lifecycle\"].astype(str).str.lower().eq(\"complete\")\n        if mask.any():\n            df = df[mask]\n    df = df.sort_values([\"case_id\", \"timestamp\"])\n    acts = df[\"activity\"].astype(str).unique().tolist()\n    act2id = {a: i + 1 for i, a in enumerate(sorted(acts))}\n    id2act = {i: a for a, i in act2id.items()}\n    pad_id = 0\n    samples = []\n    for cid, g in df.groupby(\"case_id\"):\n        g = g.sort_values(\"timestamp\")\n        if len(g) < 2:\n            continue\n        g_ts = pd.to_datetime(g[\"timestamp\"], utc=True)\n        ts = (g_ts.astype(\"int64\") // 10**9).to_numpy(np.int64)\n        acts_ids = np.array(\n            [act2id[a] for a in g[\"activity\"].astype(str)], dtype=np.int64\n        )\n        hours = (g_ts.dt.hour.to_numpy(dtype=float) / 23.0).astype(np.float32)\n        weekdays = (g_ts.dt.weekday.to_numpy(dtype=float) / 6.0).astype(np.float32)\n        working = (\n            (g_ts.dt.weekday.to_numpy() < 5)\n            & (g_ts.dt.hour.to_numpy() >= 8)\n            & (g_ts.dt.hour.to_numpy() <= 17)\n        ).astype(np.float32)\n        deltas = np.diff(ts, prepend=ts[0]).astype(np.float32)\n        since_start = (ts - ts[0]).astype(np.float32)\n        feats = np.stack(\n            [deltas, since_start, hours, weekdays, working], axis=1\n        ).astype(np.float32)\n        T = len(acts_ids)\n        max_k = min(max_prefix_len, T - 1)\n        for k in range(min_prefix_len, max_k + 1):\n            samples.append(\n                {\n                    \"case_id\": cid,\n                    \"seq_acts\": acts_ids[:k].tolist(),\n                    \"seq_feats\": feats[:k].copy(),\n                    \"target\": int(acts_ids[k]),\n                    \"prefix_len\": k,\n                }\n            )\n    if len(samples) == 0:\n        return samples, act2id, id2act, pad_id\n    all_feats = np.concatenate(\n        [s[\"seq_feats\"] for s in samples if len(s[\"seq_feats\"]) > 0], axis=0\n    )\n    for s in samples:\n        pass  # initial no norm; will norm on train split\n    return samples, act2id, id2act, pad_id\n\n\ndef time_based_split(df, train_frac=0.7, val_frac=0.15):\n    starts = (\n        df.sort_values(\"timestamp\").groupby(\"case_id\")[\"timestamp\"].min().reset_index()\n    )\n    starts = starts.sort_values(\"timestamp\").reset_index(drop=True)\n    n = len(starts)\n    n_train = int(n * train_frac)\n    n_val = int(n * val_frac)\n    train_cases = set(starts.iloc[:n_train][\"case_id\"])\n    val_cases = set(starts.iloc[n_train : n_train + n_val][\"case_id\"])\n    test_cases = set(starts.iloc[n_train + n_val :][\"case_id\"])\n    return train_cases, val_cases, test_cases\n\n\nclass PrefixDataset(Dataset):\n    def __init__(self, samples, pad_id, max_len=10, num_cont=5):\n        self.samples = samples\n        self.pad_id = pad_id\n        self.max_len = max_len\n        self.num_cont = num_cont\n\n    def __len__(self):\n        return len(self.samples)\n\n    def __getitem__(self, idx):\n        s = self.samples[idx]\n        seq = s[\"seq_acts\"][-self.max_len :]\n        feats = s[\"seq_feats\"][-self.max_len :]\n        L = len(seq)\n        pad_len = self.max_len - L\n        seq_pad = [self.pad_id] * pad_len + seq\n        feats_pad = np.vstack(\n            [\n                np.zeros((pad_len, self.num_cont), dtype=np.float32),\n                feats.astype(np.float32),\n            ]\n        )\n        attn = np.array([0] * pad_len + [1] * L, dtype=np.float32)\n        return {\n            \"acts\": torch.tensor(seq_pad).long(),\n            \"feats\": torch.tensor(feats_pad).float(),\n            \"mask\": torch.tensor(attn).float(),\n            \"y\": torch.tensor(s[\"target\"]).long(),\n            \"prefix_len\": L,\n        }\n\n\nclass LSTMBaseline(nn.Module):\n    def __init__(self, vocab_size, emb_dim=64, cont_dim=5, hidden=128, pad_idx=0):\n        super().__init__()\n        self.emb = nn.Embedding(vocab_size + 1, emb_dim, padding_idx=pad_idx)\n        self.lstm = nn.LSTM(\n            input_size=emb_dim + cont_dim, hidden_size=hidden, batch_first=True\n        )\n        self.dropout = nn.Dropout(0.2)\n        self.fc = nn.Linear(hidden, vocab_size + 1)\n\n    def forward(self, acts, feats, mask):\n        x = self.emb(acts)\n        x = torch.cat([x, feats], dim=-1)\n        out, (h, c) = self.lstm(x)\n        h = self.dropout(h[-1])\n        return self.fc(h)\n\n\ndef collate_fn(batch):\n    out = {\n        k: (\n            torch.stack([b[k] for b in batch], 0)\n            if isinstance(batch[0][k], torch.Tensor)\n            else [b[k] for b in batch]\n        )\n        for k in batch[0].keys()\n    }\n    return out\n\n\ndef evaluate(model, loader, criterion, device, num_classes, pad_idx):\n    model.eval()\n    total_loss = 0.0\n    ys = []\n    yhat = []\n    probs_list = []\n    n = 0\n    top3_correct = 0\n    pref_lens = []\n    top3_flags = []\n    with torch.no_grad():\n        for batch in loader:\n            batch = {\n                k: v.to(device) if isinstance(v, torch.Tensor) else v\n                for k, v in batch.items()\n            }\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = criterion(logits, batch[\"y\"])\n            total_loss += loss.item() * logits.size(0)\n            probs = torch.softmax(logits, dim=1)\n            top1 = torch.argmax(probs, dim=1)\n            k_val = min(3, probs.size(1))\n            _, topk = torch.topk(probs, k=k_val, dim=1)\n            y = batch[\"y\"]\n            ys.extend(y.detach().cpu().tolist())\n            yhat.extend(top1.detach().cpu().tolist())\n            probs_list.append(probs.detach().cpu().numpy())\n            for i in range(y.size(0)):\n                flag = int(y[i].item() in topk[i].detach().cpu().tolist())\n                top3_correct += flag\n                top3_flags.append(flag)\n                pref_lens.append(int(batch[\"prefix_len\"][i].item()))\n            n += y.size(0)\n    avg_loss = total_loss / max(1, n)\n    y_true = np.array(ys)\n    y_pred = np.array(yhat)\n    acc = float(accuracy_score(y_true, y_pred)) if len(y_true) > 0 else 0.0\n    try:\n        f1 = float(f1_score(y_true, y_pred, average=\"macro\"))\n    except:\n        f1 = 0.0\n    top3 = float(top3_correct / max(1, n))\n    probs_concat = (\n        np.concatenate(probs_list, axis=0)\n        if len(probs_list) > 0\n        else np.zeros((0, num_classes + 1))\n    )\n    return (\n        avg_loss,\n        acc,\n        f1,\n        top3,\n        y_true,\n        y_pred,\n        probs_concat,\n        np.array(pref_lens),\n        np.array(top3_flags),\n    )\n\n\ndef train_on_dataset(\n    name, df, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n):\n    print(f\"\\n=== Dataset: {name} ===\")\n    train_cases, val_cases, test_cases = time_based_split(df, 0.7, 0.15)\n    samples_all, act2id, id2act, pad_id = build_prefix_dataset(\n        df, max_prefix_len=max_prefix_len\n    )\n    s_train = [s for s in samples_all if s[\"case_id\"] in train_cases]\n    s_val = [s for s in samples_all if s[\"case_id\"] in val_cases]\n    s_test = [s for s in samples_all if s[\"case_id\"] in test_cases]\n    # normalize time features on train\n    if len(s_train) > 0:\n        feats = np.concatenate(\n            [s[\"seq_feats\"] for s in s_train if len(s[\"seq_feats\"]) > 0], axis=0\n        )\n        dt_mean, dt_std = feats[:, 0].mean(), feats[:, 0].std() + 1e-6\n        ss_mean, ss_std = feats[:, 1].mean(), feats[:, 1].std() + 1e-6\n\n        def norm(samples):\n            for s in samples:\n                if s[\"seq_feats\"].shape[0] > 0:\n                    s[\"seq_feats\"][:, 0] = (s[\"seq_feats\"][:, 0] - dt_mean) / dt_std\n                    s[\"seq_feats\"][:, 1] = (s[\"seq_feats\"][:, 1] - ss_mean) / ss_std\n\n        norm(s_train)\n        norm(s_val)\n        norm(s_test)\n    print(\n        f\"Samples train/val/test: {len(s_train)}/{len(s_val)}/{len(s_test)}; vocab={len(act2id)}\"\n    )\n    if len(s_train) == 0 or len(act2id) < 2:\n        print(\"Insufficient data; skipping.\")\n        return None\n    ds_tr = PrefixDataset(s_train, pad_id, max_prefix_len, 5)\n    ds_va = PrefixDataset(s_val, pad_id, max_prefix_len, 5)\n    ds_te = PrefixDataset(s_test, pad_id, max_prefix_len, 5)\n    dl_tr = DataLoader(\n        ds_tr, batch_size=batch_size, shuffle=True, collate_fn=collate_fn\n    )\n    dl_va = DataLoader(\n        ds_va, batch_size=batch_size, shuffle=False, collate_fn=collate_fn\n    )\n    dl_te = DataLoader(\n        ds_te, batch_size=batch_size, shuffle=False, collate_fn=collate_fn\n    )\n    model = LSTMBaseline(\n        vocab_size=len(act2id), emb_dim=64, cont_dim=5, hidden=128, pad_idx=pad_id\n    ).to(device)\n    crit = nn.CrossEntropyLoss().to(device)\n    opt = torch.optim.Adam(model.parameters(), lr=lr)\n    best_top3 = -1.0\n    best_state = None\n    history = {\"train_loss\": [], \"val_loss\": [], \"val_top3\": []}\n    for ep in range(1, max_epochs + 1):\n        model.train()\n        tot = 0\n        run_loss = 0.0\n        for batch in dl_tr:\n            batch = {\n                k: v.to(device) if isinstance(v, torch.Tensor) else v\n                for k, v in batch.items()\n            }\n            opt.zero_grad()\n            logits = model(batch[\"acts\"], batch[\"feats\"], batch[\"mask\"])\n            loss = crit(logits, batch[\"y\"])\n            loss.backward()\n            opt.step()\n            run_loss += loss.item() * logits.size(0)\n            tot += logits.size(0)\n        tr_loss = run_loss / max(1, tot)\n        va_loss, va_acc, va_f1, va_top3, *_ = evaluate(\n            model, dl_va, crit, device, len(act2id), pad_id\n        )\n        print(\n            f\"Epoch {ep}: val_loss={va_loss:.4f} acc={va_acc:.4f} f1={va_f1:.4f} top3={va_top3:.4f}\"\n        )\n        history[\"train_loss\"].append(tr_loss)\n        history[\"val_loss\"].append(va_loss)\n        history[\"val_top3\"].append(va_top3)\n        if va_top3 > best_top3:\n            best_top3 = va_top3\n            best_state = {\n                k: v.detach().cpu().clone() for k, v in model.state_dict().items()\n            }\n    if best_state is not None:\n        model.load_state_dict(best_state)\n        model.to(device)\n    tr_loss, tr_acc, tr_f1, tr_top3, *_ = evaluate(\n        model, dl_tr, crit, device, len(act2id), pad_id\n    )\n    te_loss, te_acc, te_f1, te_top3, y_true, y_pred, probs, pref_lens, top3_flags = (\n        evaluate(model, dl_te, crit, device, len(act2id), pad_id)\n    )\n    print(\n        f\"[{name}] Test: loss={te_loss:.4f} acc={te_acc:.4f} f1={te_f1:.4f} top3={te_top3:.4f}\"\n    )\n    # package experiment data\n    exp = {\n        \"metrics\": {\n            \"train\": [\n                (\n                    \"final\",\n                    {\n                        \"loss\": tr_loss,\n                        \"acc\": tr_acc,\n                        \"macro_f1\": tr_f1,\n                        \"top3\": tr_top3,\n                    },\n                )\n            ],\n            \"val\": [],\n            \"test\": [\n                (\n                    \"final\",\n                    {\n                        \"loss\": te_loss,\n                        \"acc\": te_acc,\n                        \"macro_f1\": te_f1,\n                        \"top3\": te_top3,\n                    },\n                )\n            ],\n        },\n        \"losses\": {\n            \"train\": list(enumerate(history[\"train_loss\"], start=1)),\n            \"val\": list(enumerate(history[\"val_loss\"], start=1)),\n        },\n        \"predictions\": y_pred.tolist(),\n        \"ground_truth\": y_true.tolist(),\n        \"epochs\": list(range(1, len(history[\"train_loss\"]) + 1)),\n        \"probs\": probs,\n        \"prefix_lens\": pref_lens.tolist(),\n        \"top3_flags\": top3_flags.tolist(),\n        \"act2id\": act2id,\n    }\n    # plots for this dataset\n    try:\n        plt.figure()\n        plt.plot(history[\"train_loss\"], label=\"train\")\n        plt.plot(history[\"val_loss\"], label=\"val\")\n        plt.legend()\n        plt.title(f\"Loss Curves - {name}\")\n        plt.xlabel(\"Epoch\")\n        plt.ylabel(\"Loss\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"{name}_loss_curves.png\"))\n        plt.close()\n    except Exception as e:\n        print(f\"Error creating loss curves for {name}: {e}\")\n        plt.close()\n    try:\n        cm = confusion_matrix(y_true, y_pred)\n        plt.figure(figsize=(5, 4))\n        plt.imshow(cm, aspect=\"auto\", cmap=\"Blues\")\n        plt.colorbar()\n        plt.title(f\"Confusion Matrix (Test) - {name}\\nNext-activity\")\n        plt.xlabel(\"Predicted\")\n        plt.ylabel(\"True\")\n        plt.tight_layout()\n        plt.savefig(os.path.join(working_dir, f\"{name}_confusion_matrix.png\"))\n        plt.close()\n    except Exception as e:\n        print(f\"Error creating confusion matrix for {name}: {e}\")\n        plt.close()\n    try:\n        # Top-3 accuracy vs prefix length\n        if len(pref_lens) > 0:\n            d = defaultdict(list)\n            for L, flag in zip(pref_lens, top3_flags):\n                d[int(L)].append(int(flag))\n            xs = sorted(d.keys())\n            ys = [np.mean(d[k]) for k in xs]\n            plt.figure()\n            plt.plot(xs, ys, marker=\"o\")\n            plt.title(f\"Top-3 Accuracy vs Prefix Length - {name}\\nNext-activity\")\n            plt.xlabel(\"Prefix Length\")\n            plt.ylabel(\"Top-3 Accuracy\")\n            plt.tight_layout()\n            plt.savefig(os.path.join(working_dir, f\"{name}_top3_vs_prefixlen.png\"))\n            plt.close()\n    except Exception as e:\n        print(f\"Error creating Top-3 vs prefix length for {name}: {e}\")\n        plt.close()\n    try:\n        # Macro PR curve (one-vs-rest); may be coarse due to many classes\n        if probs.shape[0] > 0:\n            classes = np.unique(y_true)\n            Y = label_binarize(y_true, classes=range(probs.shape[1]))\n            # only keep columns present in classes to avoid PAD\n            present = [c for c in classes]\n            if len(present) > 1:\n                precisions = []\n                recalls = []\n                aps = []\n                for c in present:\n                    p, r, _ = precision_recall_curve(Y[:, c], probs[:, c])\n                    ap = average_precision_score(Y[:, c], probs[:, c])\n                    precisions.append(\n                        np.interp(np.linspace(0, 1, 101), r[::-1], p[::-1])\n                    )\n                    recalls.append(np.linspace(0, 1, 101))\n                    aps.append(ap)\n                macro_p = np.mean(np.stack(precisions, 0), 0)\n                macro_r = np.linspace(0, 1, 101)\n                plt.figure()\n                plt.plot(macro_r, macro_p, label=f\"Macro-PR (mAP={np.mean(aps):.3f})\")\n                plt.title(f\"Macro Precision-Recall (Test) - {name}\\nNext-activity\")\n                plt.xlabel(\"Recall\")\n                plt.ylabel(\"Precision\")\n                plt.legend()\n                plt.tight_layout()\n                plt.savefig(os.path.join(working_dir, f\"{name}_macro_pr.png\"))\n                plt.close()\n    except Exception as e:\n        print(f\"Error creating PR curve for {name}: {e}\")\n        plt.close()\n    return name, exp\n\n\ndef main():\n    datasets = load_xes_folder(data_dir=os.path.join(os.getcwd(), \"data\"))\n    experiment_data = {}\n    for name, df in datasets.items():\n        # optional cap earliest 5000 cases\n        try:\n            starts = (\n                df.sort_values(\"timestamp\")\n                .groupby(\"case_id\")[\"timestamp\"]\n                .min()\n                .reset_index()\n            )\n            if len(starts) > 5000:\n                keep = set(starts.iloc[:5000][\"case_id\"])\n                df = df[df[\"case_id\"].isin(keep)].copy()\n        except:\n            pass\n        res = train_on_dataset(\n            name, df, max_epochs=10, batch_size=128, max_prefix_len=10, lr=1e-3\n        )\n        if res is not None:\n            k, exp = res\n            experiment_data[k] = exp\n    # Save experiment data\n    np.save(os.path.join(working_dir, \"experiment_data.npy\"), experiment_data)\n    # Print evaluation metrics\n    for k, v in experiment_data.items():\n        test_metrics = dict(v[\"metrics\"][\"test\"][0][1])\n        print(\n            f\"{k} | Test acc={test_metrics['acc']:.4f} macro_f1={test_metrics['macro_f1']:.4f} top3={test_metrics['top3']:.4f} loss={test_metrics['loss']:.4f}\"\n        )\n\n    # Secondary plotting pass strictly from experiment_data.npy (as required)\n    try:\n        experiment_data_loaded = np.load(\n            os.path.join(working_dir, \"experiment_data.npy\"), allow_pickle=True\n        ).item()\n    except Exception as e:\n        print(f\"Error loading experiment data: {e}\")\n        experiment_data_loaded = {}\n    for name, ed in experiment_data_loaded.items():\n        try:\n            # re-plot loss curves from saved data\n            plt.figure()\n            tl = [y for (_, y) in ed.get(\"losses\", {}).get(\"train\", [])]\n            vl = [y for (_, y) in ed.get(\"losses\", {}).get(\"val\", [])]\n            if len(tl) > 0:\n                plt.plot(tl, label=\"train\")\n            if len(vl) > 0:\n                plt.plot(vl, label=\"val\")\n            plt.legend()\n            plt.title(f\"Loss Curves - {name}\\nNext-activity\")\n            plt.xlabel(\"Epoch\")\n            plt.ylabel(\"Loss\")\n            plt.tight_layout()\n            plt.savefig(os.path.join(working_dir, f\"{name}_loss_curves_reload.png\"))\n            plt.close()\n        except Exception as e:\n            print(f\"Error creating plot1: {e}\")\n            plt.close()\n        try:\n            # confusion matrix from predictions and ground truth\n            y_true = ed.get(\"ground_truth\", [])\n            y_pred = ed.get(\"predictions\", [])\n            if len(y_true) > 0 and len(y_pred) > 0:\n                cm = confusion_matrix(y_true, y_pred)\n                plt.figure(figsize=(5, 4))\n                plt.imshow(cm, aspect=\"auto\", cmap=\"Blues\")\n                plt.colorbar()\n                plt.title(f\"Confusion Matrix (Test) - {name}\\nNext-activity\")\n                plt.xlabel(\"Predicted\")\n                plt.ylabel(\"True\")\n                plt.tight_layout()\n                plt.savefig(\n                    os.path.join(working_dir, f\"{name}_confusion_matrix_reload.png\")\n                )\n                plt.close()\n        except Exception as e:\n            print(f\"Error creating plot2: {e}\")\n            plt.close()\n        try:\n            # Top-3 vs prefix length if present\n            pref = ed.get(\"prefix_lens\", [])\n            flags = ed.get(\"top3_flags\", [])\n            if len(pref) > 0 and len(flags) > 0:\n                d = defaultdict(list)\n                for L, f in zip(pref, flags):\n                    d[int(L)].append(int(f))\n                xs = sorted(d.keys())\n                ys = [float(np.mean(d[x])) for x in xs]\n                plt.figure()\n                plt.plot(xs, ys, marker=\"o\")\n                plt.title(f\"Top-3 Accuracy vs Prefix Length - {name}\\nNext-activity\")\n                plt.xlabel(\"Prefix Length\")\n                plt.ylabel(\"Top-3 Accuracy\")\n                plt.tight_layout()\n                plt.savefig(\n                    os.path.join(working_dir, f\"{name}_top3_vs_prefixlen_reload.png\")\n                )\n                plt.close()\n        except Exception as e:\n            print(f\"Error creating plot3: {e}\")\n            plt.close()\n        try:\n            # Macro PR curve if probs available\n            probs = np.array(ed.get(\"probs\", []))\n            y_true = ed.get(\"ground_truth\", [])\n            if probs.size > 0 and len(y_true) > 0:\n                classes = sorted(set(y_true))\n                Y = label_binarize(np.array(y_true), classes=range(probs.shape[1]))\n                present = classes\n                if len(present) > 1:\n                    precisions = []\n                    aps = []\n                    grid = np.linspace(0, 1, 101)\n                    for c in present:\n                        p, r, _ = precision_recall_curve(Y[:, c], probs[:, c])\n                        precisions.append(np.interp(grid, r[::-1], p[::-1]))\n                        aps.append(average_precision_score(Y[:, c], probs[:, c]))\n                    macro_p = np.mean(np.stack(precisions, 0), 0)\n                    plt.figure()\n                    plt.plot(grid, macro_p, label=f\"mAP={np.mean(aps):.3f}\")\n                    plt.title(f\"Macro Precision-Recall (Test) - {name}\\nNext-activity\")\n                    plt.xlabel(\"Recall\")\n                    plt.ylabel(\"Precision\")\n                    plt.legend()\n                    plt.tight_layout()\n                    plt.savefig(\n                        os.path.join(working_dir, f\"{name}_macro_pr_reload.png\")\n                    )\n                    plt.close()\n        except Exception as e:\n            print(f\"Error creating plot4: {e}\")\n            plt.close()\n\n\nif __name__ == \"__main__\":\n    main()\n",
      "plot_analyses": [],
      "plot_paths": [],
      "vlm_feedback_summary": []
    }
  ],
  "aggregated results of nodes with different seeds": {
    "overall_plan": "Stabilize and validate the existing PPM pipeline and then strengthen evaluation through multi-seed aggregation. Specifically, we fixed a crash by converting per-case timestamps from pandas Series to numpy arrays for reliable positional indexing, and adjusted the prefix generation loop to stop at T-1 to ensure a next event exists, eliminating off-by-one errors. We intentionally kept the rest of the pipeline unchanged: time-based case-start splitting, prefix feature construction, a simple LSTM baseline, and computation/saving of the required BPM/PPM metrics. With the pipeline now stable, the current step is to run multiple random seeds and aggregate the resulting metrics to assess robustness and variability, providing more reliable performance summaries for the baseline under the same time-based split and feature setup.",
    "metric": {
      "value": null,
      "maximize": null,
      "name": null,
      "description": null
    },
    "code": "# plotting aggregation code",
    "plot_code": "import matplotlib.pyplot as plt\nimport numpy as np\nimport os\n\nworking_dir = os.path.join(os.getcwd(), \"working\")\nos.makedirs(working_dir, exist_ok=True)\n\nfrom collections import defaultdict\nfrom sklearn.metrics import confusion_matrix\n\n\ndef safe_load_experiment(path_str):\n    try:\n        # Try environment-rooted path\n        root = os.getenv(\"AI_SCIENTIST_ROOT\")\n        if root is not None and len(root) > 0:\n            p = os.path.join(root, path_str)\n            if os.path.isfile(p):\n                return np.load(p, allow_pickle=True).item()\n        # Fallback: treat as relative/absolute path\n        if os.path.isfile(path_str):\n            return np.load(path_str, allow_pickle=True).item()\n        # Also try under working dir if provided as None/experiment_data.npy pattern\n        p2 = os.path.join(working_dir, os.path.basename(path_str))\n        if os.path.isfile(p2):\n            return np.load(p2, allow_pickle=True).item()\n    except Exception as e:\n        print(f\"Error loading experiment data from {path_str}: {e}\")\n    return None\n\n\ndef sem(a, axis=0):\n    a = np.array(a, dtype=float)\n    if a.size == 0:\n        return np.array([])\n    # count non-nan along axis\n    n = np.sum(~np.isnan(a), axis=axis)\n    std = np.nanstd(a, axis=axis, ddof=1)\n    with np.errstate(invalid=\"ignore\", divide=\"ignore\"):\n        se = std / np.sqrt(np.maximum(n, 1))\n    return se\n\n\ndef downsample_xs(xs, max_points=5):\n    xs_sorted = np.array(sorted(xs))\n    if len(xs_sorted) <= max_points:\n        return xs_sorted.tolist()\n    # pick approx quantile positions\n    qs = np.linspace(0, 1, num=max_points)\n    idx = np.unique(\n        np.clip((qs * (len(xs_sorted) - 1)).round().astype(int), 0, len(xs_sorted) - 1)\n    )\n    return xs_sorted[idx].tolist()\n\n\ndef main():\n    # Collect all experiment data dicts\n    try:\n        experiment_data_path_list = [\n            \"None/experiment_data.npy\",\n            \"experiments/2025-09-13_11-32-42_resource_centric_ppm_agents_attempt_0/logs/0-run/experiment_results/experiment_57ccd5de0de34674be3985e9f94220ad_proc_361839/experiment_data.npy\",\n            \"None/experiment_data.npy\",\n        ]\n        all_experiment_data = []\n        for experiment_data_path in experiment_data_path_list:\n            data = safe_load_experiment(experiment_data_path)\n            if data is not None and isinstance(data, dict) and len(data) > 0:\n                all_experiment_data.append(data)\n            else:\n                print(\n                    f\"Skipped loading from {experiment_data_path} (missing or empty).\"\n                )\n        if len(all_experiment_data) == 0:\n            print(\"No experiment_data loaded; nothing to plot.\")\n            return\n    except Exception as e:\n        print(f\"Error loading experiment data: {e}\")\n        all_experiment_data = []\n        return\n\n    # Build dataset -> list of run dicts\n    datasets_runs = defaultdict(list)\n    for run_idx, exp_dict in enumerate(all_experiment_data):\n        for ds_name, ds_payload in exp_dict.items():\n            datasets_runs[ds_name].append(ds_payload)\n\n    # Aggregate and plot per dataset\n    for ds_name, runs in datasets_runs.items():\n        # Aggregate losses (train/val) by aligning epochs to min length\n        try:\n            # Collect lists of loss sequences\n            train_losses = []\n            val_losses = []\n            for r in runs:\n                tr = r.get(\"losses\", {}).get(\"train\", [])\n                vl = r.get(\"losses\", {}).get(\"val\", [])\n                # entries are list of (epoch, value) or just values; normalize to values\n                tr_vals = (\n                    [y for (_, y) in tr]\n                    if len(tr) > 0 and isinstance(tr[0], (list, tuple))\n                    else list(tr)\n                )\n                vl_vals = (\n                    [y for (_, y) in vl]\n                    if len(vl) > 0 and isinstance(vl[0], (list, tuple))\n                    else list(vl)\n                )\n                if len(tr_vals) > 0:\n                    train_losses.append(np.array(tr_vals, dtype=float))\n                if len(vl_vals) > 0:\n                    val_losses.append(np.array(vl_vals, dtype=float))\n            # Plot if we have at least one train or val\n            if len(train_losses) > 0 or len(val_losses) > 0:\n                plt.figure()\n                subtitle = \"Aggregated across runs | Next-activity\"\n                if len(train_losses) > 0:\n                    L = min([len(a) for a in train_losses])\n                    TL = np.stack([a[:L] for a in train_losses], axis=0)\n                    mean_tr = np.nanmean(TL, axis=0)\n                    sem_tr = sem(TL, axis=0)\n                    xs = np.arange(1, L + 1)\n                    plt.plot(xs, mean_tr, label=\"Train (Mean)\", color=\"tab:blue\")\n                    plt.fill_between(\n                        xs,\n                        mean_tr - sem_tr,\n                        mean_tr + sem_tr,\n                        color=\"tab:blue\",\n                        alpha=0.2,\n                        label=\"Train (SEM)\",\n                    )\n                if len(val_losses) > 0:\n                    L = min([len(a) for a in val_losses])\n                    VL = np.stack([a[:L] for a in val_losses], axis=0)\n                    mean_va = np.nanmean(VL, axis=0)\n                    sem_va = sem(VL, axis=0)\n                    xs = np.arange(1, L + 1)\n                    plt.plot(xs, mean_va, label=\"Val (Mean)\", color=\"tab:orange\")\n                    plt.fill_between(\n                        xs,\n                        mean_va - sem_va,\n                        mean_va + sem_va,\n                        color=\"tab:orange\",\n                        alpha=0.2,\n                        label=\"Val (SEM)\",\n                    )\n                plt.legend()\n                plt.xlabel(\"Epoch\")\n                plt.ylabel(\"Loss\")\n                plt.title(f\"{ds_name} - Loss Curves\\n{subtitle}\")\n                plt.tight_layout()\n                plt.savefig(\n                    os.path.join(working_dir, f\"{ds_name}_aggregated_loss_curves.png\")\n                )\n                plt.close()\n        except Exception as e:\n            print(f\"Error creating aggregated loss curves for {ds_name}: {e}\")\n            plt.close()\n\n        # Aggregate test metrics across runs and bar plot with error bars\n        try:\n            metrics_list = []\n            for r in runs:\n                test_entries = r.get(\"metrics\", {}).get(\"test\", [])\n                if isinstance(test_entries, list) and len(test_entries) > 0:\n                    # take first entry's dict\n                    m = (\n                        dict(test_entries[0][1])\n                        if isinstance(test_entries[0], (list, tuple))\n                        else dict(test_entries[0])\n                    )\n                    # Only keep numeric keys\n                    usable = {\n                        k: float(v)\n                        for k, v in m.items()\n                        if isinstance(v, (int, float, np.floating))\n                    }\n                    if len(usable) > 0:\n                        metrics_list.append(usable)\n            if len(metrics_list) > 0:\n                keys = [\"acc\", \"macro_f1\", \"top3\", \"loss\"]\n                vals = []\n                for k in keys:\n                    arr = [d.get(k, np.nan) for d in metrics_list]\n                    vals.append(arr)\n                means = [np.nanmean(a) for a in vals]\n                errors = [sem(a) for a in vals]\n                xs = np.arange(len(keys))\n                plt.figure()\n                plt.bar(\n                    xs,\n                    means,\n                    yerr=errors,\n                    capsize=4,\n                    color=[\"tab:green\", \"tab:purple\", \"tab:red\", \"tab:gray\"],\n                    alpha=0.8,\n                    label=\"Mean \u00b1 SEM\",\n                )\n                plt.xticks(xs, keys)\n                plt.ylabel(\"Value\")\n                plt.title(\n                    f\"{ds_name} - Aggregated Test Metrics\\nAggregated across runs | Next-activity\"\n                )\n                plt.legend()\n                plt.tight_layout()\n                plt.savefig(\n                    os.path.join(working_dir, f\"{ds_name}_aggregated_test_metrics.png\")\n                )\n                plt.close()\n                # Print metrics\n                print(\n                    f\"{ds_name} | Test metrics (mean \u00b1 SEM): \"\n                    + \", \".join(\n                        [f\"{k}={m:.4f}\u00b1{e:.4f}\" for k, m, e in zip(keys, means, errors)]\n                    )\n                )\n        except Exception as e:\n            print(f\"Error creating aggregated test metrics for {ds_name}: {e}\")\n            plt.close()\n\n        # Aggregate Top-3 vs Prefix Length across runs (mean \u00b1 SEM), downsample xs to at most 5\n        try:\n            per_run_maps = []\n            all_L = set()\n            for r in runs:\n                pref = r.get(\"prefix_lens\", [])\n                flags = r.get(\"top3_flags\", [])\n                if len(pref) > 0 and len(flags) > 0:\n                    d = defaultdict(list)\n                    for L, f in zip(pref, flags):\n                        d[int(L)].append(int(f))\n                    # per-run mean per L\n                    run_map = {L: float(np.mean(v)) for L, v in d.items() if len(v) > 0}\n                    if len(run_map) > 0:\n                        per_run_maps.append(run_map)\n                        all_L.update(run_map.keys())\n            if len(per_run_maps) > 0:\n                xs_all = sorted(list(all_L))\n                xs_plot = downsample_xs(xs_all, max_points=5)\n                # Build matrix R x X with NaNs for missing\n                R = len(per_run_maps)\n                X = len(xs_plot)\n                M = np.full((R, X), np.nan, dtype=float)\n                for i, run_map in enumerate(per_run_maps):\n                    for j, L in enumerate(xs_plot):\n                        if L in run_map:\n                            M[i, j] = run_map[L]\n                mean_y = np.nanmean(M, axis=0)\n                err_y = sem(M, axis=0)\n                plt.figure()\n                plt.errorbar(\n                    xs_plot, mean_y, yerr=err_y, fmt=\"-o\", capsize=3, label=\"Mean \u00b1 SEM\"\n                )\n                plt.ylim(0.0, 1.0)\n                plt.xlabel(\"Prefix Length\")\n                plt.ylabel(\"Top-3 Accuracy\")\n                plt.title(\n                    f\"{ds_name} - Top-3 Accuracy vs Prefix Length\\nAggregated across runs | Next-activity\"\n                )\n                plt.legend()\n                plt.tight_layout()\n                plt.savefig(\n                    os.path.join(\n                        working_dir, f\"{ds_name}_aggregated_top3_vs_prefixlen.png\"\n                    )\n                )\n                plt.close()\n        except Exception as e:\n            print(\n                f\"Error creating aggregated Top-3 vs Prefix Length for {ds_name}: {e}\"\n            )\n            plt.close()\n\n        # Aggregated confusion matrix (sum over runs)\n        try:\n            cm_sum = None\n            for r in runs:\n                y_true = r.get(\"ground_truth\", [])\n                y_pred = r.get(\"predictions\", [])\n                if len(y_true) > 0 and len(y_pred) > 0:\n                    # compute confusion for the labels present in this run\n                    labels = sorted(set(y_true) | set(y_pred))\n                    cm = confusion_matrix(y_true, y_pred, labels=labels)\n                    if cm_sum is None:\n                        cm_sum = cm\n                        cm_labels = labels\n                    else:\n                        # align labels\n                        all_labels = sorted(set(cm_labels) | set(labels))\n                        # expand cm_sum\n                        new_sum = np.zeros(\n                            (len(all_labels), len(all_labels)), dtype=int\n                        )\n                        # map old indices\n                        idx_old = {lab: i for i, lab in enumerate(cm_labels)}\n                        idx_new = {lab: i for i, lab in enumerate(all_labels)}\n                        for lab_i in cm_labels:\n                            for lab_j in cm_labels:\n                                new_sum[idx_new[lab_i], idx_new[lab_j]] += cm_sum[\n                                    idx_old[lab_i], idx_old[lab_j]\n                                ]\n                        # add current cm\n                        idx_cur = {lab: i for i, lab in enumerate(labels)}\n                        for lab_i in labels:\n                            for lab_j in labels:\n                                new_sum[idx_new[lab_i], idx_new[lab_j]] += cm[\n                                    idx_cur[lab_i], idx_cur[lab_j]\n                                ]\n                        cm_sum = new_sum\n                        cm_labels = all_labels\n            if cm_sum is not None:\n                plt.figure(figsize=(6, 5))\n                plt.imshow(cm_sum, aspect=\"auto\", cmap=\"Blues\")\n                plt.colorbar()\n                plt.title(\n                    f\"{ds_name} - Aggregated Confusion Matrix (Test)\\nAggregated across runs | Next-activity\"\n                )\n                plt.xlabel(\"Predicted\")\n                plt.ylabel(\"True\")\n                plt.tight_layout()\n                plt.savefig(\n                    os.path.join(\n                        working_dir, f\"{ds_name}_aggregated_confusion_matrix.png\"\n                    )\n                )\n                plt.close()\n        except Exception as e:\n            print(f\"Error creating aggregated confusion matrix for {ds_name}: {e}\")\n            plt.close()\n\n\nif __name__ == \"__main__\":\n    main()\n",
    "plot_analyses": [],
    "plot_paths": [],
    "vlm_feedback_summary": [],
    "exp_results_dir": "experiment_results/seed_aggregation_29473c6cb5124b67b0cb64d51dcc73c2",
    "exp_results_npy_files": []
  }
}