import os

# --- Dataset Metadata ---
# Registry of the scale ('size') and target engine ('engine') of each
# dataset, keyed by dataset name. The rows below are (name, size, engine)
# triples that are expanded into one {'size': ..., 'engine': ...} dict each.
DATASET_REGISTRY = {
    dataset: {'size': size, 'engine': engine}
    for dataset, size, engine in (
        ('Corpus', 13597, 'Python 3.8.10'),
        ('PyPI', 118099, 'Python 3.8.10'),
        ('Maven', 136643, 'Java 1.8.0_432'),
        ('NuGet', 49366, '.NET 7.0'),
        ('AST', 1000, 'Test Environment'),
    )
}

# --- Node Type Mapping ---
# Maps string operator names to the integer IDs used for attribute encoding.
# IDs are spelled out explicitly so that inserting or reordering an entry
# never silently renumbers an existing node type.
NODE_TYPE_MAP = {
    "ROOT": 0,
    "EPSILON": 1,
    "CHARCLASS": 2,
    "CONCAT": 3,
    "UNION": 4,
    # Counting operators
    "STAR": 5,
    "PLUS": 6,
    "OPT": 7,
    "REPEAT": 8,
    # Grouping
    "CAPTURELEFT": 9,
    "CAPTURERIGHT": 10,
    # Lookarounds
    "PLOOKAHEAD": 11,
    "NLOOKAHEAD": 12,
    "PLOOKBEHIND": 13,
    "NLOOKBEHIND": 14,
    # Extended
    "BACKREFERENCE": 15,
}
NUM_NODE_TYPES = len(NODE_TYPE_MAP)

# --- Motif Logic Definitions ---
# Operator sets used to identify motif centers and contexts.
# Each set is derived from NODE_TYPE_MAP by operator name rather than by
# hard-coded integer, so the sets cannot drift out of sync with the mapping
# and a misspelled or removed operator fails loudly with a KeyError at import.

# Counting operators
CNT_OPS = {NODE_TYPE_MAP[op] for op in ("STAR", "PLUS", "OPT", "REPEAT")}

# Union/Choice operators.
# NOTE(review): CHARCLASS is included alongside UNION, presumably because a
# character class is an implicit choice between characters -- confirm.
UNI_OPS = {NODE_TYPE_MAP[op] for op in ("UNION", "CHARCLASS")}

# Extended features (Lookaround/Backref)
EXT_OPS = {
    NODE_TYPE_MAP[op]
    for op in (
        "PLOOKAHEAD",
        "NLOOKAHEAD",
        "PLOOKBEHIND",
        "NLOOKBEHIND",
        "BACKREFERENCE",
    )
}

MOTIF_VOCAB_SIZE = 1000      # Size for Feature Hashing