# Dataset, preprocessing, evaluation, and path configuration for OFMU experiments

datasets:

  # TOFU: one entry per forget scenario plus a shared list of eval files.
  tofu:
    base_repo: 'locuslab/TOFU'
    # Each scenario names the repo and file to load. `forget_ratio` is
    # presumably the fraction of the data in the forget split -- confirm
    # against the loader.
    scenarios:
      forget01:
        repo_id: 'locuslab/TOFU'
        filename: 'forget01.json'
        forget_ratio: 0.01
      forget05:
        repo_id: 'locuslab/TOFU'
        filename: 'forget05.json'
        forget_ratio: 0.05
      forget10:
        repo_id: 'locuslab/TOFU'
        filename: 'forget10.json'
        forget_ratio: 0.10
    eval_data:
      repo_id: 'locuslab/TOFU'
      # NOTE(review): confirm these filenames exist in the repo above;
      # they are not verifiable from this config alone.
      eval_files:
        - 'eval_full.json'
        - 'eval_forget.json'
        - 'eval_retain.json'
        - 'eval_real_authors.json'
        - 'eval_world_facts.json'

  # WMDP: question files per domain, plus corpora from a separate repo.
  wmdp:
    base_repo: 'cais/wmdp'
    # NOTE(review): confirm the `.jsonl` filenames below match what the
    # `cais/wmdp` repo actually publishes.
    domains:
      bio:
        repo_id: 'cais/wmdp'
        filename: 'wmdp-bio.jsonl'
        description: 'Biosecurity questions'
      cyber:
        repo_id: 'cais/wmdp'
        filename: 'wmdp-cyber.jsonl'
        description: 'Cybersecurity questions'
      chem:
        repo_id: 'cais/wmdp'
        filename: 'wmdp-chem.jsonl'
        description: 'Chemical security questions'
    # Corpora come from `cais/wmdp-corpora`, not from `base_repo`.
    # NOTE(review): confirm each corpus file is available in that repo.
    corpora:
      bio_corpora:
        repo_id: 'cais/wmdp-corpora'
        filename: 'bio-forget-corpus.jsonl'
      cyber_corpora:
        repo_id: 'cais/wmdp-corpora'
        filename: 'cyber-forget-corpus.jsonl'
      chem_corpora:
        repo_id: 'cais/wmdp-corpora'
        filename: 'chem-forget-corpus.jsonl'

  # Vision datasets.
  # `image_size` is presumably [height, width, channels] -- confirm with
  # the consumer before relying on the axis order.
  cifar10:
    source: 'torchvision'
    dataset_name: 'CIFAR10'
    num_classes: 10
    train_size: 50000
    test_size: 10000
    image_size: [32, 32, 3]
    auto_download: true

  cifar100:
    source: 'torchvision'
    dataset_name: 'CIFAR100'
    num_classes: 100
    train_size: 50000
    test_size: 10000
    image_size: [32, 32, 3]
    auto_download: true

  # Unlike the CIFAR entries, this one declares `val_size` rather than
  # `test_size` and is sourced from Hugging Face.
  imagenet:
    source: 'huggingface'
    repo_id: 'imagenet-1k'
    num_classes: 1000
    train_size: 1281167
    val_size: 50000
    image_size: [224, 224, 3]
    streaming: true  # Stream instead of downloading the large dataset

# Data preprocessing configurations
preprocessing:
  # Text settings. The key names look like tokenizer call arguments;
  # presumably they are forwarded as-is -- confirm against the consumer.
  text:
    max_length: 2048
    truncation: true
    padding: 'max_length'
    return_tensors: 'pt'
    add_special_tokens: true

  # Image settings.
  vision:
    # Per-channel mean/std; these values match the widely used ImageNet
    # channel statistics.
    normalize:
      mean: [0.485, 0.456, 0.406]
      std: [0.229, 0.224, 0.225]
    # NOTE(review): the CIFAR dataset entries declare 32x32 images;
    # confirm the 224x224 resize/crop below is intended for them too.
    resize: [224, 224]
    random_crop: [224, 224]
    random_horizontal_flip: 0.5
    to_tensor: true

# Evaluation configurations
evaluation:
  # Metric names grouped by benchmark key.
  metrics:
    tofu:
      - 'forget_quality'
      - 'model_utility'
      - 'truth_ratio'
      - 'rouge_score'
      - 'bleu_score'
    # NOTE(review): no `muse` entry is visible under `datasets`; confirm
    # where that benchmark's data source is configured.
    muse:
      - 'forget_quality'
      - 'neighbor_acc'
      - 'truthfulness'
      - 'factuality'
      - 'fluency'
    wmdp:
      - 'qa_accuracy'
      - 'multiple_choice_accuracy'
      - 'safety_score'
      - 'retain_performance'
    vision:
      - 'unlearn_accuracy'
      - 'retain_accuracy'
      - 'test_accuracy'
      - 'mia_auc'

  # Batch sizes keyed by evaluation phase.
  batch_sizes:
    evaluation: 16
    generation: 8
    mia_evaluation: 32

# Data paths
paths:
  # All locations are relative paths (leading `./`).
  data_root: './data'
  cache_dir: './cache'
  results_dir: './results'
  logs_dir: './logs'
  models_dir: './models'