{
  "grader": {
    "type": "python",
    "name": "katago_pv_reward",
    "source": "\nimport json\nimport math\nimport re\n\nTOP_MOVE_SCORES = {\n    1: 1.00,\n    2: 0.85,\n    3: 0.70,\n    4: 0.50,\n    5: 0.50,\n}\n\nMOVE_RE = re.compile(r\"^(?:PASS|[A-HJ-T](?:[1-9]|1[0-9]))$\")\n\n\ndef _as_dict(sample):\n    if isinstance(sample, dict):\n        if isinstance(sample.get(\"output_json\"), dict):\n            return sample[\"output_json\"]\n        if isinstance(sample.get(\"json\"), dict):\n            return sample[\"json\"]\n        for key in (\"output_text\", \"text\", \"content\"):\n            value = sample.get(key)\n            if isinstance(value, str):\n                try:\n                    parsed = json.loads(value)\n                    if isinstance(parsed, dict):\n                        return parsed\n                except Exception:\n                    pass\n        return sample\n    if isinstance(sample, str):\n        try:\n            parsed = json.loads(sample)\n            if isinstance(parsed, dict):\n                return parsed\n        except Exception:\n            pass\n    return {}\n\n\ndef _valid_move(move):\n    return isinstance(move, str) and MOVE_RE.fullmatch(move.upper()) is not None\n\n\ndef _valid_pv(pv):\n    return isinstance(pv, list) and all(_valid_move(move) for move in pv)\n\n\ndef move_reward(pred_move, ref_top_moves):\n    try:\n        rank = ref_top_moves.index(pred_move) + 1\n        return TOP_MOVE_SCORES.get(rank, 0.0)\n    except ValueError:\n        return 0.0\n\n\ndef pv_reward(pred_pv, ref_pv, alpha=0.85, max_len=12):\n    T = min(len(pred_pv), len(ref_pv), max_len)\n    if T == 0:\n        return 0.0\n    num = 0.0\n    den = 0.0\n    for t in range(T):\n        w = alpha ** t\n        den += w\n        if pred_pv[t] == ref_pv[t]:\n            num += w\n        else:\n            break\n    return num / den if den > 0 else 0.0\n\n\ndef bounded_linear_reward(error, tol):\n    return max(0.0, 1.0 - error / tol)\n\n\ndef grade(sample, item):\n    \"\"\"\n    sample: model output dict or wrapper containing output_json/output_text\n    item: reference dict with precomputed KataGo values\n    \"\"\"\n    sample = _as_dict(sample)\n    try:\n        explanation = str(sample[\"explanation\"])\n        pred_move = str(sample[\"best_move\"]).upper()\n        pred_pv = [str(move).upper() for move in sample[\"pv_top1\"]]\n        pred_wr = float(sample[\"winrate_black\"])\n        pred_score = float(sample[\"score_lead_black\"])\n    except Exception:\n        return 0.0\n\n    ref_top_moves = [str(move).upper() for move in item[\"reference\"][\"top_moves\"]]\n    ref_pv = [str(move).upper() for move in item[\"reference\"][\"pv_top1\"]]\n    ref_wr = float(item[\"reference\"][\"winrate_black\"])\n    ref_score = float(item[\"reference\"][\"score_lead_black\"])\n\n    if not _valid_move(pred_move) or not _valid_pv(pred_pv):\n        return 0.0\n\n    explanation_words = len(re.findall(r\"\\S+\", explanation))\n    if explanation_words == 0:\n        return 0.0\n    r_format = 1.0 if explanation_words <= 150 else 0.5\n\n    r_move = move_reward(pred_move, ref_top_moves)\n    r_pv = pv_reward(pred_pv, ref_pv)\n    r_wr = bounded_linear_reward(abs(pred_wr - ref_wr), tol=15.0)\n    r_score = bounded_linear_reward(abs(pred_score - ref_score), tol=10.0)\n\n    reward = (\n        0.35 * r_move +\n        0.30 * r_pv +\n        0.15 * r_wr +\n        0.15 * r_score +\n        0.05 * r_format\n    )\n    return float(max(0.0, min(1.0, reward)))\n"
  },
  "response_format": {
    "type": "json_schema",
    "json_schema": {
      "name": "katago_pv_prediction",
      "strict": true,
      "schema": {
        "type": "object",
        "properties": {
          "best_move": {
            "type": "string",
            "description": "Predicted best move in GTP coordinate form, e.g. R15, or PASS."
          },
          "explanation": {
            "type": "string",
            "description": "Brief explanation, max 150 words, of key position features and why the best move is correct."
          },
          "pv_top1": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "minItems": 1,
            "maxItems": 12,
            "description": "Predicted principal variation for the best move, up to 12 plies."
          },
          "winrate_black": {
            "type": "number",
            "description": "Black winrate as a percentage from 0 to 100."
          },
          "score_lead_black": {
            "type": "number",
            "description": "KataGo score lead from Black's perspective, in points."
          }
        },
        "required": [
          "explanation",
          "best_move",
          "pv_top1",
          "winrate_black",
          "score_lead_black"
        ],
        "additionalProperties": false
      }
    }
  },
  "model": "o4-mini-2025-04-16",
  "train_file_path": "openairft/rft_katago/test_job/katago_rft_train_8.jsonl",
  "validation_file_path": "openairft/rft_katago/test_job/katago_rft_validation_2.jsonl",
  "uploaded_training_file": {
    "object": "file",
    "id": "file-W6cR5YH46WBN9XhsST5J1G",
    "purpose": "fine-tune",
    "filename": "katago_rft_train_8.jsonl",
    "bytes": 14333,
    "created_at": 1777909952,
    "expires_at": null,
    "status": "processed",
    "status_details": null
  },
  "uploaded_validation_file": {
    "object": "file",
    "id": "file-6WuybxDHR54qHfmgzbtgKm",
    "purpose": "fine-tune",
    "filename": "katago_rft_validation_2.jsonl",
    "bytes": 3531,
    "created_at": 1777909952,
    "expires_at": null,
    "status": "processed",
    "status_details": null
  },
  "job_request": {
    "training_file": "file-W6cR5YH46WBN9XhsST5J1G",
    "validation_file": "file-6WuybxDHR54qHfmgzbtgKm",
    "model": "o4-mini-2025-04-16",
    "suffix": "katago-pv-rft-test",
    "method": {
      "type": "reinforcement",
      "reinforcement": {
        "grader": {
          "type": "python",
          "name": "katago_pv_reward",
          "source": "\nimport json\nimport math\nimport re\n\nTOP_MOVE_SCORES = {\n    1: 1.00,\n    2: 0.85,\n    3: 0.70,\n    4: 0.50,\n    5: 0.50,\n}\n\nMOVE_RE = re.compile(r\"^(?:PASS|[A-HJ-T](?:[1-9]|1[0-9]))$\")\n\n\ndef _as_dict(sample):\n    if isinstance(sample, dict):\n        if isinstance(sample.get(\"output_json\"), dict):\n            return sample[\"output_json\"]\n        if isinstance(sample.get(\"json\"), dict):\n            return sample[\"json\"]\n        for key in (\"output_text\", \"text\", \"content\"):\n            value = sample.get(key)\n            if isinstance(value, str):\n                try:\n                    parsed = json.loads(value)\n                    if isinstance(parsed, dict):\n                        return parsed\n                except Exception:\n                    pass\n        return sample\n    if isinstance(sample, str):\n        try:\n            parsed = json.loads(sample)\n            if isinstance(parsed, dict):\n                return parsed\n        except Exception:\n            pass\n    return {}\n\n\ndef _valid_move(move):\n    return isinstance(move, str) and MOVE_RE.fullmatch(move.upper()) is not None\n\n\ndef _valid_pv(pv):\n    return isinstance(pv, list) and all(_valid_move(move) for move in pv)\n\n\ndef move_reward(pred_move, ref_top_moves):\n    try:\n        rank = ref_top_moves.index(pred_move) + 1\n        return TOP_MOVE_SCORES.get(rank, 0.0)\n    except ValueError:\n        return 0.0\n\n\ndef pv_reward(pred_pv, ref_pv, alpha=0.85, max_len=12):\n    T = min(len(pred_pv), len(ref_pv), max_len)\n    if T == 0:\n        return 0.0\n    num = 0.0\n    den = 0.0\n    for t in range(T):\n        w = alpha ** t\n        den += w\n        if pred_pv[t] == ref_pv[t]:\n            num += w\n        else:\n            break\n    return num / den if den > 0 else 0.0\n\n\ndef bounded_linear_reward(error, tol):\n    return max(0.0, 1.0 - error / tol)\n\n\ndef grade(sample, item):\n    \"\"\"\n    sample: model output dict or wrapper containing output_json/output_text\n    item: reference dict with precomputed KataGo values\n    \"\"\"\n    sample = _as_dict(sample)\n    try:\n        explanation = str(sample[\"explanation\"])\n        pred_move = str(sample[\"best_move\"]).upper()\n        pred_pv = [str(move).upper() for move in sample[\"pv_top1\"]]\n        pred_wr = float(sample[\"winrate_black\"])\n        pred_score = float(sample[\"score_lead_black\"])\n    except Exception:\n        return 0.0\n\n    ref_top_moves = [str(move).upper() for move in item[\"reference\"][\"top_moves\"]]\n    ref_pv = [str(move).upper() for move in item[\"reference\"][\"pv_top1\"]]\n    ref_wr = float(item[\"reference\"][\"winrate_black\"])\n    ref_score = float(item[\"reference\"][\"score_lead_black\"])\n\n    if not _valid_move(pred_move) or not _valid_pv(pred_pv):\n        return 0.0\n\n    explanation_words = len(re.findall(r\"\\S+\", explanation))\n    if explanation_words == 0:\n        return 0.0\n    r_format = 1.0 if explanation_words <= 150 else 0.5\n\n    r_move = move_reward(pred_move, ref_top_moves)\n    r_pv = pv_reward(pred_pv, ref_pv)\n    r_wr = bounded_linear_reward(abs(pred_wr - ref_wr), tol=15.0)\n    r_score = bounded_linear_reward(abs(pred_score - ref_score), tol=10.0)\n\n    reward = (\n        0.35 * r_move +\n        0.30 * r_pv +\n        0.15 * r_wr +\n        0.15 * r_score +\n        0.05 * r_format\n    )\n    return float(max(0.0, min(1.0, reward)))\n"
        },
        "response_format": {
          "type": "json_schema",
          "json_schema": {
            "name": "katago_pv_prediction",
            "strict": true,
            "schema": {
              "type": "object",
              "properties": {
                "best_move": {
                  "type": "string",
                  "description": "Predicted best move in GTP coordinate form, e.g. R15, or PASS."
                },
                "explanation": {
                  "type": "string",
                  "description": "Brief explanation, max 150 words, of key position features and why the best move is correct."
                },
                "pv_top1": {
                  "type": "array",
                  "items": {
                    "type": "string"
                  },
                  "minItems": 1,
                  "maxItems": 12,
                  "description": "Predicted principal variation for the best move, up to 12 plies."
                },
                "winrate_black": {
                  "type": "number",
                  "description": "Black winrate as a percentage from 0 to 100."
                },
                "score_lead_black": {
                  "type": "number",
                  "description": "KataGo score lead from Black's perspective, in points."
                }
              },
              "required": [
                "explanation",
                "best_move",
                "pv_top1",
                "winrate_black",
                "score_lead_black"
              ],
              "additionalProperties": false
            }
          }
        }
      }
    }
  },
  "job": {
    "object": "fine_tuning.job",
    "id": "ftjob-07Pk4LJb0W0F0rRlbCZazt6t",
    "model": "o4-mini-2025-04-16",
    "created_at": 1777909956,
    "finished_at": null,
    "fine_tuned_model": null,
    "organization_id": "org-5jkcIKQaT9aUC17z8rJqyZnB",
    "result_files": [],
    "status": "validating_files",
    "validation_file": "file-6WuybxDHR54qHfmgzbtgKm",
    "training_file": "file-W6cR5YH46WBN9XhsST5J1G",
    "trained_tokens": null,
    "error": {},
    "user_provided_suffix": "katago-pv-rft-test",
    "seed": 834852430,
    "estimated_finish": null,
    "integrations": [],
    "method": {
      "type": "reinforcement",
      "reinforcement": {
        "hyperparameters": {
          "batch_size": "auto",
          "learning_rate_multiplier": "auto",
          "n_epochs": "auto",
          "eval_interval": "auto",
          "eval_samples": "auto",
          "compute_multiplier": "auto",
          "reasoning_effort": "default"
        },
        "grader": {
          "type": "python",
          "name": "katago_pv_reward",
          "source": "\nimport json\nimport math\nimport re\n\nTOP_MOVE_SCORES = {\n    1: 1.00,\n    2: 0.85,\n    3: 0.70,\n    4: 0.50,\n    5: 0.50,\n}\n\nMOVE_RE = re.compile(r\"^(?:PASS|[A-HJ-T](?:[1-9]|1[0-9]))$\")\n\n\ndef _as_dict(sample):\n    if isinstance(sample, dict):\n        if isinstance(sample.get(\"output_json\"), dict):\n            return sample[\"output_json\"]\n        if isinstance(sample.get(\"json\"), dict):\n            return sample[\"json\"]\n        for key in (\"output_text\", \"text\", \"content\"):\n            value = sample.get(key)\n            if isinstance(value, str):\n                try:\n                    parsed = json.loads(value)\n                    if isinstance(parsed, dict):\n                        return parsed\n                except Exception:\n                    pass\n        return sample\n    if isinstance(sample, str):\n        try:\n            parsed = json.loads(sample)\n            if isinstance(parsed, dict):\n                return parsed\n        except Exception:\n            pass\n    return {}\n\n\ndef _valid_move(move):\n    return isinstance(move, str) and MOVE_RE.fullmatch(move.upper()) is not None\n\n\ndef _valid_pv(pv):\n    return isinstance(pv, list) and all(_valid_move(move) for move in pv)\n\n\ndef move_reward(pred_move, ref_top_moves):\n    try:\n        rank = ref_top_moves.index(pred_move) + 1\n        return TOP_MOVE_SCORES.get(rank, 0.0)\n    except ValueError:\n        return 0.0\n\n\ndef pv_reward(pred_pv, ref_pv, alpha=0.85, max_len=12):\n    T = min(len(pred_pv), len(ref_pv), max_len)\n    if T == 0:\n        return 0.0\n    num = 0.0\n    den = 0.0\n    for t in range(T):\n        w = alpha ** t\n        den += w\n        if pred_pv[t] == ref_pv[t]:\n            num += w\n        else:\n            break\n    return num / den if den > 0 else 0.0\n\n\ndef bounded_linear_reward(error, tol):\n    return max(0.0, 1.0 - error / tol)\n\n\ndef grade(sample, item):\n    \"\"\"\n    sample: model output dict or wrapper containing output_json/output_text\n    item: reference dict with precomputed KataGo values\n    \"\"\"\n    sample = _as_dict(sample)\n    try:\n        explanation = str(sample[\"explanation\"])\n        pred_move = str(sample[\"best_move\"]).upper()\n        pred_pv = [str(move).upper() for move in sample[\"pv_top1\"]]\n        pred_wr = float(sample[\"winrate_black\"])\n        pred_score = float(sample[\"score_lead_black\"])\n    except Exception:\n        return 0.0\n\n    ref_top_moves = [str(move).upper() for move in item[\"reference\"][\"top_moves\"]]\n    ref_pv = [str(move).upper() for move in item[\"reference\"][\"pv_top1\"]]\n    ref_wr = float(item[\"reference\"][\"winrate_black\"])\n    ref_score = float(item[\"reference\"][\"score_lead_black\"])\n\n    if not _valid_move(pred_move) or not _valid_pv(pred_pv):\n        return 0.0\n\n    explanation_words = len(re.findall(r\"\\S+\", explanation))\n    if explanation_words == 0:\n        return 0.0\n    r_format = 1.0 if explanation_words <= 150 else 0.5\n\n    r_move = move_reward(pred_move, ref_top_moves)\n    r_pv = pv_reward(pred_pv, ref_pv)\n    r_wr = bounded_linear_reward(abs(pred_wr - ref_wr), tol=15.0)\n    r_score = bounded_linear_reward(abs(pred_score - ref_score), tol=10.0)\n\n    reward = (\n        0.35 * r_move +\n        0.30 * r_pv +\n        0.15 * r_wr +\n        0.15 * r_score +\n        0.05 * r_format\n    )\n    return float(max(0.0, min(1.0, reward)))\n"
        },
        "response_format": {
          "type": "json_schema",
          "json_schema": {
            "name": "katago_pv_prediction",
            "strict": true,
            "schema": {
              "type": "object",
              "properties": {
                "best_move": {
                  "type": "string",
                  "description": "Predicted best move in GTP coordinate form, e.g. R15, or PASS."
                },
                "explanation": {
                  "type": "string",
                  "description": "Brief explanation, max 150 words, of key position features and why the best move is correct."
                },
                "pv_top1": {
                  "type": "array",
                  "items": {
                    "type": "string"
                  },
                  "minItems": 1,
                  "maxItems": 12,
                  "description": "Predicted principal variation for the best move, up to 12 plies."
                },
                "winrate_black": {
                  "type": "number",
                  "description": "Black winrate as a percentage from 0 to 100."
                },
                "score_lead_black": {
                  "type": "number",
                  "description": "KataGo score lead from Black's perspective, in points."
                }
              },
              "required": [
                "explanation",
                "best_move",
                "pv_top1",
                "winrate_black",
                "score_lead_black"
              ],
              "additionalProperties": false
            }
          }
        }
      }
    },
    "metadata": null,
    "usage_metrics": null,
    "shared_with_openai": false,
    "eval_id": null,
    "internal_worker_backend": null,
    "internal_peashooter_execution": null,
    "train_experiment_id": null,
    "eval_experiment_id": null
  }
}