# Agent names shared through the `non_frontier_agents` anchor. Aliased further
# down as `exclude_agents` by several plot_logistic_regression figures and by
# individual trendlines.
non_frontier_agents: &non_frontier_agents
  - "GPT-4 0125"
  - "Claude 3 Opus"
  - "GPT-4 Turbo"
  - "o4-mini"

figs:
  wrangle_logistic:
    # Base settings, anchored as `wrangle_headline`. The other entries in this
    # section (and plot_logistic_regression.headline) merge them in with `<<`
    # and override individual keys.
    headline: &wrangle_headline
      runs_file: &runs_file "data/interim/filtered_runs.jsonl"
      weighting: "invsqrt_task_weight"
      categories: "ftr"
      regularization: 0.1
      exclude: []
      success_percents:
        - 50
        - 80
      confidence_level: 0.95

    # headline settings plus an extra score_col key.
    partial_scoring:
      <<: *wrangle_headline
      score_col: "score_cont"

    # headline settings with "SWAA" excluded and only the 50% level.
    ga_rebench:
      <<: *wrangle_headline
      exclude:
        - "SWAA"
      success_percents:
        - 50

    # headline settings pointed at a different runs file (anchored as
    # `swe_bench_runs_file`), 50% level only.
    swe_bench:
      <<: *wrangle_headline
      runs_file: &swe_bench_runs_file "data/external/swe_bench_runs.jsonl"
      success_percents:
        - 50

  plot_logistic_regression:
    # Base plot configuration. Pulls in every key of wrangle_logistic.headline
    # through the merge key, and is itself anchored as `plot_headline` so that
    # other figures can merge it and override individual keys.
    headline: &plot_headline
      <<: *wrangle_headline
      # A single exponential trendline; caption, styling and data_file are
      # explicitly null.
      trendlines:
        - fit_type: exponential
          caption: null
          after_date: "2019-01-01"
          color: blue
          line_start_date: "2018-09-03"
          line_end_date: "2027-01-01"
          display_r_squared: true
          styling: null
          data_file: null
          skip_annotation: false
      # Note: plain `none` parses as the string "none", not as YAML null.
      include_task_distribution: none
      individual_labels: true
      x_lim_start: "2018-09-03"
      x_lim_end: "2027-01-01"
      lower_y_lim: 0.0083333 # 0.5 seconds
      upper_y_lim: 360
      # NOTE(review): presumably applied on top of the settings above when the
      # plot is drawn with a linear y-axis — confirm in the plotting script.
      linear_overrides:
        show_minor_xticks: false
        lower_y_lim: 0
        plot_style_overrides:
          scatter_styling:
            error_bar:
              alpha: 0.5
              capsize: 0
        hide_regression_info: true
      exclude_agents: *non_frontier_agents
      title: "Length of tasks AI agents have been able to complete autonomously"
      subtitle: "for 169 software engineering, cybersecurity, general reasoning, and ML tasks"
      # Keys are agent names as they appear elsewhere in this file; values are
      # presumably the text shown in the legend — confirm against the consumer.
      rename_legend_labels:
        "davinci-002 (GPT-3)": "GPT 3"
        "gpt-3.5-turbo-instruct": "GPT 3.5"
        "GPT-4o": "GPT 4o"
        "Claude 3.7 Sonnet": "Sonnet 3.7"
        "Claude 3.5 Sonnet (New)": "Sonnet 3.6"
        "Claude 3.5 Sonnet (Old)": "Sonnet 3.5"
        "Claude 3 Opus": "Claude 3"
        "o1": "o1"
        "o1-preview": "o1 preview"
        "GPT-4 Turbo": "GPT 4 Turbo"
        "GPT-4 1106": "GPT 4 Nov '23"
        "GPT-4 0314": "GPT 4"
        "GPT-2": "GPT 2"
      show_y_label: true
    # Variant of headline with its own title, font sizes and per-agent styling.
    # `<<` is a shallow merge: the linear_overrides mapping defined below
    # replaces the inherited one wholesale instead of being combined with it.
    twitter_headline:
      <<: *plot_headline
      title: "The length of tasks AI can do is doubling every 7 months"
      subtitle: ""
      title_location: "left"
      legend_fontsize: 14
      ax_label_fontsize: 18
      title_fontsize: 20
      xlabel: "Model release date"
      ylabel: "Task length (at 50% success rate)"
      show_grid: true
      y_ticks_skip: 2
      hide_regression_info: true
      annotation_fontsize: 20
      legend_frameon: false
      xticks_skip: 2
      show_watermark: true
      show_example_tasks: true
      show_minor_xticks: true
      # Unlike headline's linear_overrides, this mapping carries no
      # hide_regression_info key; that flag is set at the top level of this
      # figure instead.
      linear_overrides:
        show_minor_xticks: false
        hide_error_bars: false
        hide_trendline: true
        show_example_tasks: false
        lower_y_lim: 0
        y_ticks_skip: 1
        plot_style_overrides:
          scatter_styling:
            error_bar:
              alpha: 0.5
              capsize: 0
      plot_style_overrides:
        scatter_styling:
          scatter:
            s: 75
        # Every named model shares the `pink` lab_color anchor and the "o"
        # marker; only "human" and "default" differ, and they additionally set
        # unique_color.
        agent_styling:
          "Claude 3.7 Sonnet":
            lab_color: &pink "#c86592"
            marker: "o"
          "Claude 3.5 Sonnet (New)":
            lab_color: *pink
            marker: "o"
          "Claude 3.5 Sonnet (Old)":
            lab_color: *pink
            marker: "o"
          "Claude 3 Opus":
            lab_color: *pink
            marker: "o"
          "o3":
            lab_color: *pink
            marker: "o"
          "o4-mini":
            lab_color: *pink
            marker: "o"
          "o1":
            lab_color: *pink
            marker: "o"
          "o1-preview":
            lab_color: *pink
            marker: "o"
          "GPT-4o":
            lab_color: *pink
            marker: "o"
          "GPT-4 Turbo":
            lab_color: *pink
            marker: "o"
          "GPT-4 0125":
            lab_color: *pink
            marker: "o"
          "GPT-4 1106":
            lab_color: *pink
            marker: "o"
          "GPT-4 0314":
            lab_color: *pink
            marker: "o"
          "gpt-3.5-turbo-instruct":
            lab_color: *pink
            marker: "o"
          "davinci-002 (GPT-3)":
            lab_color: *pink
            marker: "o"
          "GPT-2":
            lab_color: *pink
            marker: "o"
          "human":
            lab_color: "grey"
            marker: "o"
            unique_color: "#858585"
          "default":
            lab_color: "black"
            marker: "o"
            unique_color: "black"
      # Same value as the one inherited from headline.
      individual_labels: true
      # Overrides headline's show_y_label: true.
      show_y_label: false
    # headline variant with success_percent 80 and its own pair of exponential
    # trendlines, both starting after 2020-01-01.
    p80:
      <<: *plot_headline
      logistic_file: "headline"
      subtitle: "80% success rate"
      success_percent: 80
      trendlines:
        # Blue trendline, annotated.
        - fit_type: "exponential"
          caption: null
          after_date: "2020-01-01"
          color: "blue"
          line_start_date: "2019-09-01"
          line_end_date: "2027-01-01"
          display_r_squared: true
          styling: null
          data_file: null
          skip_annotation: false
        # Grey dashed trendline with its own success_percent of 50; annotation
        # is skipped.
        - fit_type: "exponential"
          caption: null
          after_date: "2020-01-01"
          color: "grey"
          line_start_date: "2019-09-01"
          line_end_date: "2027-01-01"
          display_r_squared: true
          success_percent: 50
          data_file: null
          skip_annotation: true
          styling:
            linestyle: "dashed"
      exclude_agents: *non_frontier_agents

    # Standalone figure (no merge key): every setting it uses is spelled out
    # here. Uses the ga_rebench fit and excludes "SWAA".
    single_line_2023_ga_rebench:
      runs_file: *runs_file
      logistic_file: ga_rebench
      # One dashed, semi-transparent exponential trendline from 2023-01-01.
      trendlines:
        - fit_type: exponential
          skip_annotation: false
          caption: null
          after_date: "2023-01-01"
          color: blue
          line_start_date: "2023-01-01"
          line_end_date: "2025-05-01"
          display_r_squared: true
          data_file: null
          styling:
            linewidth: 2
            alpha: 0.5
            linestyle: dashed
      # Note: plain `none` parses as the string "none", not as YAML null.
      include_task_distribution: none
      weighting: invsqrt_task_weight
      x_lim_start: "2022-12-01"
      x_lim_end: "2025-05-01"
      lower_y_lim: 1
      upper_y_lim: 120
      exclude:
        - SWAA
      # The four agents of the non_frontier_agents anchor plus
      # gpt-3.5-turbo-instruct. A YAML alias cannot be extended with extra
      # items, so the list is written out in full; keep it in sync by hand.
      exclude_agents:
        - GPT-4 0125
        - Claude 3 Opus
        - GPT-4 Turbo
        - o4-mini
        - gpt-3.5-turbo-instruct
      # legend_fontsize: 8
      show_y_label: true

    # Standalone figure (no merge key) with two exponential trendlines drawn
    # over the same date span: one using the figure's own data, one reading the
    # ga_rebench fit CSV.
    double_line_all_data_retrodict_excluding_swaa:
      runs_file: *runs_file
      logistic_file: "headline"
      weighting: "invsqrt_task_weight"
      include_task_distribution: "none"
      title: "50% Time Horizon Retrodicted from 2023-2025 Data"
      trendlines:
        # Black line: no data_file, models after 2019-01-01, with its own
        # exclude_agents alias.
        - fit_type: "exponential"
          skip_annotation: false
          caption: "Fit on all data"
          after_date: "2019-01-01"
          color: "black"
          line_start_date: "2018-09-03"
          line_end_date: "2025-07-01"
          display_r_squared: false
          data_file: null
          styling:
            linewidth: 2
            alpha: 0.6
            linestyle: "solid"
          exclude_agents: *non_frontier_agents
        # Blue line: ga_rebench fit CSV, models after 2023-01-01.
        - fit_type: "exponential"
          skip_annotation: false
          caption: "Fit on non-SWAA tasks, 2023-2025 models"
          after_date: "2023-01-01"
          color: "blue"
          line_start_date: "2018-09-03"
          line_end_date: "2025-07-01"
          display_r_squared: false
          data_file: "data/wrangled/logistic_fits/ga_rebench.csv"
          styling:
            linewidth: 2
            alpha: 0.6
            linestyle: "solid"
      exclude: []
      exclude_agents: *non_frontier_agents
      lower_y_lim: 0.0083333  # 0.5 seconds
      upper_y_lim: 240
      x_lim_start: "2018-09-03"
      x_lim_end: "2025-11-06"
      show_y_label: true

    # headline with two exponential trendlines: blue from 2019-01-01 and red
    # from 2024-01-01. The trendlines list replaces the inherited one (shallow
    # merge), so keys that headline's trendline sets — line_start_date,
    # display_r_squared, styling, skip_annotation — are simply absent here.
    # NOTE(review): presumably the consumer supplies defaults for them — confirm.
    double_line_2024_trendline:
      <<: *plot_headline
      logistic_file: headline
      title: "2019-2025 and 2024-2025 Trendlines in 50% Time Horizon"
      trendlines:
        - fit_type: exponential
          caption: null
          after_date: "2019-01-01"
          color: blue
          line_end_date: "2027-01-01"
          data_file: null
        - fit_type: exponential
          caption: null
          after_date: "2024-01-01"
          color: red
          line_end_date: "2027-01-01"
          data_file: null

    # Standalone figure (does not merge plot_headline). Reads the runs file
    # anchored under wrangle_logistic.swe_bench and the swe_bench fit.
    swe_bench:
      runs_file: *swe_bench_runs_file
      logistic_file: swe_bench
      weighting: invsqrt_task_weight
      # Note: plain `none` parses as the string "none", not as YAML null.
      include_task_distribution: none
      title: "Time Horizon for SWE-Bench Verified Tasks"
      trendlines:
        # First trendline: no data_file, default styling (null).
        - fit_type: exponential
          skip_annotation: false
          caption: Fit on SWE-Bench tasks
          after_date: "2023-01-01"
          color: blue
          line_start_date: "2023-01-01"
          line_end_date: "2025-03-14"
          display_r_squared: true
          display_after_date: false
          data_file: null
          styling: null
        # Second trendline: reads the headline fit CSV and carries its own
        # exclude_agents alias, while the figure-level exclude_agents below is
        # empty. The caption is double-quoted so that \n is a line break.
        - fit_type: exponential
          skip_annotation: false
          caption: "Fit on our tasks\nModels ≥ GPT-4 1106"
          after_date: "2023-11-05"
          color: black
          line_start_date: "2018-09-03"
          line_end_date: "2025-03-14"
          display_r_squared: false
          display_after_date: false
          data_file: data/wrangled/logistic_fits/headline.csv
          styling:
            linewidth: 1.5
            linestyle: dashed
          exclude_agents: *non_frontier_agents
      x_lim_start: "2023-01-01"
      x_lim_end: "2025-03-14"
      lower_y_lim: 0.05
      upper_y_lim: 240
      exclude_agents: []
      exclude: []
      show_y_label: true
    # Same as headline, with logistic_file set to partial_scoring and a
    # different subtitle.
    partial_scoring:
      <<: *plot_headline
      logistic_file: "partial_scoring"
      subtitle: "Continuous Scoring"

    # Same as headline, but with an empty exclude_agents list, which overrides
    # the inherited non_frontier_agents alias.
    all_models:
      <<: *plot_headline
      logistic_file: "headline"
      exclude_agents: []

  plot_horizon_alternative_fits:
    # headline settings with three trendlines, one per fit_type (exponential,
    # linear, hyperbolic). All use after_date 2019-01-01 and end at 2027-01-01;
    # only the first entry sets skip_annotation. The list replaces headline's
    # inherited trendlines (shallow merge).
    alternative_fits:
      <<: *plot_headline
      logistic_file: headline
      trendlines:
        - fit_type: exponential
          skip_annotation: false
          caption: Exponential Fit
          after_date: "2019-01-01"
          color: blue
          line_end_date: "2027-01-01"
          display_r_squared: true
        - fit_type: linear
          after_date: "2019-01-01"
          line_end_date: "2027-01-01"
          caption: Linear Fit
          display_r_squared: true
          color: red
        - fit_type: hyperbolic
          after_date: "2019-01-01"
          display_r_squared: true
          color: green
          caption: Hyperbolic Fit
          line_end_date: "2027-01-01"

  plot_individual_histograms:
    # Standalone entry (does not merge default_histogram): uses the ga_rebench
    # fit CSV, excludes "SWAA", and lays subplots out in three columns.
    no_swaa:
      title: "Success Rates vs Task Length for HCAST + RE-Bench Tasks"
      annotate_p50: true
      logistic_file: "data/wrangled/logistic_fits/ga_rebench.csv"
      weighting: "invsqrt_task_weight"
      n_subplot_cols: 3
      # One dashed horizontal line at p_success 0.5.
      horizontal_lines:
        - p_success: 0.5
          styling:
            color: "#b30c00"
            linestyle: "dashed"
            linewidth: 2.5
            alpha: 1
      x_lim_start: "2022-12-01"
      x_lim_end: "2025-04-01"
      lower_y_lim: 0
      upper_y_lim: 1
      exclude:
        - "SWAA"
      # Compared with default's include_agents, this list has no o3 or o4-mini,
      # and its last three entries are commented out.
      include_agents:
        - Claude 3.7 Sonnet
        - Claude 3.5 Sonnet (New)
        - Claude 3.5 Sonnet (Old)
        - Claude 3 Opus
        - o1
        - o1-preview
        - GPT-4o
        - GPT-4 Turbo
        - GPT-4 1106
        - GPT-4 0314
        # - gpt-3.5-turbo-instruct
        # - davinci-002 (GPT-3)
        # - GPT-2

    # Shared histogram settings, anchored as `default_histogram` and merged by
    # the entries that follow (human_baselines, overlaid, c_37, claudes,
    # stacked).
    default: &default_histogram
      annotate_p50: true
      logistic_file: "data/wrangled/logistic_fits/headline.csv"
      weighting: "invsqrt_task_weight"
      title: "Length of tasks AI agents have been able to complete autonomously"
      n_subplot_cols: 5
      # One dashed horizontal line at p_success 0.5.
      horizontal_lines:
        - p_success: 0.5
          styling:
            color: "firebrick"
            linestyle: "dashed"
            linewidth: 1.8
            alpha: 0.7
      x_lim_start: "2022-12-01"
      x_lim_end: "2025-04-01"
      lower_y_lim: 0
      upper_y_lim: 1
      exclude: []
      # Anchored as `default_agents`; aliased by later sections
      # (plot_multiverse_boxplot, plot_cost, generate_model_task_table,
      # plot_bar_chart_weighted_scores), so edits here propagate to them.
      include_agents: &default_agents
        - Claude 3.7 Sonnet
        - Claude 3.5 Sonnet (New)
        - Claude 3.5 Sonnet (Old)
        - Claude 3 Opus
        - o3
        - o4-mini
        - o1
        - o1-preview
        - GPT-4o
        - GPT-4 Turbo
        - GPT-4 1106
        - GPT-4 0314
        - gpt-3.5-turbo-instruct
        - davinci-002 (GPT-3)
        - GPT-2
    # default histogram settings, single column, restricted to the "human"
    # agent.
    human_baselines:
      <<: *default_histogram
      n_subplot_cols: 1
      title: "Human baseliner performance on tasks"
      include_agents: ["human"]

    # default histogram settings with type "overlaid", a six-agent subset, and
    # the p50 annotation switched off.
    overlaid:
      <<: *default_histogram
      n_subplot_cols: 1
      title: "Models are succeeding at increasingly long tasks"
      include_agents:
        - "Claude 3.7 Sonnet"
        - "o1-preview"
        - "GPT-4 0314"
        - "gpt-3.5-turbo-instruct"
        - "davinci-002 (GPT-3)"
        - "GPT-2"
      annotate_p50: false
      type: "overlaid"
    # default histogram settings for Claude 3.7 Sonnet alone; empty title,
    # width 7.
    c_37:
      <<: *default_histogram
      n_subplot_cols: 1
      title: ""
      width: 7
      include_agents: ["Claude 3.7 Sonnet"]
    # default histogram settings for the four Claude agents; empty title,
    # width 7.
    claudes:
      <<: *default_histogram
      n_subplot_cols: 1
      title: ""
      width: 7
      include_agents:
        - "Claude 3.7 Sonnet"
        - "Claude 3.5 Sonnet (New)"
        - "Claude 3.5 Sonnet (Old)"
        - "Claude 3 Opus"
    # default histogram settings for three agents (Claude 3.7 Sonnet,
    # GPT-4 0314, GPT-2); empty title, width 7.
    stacked:
      <<: *default_histogram
      n_subplot_cols: 1
      title: ""
      width: 7
      include_agents:
        - "Claude 3.7 Sonnet"
        - "GPT-4 0314"
        - "GPT-2"

  # Lists of weightings and regularization values, the shared default_agents
  # list, and a separate agents_2024 subset.
  plot_multiverse_boxplot:
    weightings:
      - "equal_task_weight"
      - "invsqrt_task_weight"
    regularizations:
      - 0.2
      - 0.1
      - 0.05
      - 0.02
      - 0.01
    categories: "ftr"
    include_agents: *default_agents
    agents_2024:
      - "Claude 3.7 Sonnet"
      - "Claude 3.5 Sonnet (New)"
      - "Claude 3.5 Sonnet (Old)"
      - "o1-preview"
      - "o1"
      - "o3"
      - "GPT-4o"

    n_bootstrap: 1000

  # The next three sections all reuse the default_agents list anchored under
  # plot_individual_histograms.default.
  plot_cost:
    include_agents: *default_agents

  generate_model_task_table:
    include_agents: *default_agents

  # Note the key here is focus_agents, not include_agents.
  plot_bar_chart_weighted_scores:
    focus_agents: *default_agents
    weighting: "invsqrt_task_weight"
    exclude: []

  # Uses the same runs file as wrangle_logistic.headline (runs_file anchor).
  plot_task_distribution:
    runs_file: *runs_file
    weight_key: equal_task_weight
