# Language setting. NOTE(review): `code` presumably selects which of the
# per-language (zh / en) sections in this file is used — confirm in the loader.
language:
  code: zh

# Logging output location.
logging:
  # Log directory, given as a relative path.
  log_dir: logs

# LLM client settings shared by all providers, followed by one sub-section per
# provider. NOTE(review): `provider` presumably selects which sub-section
# (deepseek / openrouter / openai / groq / ollama) is active — confirm.
llm:
  provider: deepseek
  temperature: 0.1
  top_p: 0.5
  # SECURITY: a plaintext API key used to be committed on this line. It has
  # been blanked: revoke/rotate the exposed key and inject the replacement from
  # outside version control (environment variable or an untracked override).
  # NOTE(review): confirm how the loader resolves the key when this is empty.
  default_api_key: ""
  # SOCKS5 proxy on the loopback interface; the `socks5h` scheme asks the
  # proxy to resolve host names.
  proxy: socks5h://127.0.0.1:8101
  deepseek:
    model: deepseek-chat
    max_tokens: 4096
  openrouter:
    model: openai/gpt-4o-mini
    max_tokens: 2048
  openai:
    model: gpt-4o-mini
    max_tokens: 2048
  groq:
    # No max_tokens override is set for this provider.
    model: mixtral-8x7b-32768
  ollama:
    # NOTE(review): bge-m3 is also the embedding model configured under
    # `evaluation.ollama`; confirm it is really meant to serve as the LLM here.
    model: bge-m3
    # 0.0.0.0 is a bind/listen wildcard, not a portable destination address
    # for clients; use the loopback address to reach a local Ollama server.
    base_url: http://127.0.0.1:11434

# OpenRouter connection settings.
openrouter:
  # Name of the environment variable expected to hold the API key.
  api_key_env: "OPENROUTER_API_KEY"
  # Inline key, left empty here. NOTE(review): presumably the environment
  # variable named above is used when this is empty — confirm in the loader.
  api_key: ""
  base_url: "https://openrouter.ai/api/v1"
  # NOTE(review): unit is not shown here; presumably seconds (3600 s = 1 h).
  timeout: 3600
  # Extra request headers; both values are empty strings in this config.
  extra_headers:
    referer: ""
    title: ""

# Dataset identity. The `&dataset_name` anchor defined here is reused through
# the `*dataset_name` alias by input.dataset_name, evaluation.dataset_name and
# paper_report.dataset_name, so renaming the dataset only needs this one edit.
dataset:
  name: &dataset_name SCOPE
# Seed ontology, keyed by language code under `languages`.
ontology:
  languages:
    # Chinese ontology.
    zh:
      # Entity types: an item is either a bare type name or a single-key
      # mapping of type name -> description.
      entities:
        - 人物: "人物姓名，避免形容词"
        - 物体: "有形物体，名称中不含冠词"
        - 事件: "事件或情节，保持中立描述"
        - 地点
        - 组织
        - 文档
        - 行动: "关键动作或行为"
        - 其它: "其它重要概念"
      # Relationship types: each item names the head and tail entity types,
      # the relation label (rel_type) and a free-text description.
      relationships:
        - head_entity: 人物
          tail_entity: 组织
          rel_type: 隶属于
          description: "角色在某个组织或阵营中任职、受训或接受指挥"
        - head_entity: 组织
          tail_entity: 地点
          rel_type: 位于
          description: "组织的总部、设施或主要部署区域"
        - head_entity: 组织
          tail_entity: 组织
          rel_type: 对抗
          description: "两个组织在战略、舆论或资源层面发生直接冲突"
        - head_entity: 人物
          tail_entity: 物体
          rel_type: 负责设计
          description: "个人主导或负责关键系统、装备或技术方案的设计"
        - head_entity: 人物
          tail_entity: 行动
          rel_type: 负责推进
          description: "个人主导执行或推动一项行动、计划或改革"
        - head_entity: 物体
          tail_entity: 地点
          rel_type: 部署于
          description: "系统、装备或技术在特定地理区域或设施中的部署"
        - head_entity: 物体
          tail_entity: 物体
          rel_type: 包含组件
          description: "上层系统所包含的子模块、传感器或软件组件"
        - head_entity: 物体
          tail_entity: 事件
          rel_type: 引发事件
          description: "系统或技术变化导致的事故、舆情或调查"
        - head_entity: 人物
          tail_entity: 文档
          rel_type: 报告问题
          description: "个人通过备忘录、报告或公告形式提出质疑与反馈"
        - head_entity: 人物
          tail_entity: 事件
          rel_type: 参与评估
          description: "个人以专家、负责人或调查成员身份参与事件评估"
    # English ontology; same layout as the zh section (8 entity types and
    # 10 relationship types).
    en:
      # Entity types: bare type name, or type name -> description.
      entities:
        - Person: "Person name without adjectives"
        - Object: "Tangible object name without articles"
        - Event: "Neutral description of a key event"
        - Place
        - Organisation
        - Document
        - Action: "Critical action or initiative"
        - Misc: "Other important concepts"
      # Relationship types: head/tail entity types, rel_type label, description.
      relationships:
        - head_entity: Person
          tail_entity: Organisation
          rel_type: member_of
          description: "A character serves, trains or operates within an organisation"
        - head_entity: Organisation
          tail_entity: Place
          rel_type: located_in
          description: "Headquarters, facilities or deployment area of an organisation"
        - head_entity: Organisation
          tail_entity: Organisation
          rel_type: conflicts_with
          description: "Two organisations engage in strategic or resource-level confrontation"
        - head_entity: Person
          tail_entity: Object
          rel_type: designs
          description: "An individual leads the design of a key system, equipment or technology"
        - head_entity: Person
          tail_entity: Action
          rel_type: leads
          description: "An individual drives or executes a plan, action or reform"
        - head_entity: Object
          tail_entity: Place
          rel_type: deployed_in
          description: "Where a system, device or technology is deployed"
        - head_entity: Object
          tail_entity: Object
          rel_type: contains_component
          description: "How upper systems contain submodules, sensors or software components"
        - head_entity: Object
          tail_entity: Event
          rel_type: triggers
          description: "System changes that cause incidents, public opinion or investigations"
        - head_entity: Person
          tail_entity: Document
          rel_type: reports_issue
          description: "Individuals raise concerns via memos, reports or announcements"
        - head_entity: Person
          tail_entity: Event
          rel_type: evaluates
          description: "Individuals participate in event assessments as experts or leaders"
  # Selected output sections. The `all` entry is commented out, so the three
  # sections are listed individually.
  output_sections:
    # - all
    - entities
    - relationships
    - events
# Prompt configuration: numeric limits and per-language prompt templates.
prompts:
  # min/max bounds for the number of entity, relationship and event types.
  # NOTE(review): presumably rendered into the {entity_range_sentence},
  # {relationship_range_sentence} and {event_range_sentence} template
  # placeholders — confirm in the prompt builder.
  limits:
    entity_types:
      min: 10
      max: 50
    relationship_types:
      min: 50
      max: 150
    event_types:
      min: 10
      max: 50
  # Prompt templates grouped by task (ontology, events) and language (zh, en).
  # Text in {braces} is a placeholder, presumably substituted by the prompt
  # builder. The literal block scalars (`|`) keep their line breaks, so their
  # contents must not be re-wrapped.
  templates:
    # Ontology-design prompts. Placeholders used: entity_range_sentence,
    # relationship_range_sentence, language_instruction, background_text,
    # entity_hint, relation_hint, ontology_sample_json.
    ontology:
      # Chinese system/user prompt pair.
      zh:
        system: |
          你是一名资深本体工程师，负责根据输入背景语料设计知识图谱本体。{entity_range_sentence}{relationship_range_sentence}最终只返回 JSON。{language_instruction}
        user: |
          请参考以下背景语料，并以上述提示为灵感，生成最贴近内容的知识图谱本体。
          - 允许微调实体类型或新增更贴近场景的实体描述。
          - 关系需覆盖主要角色/事件之间的因果、隶属或互动。
          - 严禁输出具体角色/组织名称，只描述抽象的实体类型（可附简短解释）。
          - relationships 数组中的每一项必须包含 head_entity、tail_entity、rel_type 字段，可选填 description。
          - 输出 JSON，字段只包含 entities 与 relationships。

          【背景摘录】
          {background_text}

          【可参考的实体提示】
          {entity_hint}

          【可参考的关系提示】
          {relation_hint}

          示例输出格式：
          {ontology_sample_json}
          {language_instruction}
      # English system/user prompt pair.
      en:
        system: |
          You are a senior ontology engineer who must design a schema from the provided background text. {entity_range_sentence}{relationship_range_sentence}Respond with JSON only. {language_instruction}
        user: |
          Use the following background excerpt and hints to craft an ontology.
          - You may tweak entity types or add better aligned descriptions.
          - Relationships should cover causality, affiliation or interaction between major roles/events.
          - NEVER output concrete names of roles/organisations; only abstract entity types with short notes.
          - Each item in the relationships array must include head_entity, tail_entity and rel_type, with optional description.
          - Output JSON with only 'entities' and 'relationships'.

          [Background Excerpt]
          {background_text}

          [Entity Hints]
          {entity_hint}

          [Relationship Hints]
          {relation_hint}

          Sample output:
          {ontology_sample_json}
          {language_instruction}
    # Event-schema prompts. Placeholders used: event_range_sentence,
    # language_instruction, event_limit_instruction, event_sample_json,
    # background_text, event_hint, argument_hint, trigger_guidelines.
    events:
      # Chinese system/user prompt pair.
      zh:
        system: |
          你是事件抽取专家，需为知识图谱设计事件类型与论元。{event_range_sentence}输出包含可复用的 event_type、触发词和论元。最终只返回 JSON，仅保留 events 数组。{language_instruction}
        user: |
          请基于以下背景语料，总结最重要的事件类型。
          {event_limit_instruction}
          - 每个事件需包含触发词 trigger_words（数组）与 arguments（论元列表）。
          - 论元至少覆盖发起方、受影响方或其它关键角色。
          - arguments 中的每一项需包含 role、description、required 字段。
          - JSON 结构示例：{event_sample_json}

          【背景摘录】
          {background_text}

          【可参考的事件类型提示】
          {event_hint}

          【论元角色提示】
          {argument_hint}

          【触发词撰写建议】
          {trigger_guidelines}
          {language_instruction}
      # English system/user prompt pair.
      en:
        system: |
          You are an event extraction expert who must design reusable event types and arguments for a knowledge graph. {event_range_sentence}Provide event_type, trigger_words and arguments. Respond with JSON shaped as an 'events' array only. {language_instruction}
        user: |
          Summarize the most important event types from the background excerpt.
          {event_limit_instruction}
          - Each event must include trigger_words (array) and arguments (list of roles).
          - Arguments should at least cover initiators, impacted parties or other key roles.
          - Every argument entry must contain role, description and required fields.
          - JSON example: {event_sample_json}

          [Background Excerpt]
          {background_text}

          [Event Type Hints]
          {event_hint}

          [Argument Role Hints]
          {argument_hint}

          [Trigger Word Guidelines]
          {trigger_guidelines}
          {language_instruction}
# Event-extraction settings with per-language hints and fallback events.
event_extraction:
  enabled: true
  # NOTE(review): prompts.limits.event_types sets min 10 / max 50; confirm how
  # that range and this cap of 10 interact.
  max_event_types: 10
  languages:
    # Chinese hints and fallbacks.
    zh:
      # Event type name -> short description.
      event_type_hints:
        - 战争: "长期或规模化的军事冲突"
        - 冲突: "规模较小，但包含明显对抗的事件"
        - 联盟: "角色之间达成合作或结盟"
        - 行动计划: "需要多个步骤或角色配合的重要行动"
      # Argument role name -> short description.
      argument_role_hints:
        - 发起方: "主动推动事件的个人、组织或阵营"
        - 受影响方: "事件影响到的核心对象"
        - 关键资源: "决定事件走向的重要物品、信息或能力"
        - 地点: "事件发生的具体地点或范围"
        - 时间: "事件发生或计划实施的时间节点"
      # Free-text guidance on choosing trigger words.
      trigger_word_guidelines:
        - "触发词应为动词或动宾短语，能够直接体现事件开始或发生"
        - "优先收集可以在文本中直接检索的词汇"
      # Fallback event list. Per stats.controllability.fallback_note, the
      # system falls back to the configured schema/event list when parsing
      # fails or the background is empty.
      fallback_events:
        - event_type: 战争冲突
          description: "涉及多个势力的军事对抗，往往伴随战略目标或关键资源争夺"
          trigger_words:
            - "开战"
            - "入侵"
            - "围攻"
          arguments:
            - role: 发起方
              description: "首先挑起或宣战的一方"
              required: true
            - role: 受影响方
              description: "被攻击或被迫应战的一方"
              required: true
            - role: 关键资源
              description: "驱动冲突的资源、神器或战略目标"
              required: false
            - role: 地点
              description: "战争的主要战场或据点"
              required: false
        - event_type: 联盟成立
          description: "两个或多个角色为了共同目标而正式结盟"
          trigger_words:
            - "结盟"
            - "达成协议"
          arguments:
            - role: 参与方
              description: "组成联盟的主要成员"
              required: true
            - role: 目标
              description: "联盟成立的共同目的"
              required: false
            - role: 条件
              description: "联盟达成所需的关键条件或交换"
              required: false
    # English hints and fallbacks; same layout as the zh section.
    en:
      # Event type name -> short description.
      event_type_hints:
        - War: "Large scale or long-running military conflict"
        - Conflict: "Smaller scale confrontation with clear opposing sides"
        - Alliance: "Actors forming a cooperation pact"
        - Operation Plan: "Important operation requiring multiple steps or actors"
      # Argument role name -> short description.
      argument_role_hints:
        - Initiator: "The actor who proactively triggers the event"
        - Impacted Party: "Key subject influenced by the event"
        - Critical Asset: "Items, intel or capabilities that shape the outcome"
        - Location: "Where the event happens"
        - Time: "When the event happens or is planned"
      # Free-text guidance on choosing trigger words.
      trigger_word_guidelines:
        - "Trigger words should be verbs or verb phrases indicating the start of an event"
        - "Prioritize words that can be directly searched in text"
      # Fallback event list: each event has a type, description, trigger words
      # and a list of arguments flagged required or optional.
      fallback_events:
        - event_type: War Conflict
          description: "Military confrontation among multiple forces over strategy or resources"
          trigger_words:
            - "invade"
            - "besiege"
            - "declare war"
          arguments:
            - role: Initiator
              description: "The side that starts or declares war"
              required: true
            - role: Impacted Party
              description: "The side being attacked or forced to respond"
              required: true
            - role: Critical Asset
              description: "Resource or strategic goal that fuels the conflict"
              required: false
            - role: Location
              description: "Primary battlefield or stronghold"
              required: false
        - event_type: Alliance Formed
          description: "Multiple actors formally align toward a shared goal"
          trigger_words:
            - "form alliance"
            - "sign pact"
          arguments:
            - role: Participants
              description: "Members that form the alliance"
              required: true
            - role: Objective
              description: "Shared purpose behind the alliance"
              required: false
            - role: Conditions
              description: "Key conditions or exchanges that enable the alliance"
              required: false
# Input source settings.
input:
  # Input type; the value `scope` matches the `scope` sub-section below.
  type: scope
  dataset_name: *dataset_name
  # Location and fields of the SCOPE dataset.
  scope:
    root_dir: data/scope
    part: scope
    name: SCOPE
    split: train
    # Record fields that carry text.
    text_fields:
      - text
      - input
    # Unset (null). NOTE(review): presumably means "no document limit".
    max_docs: null
  # Placeholder string (Chinese for "custom background text can be placed
  # here directly ...").
  text: "这里可以直接放置自定义背景文本……"
  file_path: data/input/background.txt
  # Same file as evaluation.golden_schema_path.
  existing_ontology_path: data/scope/SCOPE/schema_full.json
  source_label: SCOPE
  # NOTE(review): unit (characters vs tokens) is not shown here — confirm.
  chunk_size: 1200
# Runtime switches.
runtime:
  verbose: false
  # NOTE(review): unit is not shown here; presumably seconds — confirm.
  delay_between_requests: 8
  graph_extraction_enabled: false
# Pipeline stage selection.
pipeline:
  # all: full generate + evaluate flow; generate_only: generation only;
  # eval_only: evaluation only
  mode: all
  # NOTE(review): how these two flags combine with `mode` is not visible here.
  generation_enabled: true
  evaluation_enabled: true
# Evaluation of the generated ontology against a golden schema.
evaluation:
  enabled: true
  dataset_name: *dataset_name
  # Same file as input.existing_ontology_path.
  golden_schema_path: data/scope/SCOPE/schema_full.json
  # Accepts auto / cpu / cuda:{index}; this config uses the second GPU, cuda:1.
  device: cuda:1
  # local: local embedding model; ollama: call through the Ollama service.
  embedding_backend: ollama
  # NOTE(review): presumably only read when embedding_backend is `local`.
  emb_model: models/bge-m3
  ollama:
    # 0.0.0.0 is a bind/listen wildcard, not a portable destination address
    # for clients; use the loopback address to reach a local Ollama server.
    base_url: http://127.0.0.1:11434
    model: bge-m3
  threshold: 0.45
  graph_smoothing_rounds: 2
  graph_smoothing_alpha: 0.5
  output_json: data/output/ontology_eval_metrics.json
# Settings for the paper-report output.
paper_report:
  output_dir: data/output/paper_report
  dataset_name: *dataset_name
  auto_run_in_scope: true
  # Predicted / gold schema paths; both are unset (null) here.
  schema:
    pred_path: null
    gold_path: null
  # Samples path; unset (null) here.
  samples:
    path: null
  # Sample statistics options. NOTE(review): the synthetic_* values are
  # presumably used to build a stand-in sample when `synthetic_on_missing` is
  # true and no samples are found — confirm. `synthetic_text` is Chinese for
  # "self-test sample text for paper statistics".
  sample_stats:
    dedup_by_text: true
    cross_dataset_dedup: false
    synthetic_on_missing: true
    synthetic_text: "论文统计自测样本文本"
    synthetic_relation: "related_to"
    synthetic_event_type: "SyntheticEvent"
    synthetic_role: "Participant"
  # Normalization rules grouped as RE / EE / DEDUP. All values are free-text
  # strings.
  normalization_spec:
    RE:
      direction_policy: directed
      symmetric_policy: none
      missing_type_policy: Entity
      type_path_policy: keep_atomic
      label_for_embedding: "replace '/' -> ' '"
    EE:
      label_casing_en: lower
      separator_policy: collapse_whitespace + unify(_,-)
      edge_form: "(event_type, role, ARG)"
      ARG_semantics: placeholder
    DEDUP:
      dedup_mode: normalized
      normalize_text_en: strip + collapse spaces (+ optional lowercase)
      normalize_text_zh: strip + collapse spaces
      provenance_fields_kept: source_sample_ids, source_groups, source_dataset
  # Before/after pairs illustrating the normalization.
  normalization_examples:
    - before: "InstructIE: Person/Place -> Located_In"
      after: "Person Place -> located_in"
    - before: "Event/Attack Role:Victim"
      after: "attack victim"
  # Graph F1 settings: embedding size, similarity measure, matching method.
  graph_f1:
    embedding_dim: 1024
    similarity: cosine
    matching: hungarian
  # Threshold sweep over the three values listed.
  fuzzy_sweep:
    enabled: true
    thresholds:
      - 0.7
      - 0.8
      - 0.9
  # Fusion track: base ontologies, leakage check rules and mapping fields.
  fusion_track:
    base_ontologies:
      - name: "ExampleBaseOntology"
        version: "v1.0"
        license: "CC-BY-4.0"
    leakage_check:
      - "drop entries that exactly match gold labels"
    mapping_fields:
      - equivalent
      - broader
      - narrower
# Statistics output settings.
stats:
  enabled: true
  output_dir: data/dataset_stat
  # LLM run statistics file.
  llm_run:
    enabled: true
    filename: llm_run_stats.json
    keep_call_details: false
  # Controllability statistics file and the bilingual note text describing the
  # fallback behaviour.
  controllability:
    enabled: true
    filename: controllability_stats.json
    fallback_note:
      zh: "当解析失败或背景为空时，系统回退到配置中的保底 schema/事件列表，保证输出结构可用。"
      en: "When parsing fails or background is empty, the system falls back to configured schema/events for a stable output."
# Conversion of public datasets into this project's input format.
dataset_conversion:
  output_dir: data/input
  # Section-level defaults; individual dataset_configs entries below repeat
  # some of these keys (e.g. include_input, samples_per_relation).
  include_input: false
  samples_per_relation: 0
  samples_per_event: 0
  # File names to exclude, per file category (schema / data / label).
  file_exclude_names:
    schema: []
    data:
      - schema.json
    label: []
  data_info_path: data/input/data_info.txt
  # Progress-bar options. NOTE(review): `mininterval` and `leave` look like
  # tqdm keyword arguments — confirm they are passed through.
  tqdm:
    enabled: true
    show_records: true
    mininterval: 0.1
    leave: false
  # LLM-based relation schema generation: the prompts below ask the model to
  # infer a relation schema from sample records and return JSON.
  relation_schema_generation:
    enabled: true
    # Differs from the section-level samples_per_relation (0).
    samples_per_relation: 3
    require_entity_types: false
    # Placeholders used in the prompts: dataset_name, relation_examples.
    prompts:
      zh:
        system: |
          你是关系抽取领域专家，需要根据样例文本推断关系的头尾实体类型。
        user: |
          数据集：{dataset_name}
          请根据以下关系样例（JSON 列表）推断关系 schema。
          {relation_examples}
          输出 JSON，字段为 relationships，每项包含 rel_type、head_entity、tail_entity、description（可选）。
      en:
        system: |
          You are a relation extraction schema expert.
        user: |
          Dataset: {dataset_name}
          Infer relation schema from the samples (JSON list) below:
          {relation_examples}
          Return JSON with key "relationships", each item includes rel_type, head_entity, tail_entity, description (optional).
  # Glob patterns for SemEval auxiliary files to ignore (README files and Perl
  # scripts). The patterns are quoted because a leading `*` would otherwise be
  # read as a YAML alias.
  semeval_aux_ignore:
    - "*README*"
    - "*.pl"
  # FewRel parsing options: lists of candidate key names per logical field.
  # NOTE(review): presumably tried in the listed order — confirm in the parser.
  fewrel:
    # Keys that may hold episode / split data.
    episode_keys:
      - meta_train
      - meta_test
      - meta_dev
      - train
      - test
      - dev
      - support
      - query
    # Keys that may hold the relation id or label.
    relation_keys:
      - relation
      - rel_type
      - label
      - predicate
      - rel
    # Keys that may hold the human-readable relation name.
    relation_name_keys:
      - relation_name
      - relation_text
      - relation_label
      - label_name
      - name
    # Keys that may hold the sentence text.
    text_keys:
      - text
      - sentence
      - sent
    # Keys that may hold the head entity.
    head_keys:
      - h
      - head
      - subj
      - subject
      - head_entity
    # Keys that may hold the tail entity.
    tail_keys:
      - t
      - tail
      - obj
      - object
      - tail_entity
    # Regex for relation ids: the letter P followed by one or more digits.
    relation_id_pattern: "^P\\d+$"
    # Sources for id -> name maps; all empty at this level.
    relation_name_map_files: []
    relation_name_map_dirs: []
    relation_name_map_glob: ""
    # Keys under which a relation map may appear inside a map file.
    relation_map_keys:
      - pid2name
      - id2rel
      - id2relation
      - relation2id
      - rel2id
      - rel2name
      - relation2name
      - relation_name_map
  # Relation-extraction (RE) conversion settings.
  re:
    output_dir: data/input/re
    # Selected dataset names; each one matches a `name` under dataset_configs.
    # Commented-out names are not selected.
    datasets:
      - instructIE_en
      - instructIE_zh
      - duIE_zh
      - CMeIE
      - COAE2016
      - IPRE
      - SKE2020
      - ADE_corpus
      # - fewrel_0-4
      - GIDS
      - NYT11
      - New-York-Times-RE
      - SciERC
      - conll04
      - kbp37
      - SemEval2010_task8
      # - wiki_0-4
      # - semval-RE  # not processed for now; configuration still to be added
      # - FewRel  # the FewRel dataset is not processed for now
    # Per-dataset conversion settings. Entries carrying `enabled: false` are
    # switched off.
    dataset_configs:
      # InstructIE (English): one schema file plus train/valid/test data files.
      - name: instructIE_en
        task: re
        format: instructie
        language: en
        schema_path: data/dataset_public/re/re-en/InstructIE-en/schema_en.json
        data_files:
          - data/dataset_public/re/re-en/InstructIE-en/train_en.json
          - data/dataset_public/re/re-en/InstructIE-en/valid_en.json
          - data/dataset_public/re/re-en/InstructIE-en/test_en.json
      # InstructIE (Chinese): one schema file plus train/valid/test data files.
      - name: instructIE_zh
        task: re
        format: instructie
        language: zh
        schema_path: data/dataset_public/re/re-zh/InstructIE-zh/schema_zh.json
        data_files:
          - data/dataset_public/re/re-zh/InstructIE-zh/train_zh.json
          - data/dataset_public/re/re-zh/InstructIE-zh/valid_zh.json
          - data/dataset_public/re/re-zh/InstructIE-zh/test_zh.json
      # duIE (Chinese): train and dev files only.
      - name: duIE_zh
        task: re
        format: duie
        language: zh
        schema_path: data/dataset_public/re/re-zh/duIE/schema.json
        data_files:
          - data/dataset_public/re/re-zh/duIE/train.json
          - data/dataset_public/re/re-zh/duIE/dev.json
      # CMeIE (Chinese): JSON Lines schema and data files.
      - name: CMeIE
        task: re
        format: cmeie
        language: zh
        schema_path: data/dataset_public/re/re-zh/CMeIE/53_schemas.jsonl
        data_files:
          - data/dataset_public/re/re-zh/CMeIE/CMeIE_train.jsonl
          - data/dataset_public/re/re-zh/CMeIE/CMeIE_dev.jsonl
          - data/dataset_public/re/re-zh/CMeIE/CMeIE_test.jsonl
      # COAE2016 (Chinese): train/dev/test files.
      - name: COAE2016
        task: re
        format: coae2016
        language: zh
        schema_path: data/dataset_public/re/re-zh/COAE2016/schema.json
        data_files:
          - data/dataset_public/re/re-zh/COAE2016/train.json
          - data/dataset_public/re/re-zh/COAE2016/dev.json
          - data/dataset_public/re/re-zh/COAE2016/test.json
      # DuIE2.0 (Chinese): disabled; train and test files only.
      - name: DuIE2.0
        task: re
        format: duie2.0
        language: zh
        enabled: false
        # DuIE2.0 is not processed for now
        schema_path: data/dataset_public/re/re-zh/DuIE2.0/schema.json
        data_files:
          - data/dataset_public/re/re-zh/DuIE2.0/train.json
          - data/dataset_public/re/re-zh/DuIE2.0/test.json
      # IPRE (Chinese): train/dev/test files.
      - name: IPRE
        task: re
        format: ipre
        language: zh
        schema_path: data/dataset_public/re/re-zh/IPRE/schema.json
        data_files:
          - data/dataset_public/re/re-zh/IPRE/train.json
          - data/dataset_public/re/re-zh/IPRE/dev.json
          - data/dataset_public/re/re-zh/IPRE/test.json
      # SKE2020 (Chinese): train/dev/test files.
      - name: SKE2020
        task: re
        format: ske2020
        language: zh
        schema_path: data/dataset_public/re/re-zh/SKE2020/schema.json
        data_files:
          - data/dataset_public/re/re-zh/SKE2020/train.json
          - data/dataset_public/re/re-zh/SKE2020/dev.json
          - data/dataset_public/re/re-zh/SKE2020/test.json
      # ADE_corpus (English): train/dev/test files.
      - name: ADE_corpus
        task: re
        format: ade_corpus
        language: en
        # Same value as the section-level samples_per_relation (0).
        samples_per_relation: 0
        schema_path: data/dataset_public/re/re-en/ADE_corpus/schema.json
        data_files:
          - data/dataset_public/re/re-en/ADE_corpus/train.json
          - data/dataset_public/re/re-en/ADE_corpus/dev.json
          - data/dataset_public/re/re-en/ADE_corpus/test.json
      # fewrel_0 .. fewrel_4 (English): five entries with the same layout, all
      # disabled.
      - name: fewrel_0
        task: re
        format: fewrel_0
        language: en
        enabled: false
        # fewrel_0-4 are handled together after merging
        schema_path: data/dataset_public/re/re-en/fewrel_0/schema.json
        data_files:
          - data/dataset_public/re/re-en/fewrel_0/train.json
          - data/dataset_public/re/re-en/fewrel_0/dev.json
          - data/dataset_public/re/re-en/fewrel_0/test.json
      - name: fewrel_1
        task: re
        format: fewrel_1
        language: en
        enabled: false
        # fewrel_0-4 are handled together after merging
        schema_path: data/dataset_public/re/re-en/fewrel_1/schema.json
        data_files:
          - data/dataset_public/re/re-en/fewrel_1/train.json
          - data/dataset_public/re/re-en/fewrel_1/dev.json
          - data/dataset_public/re/re-en/fewrel_1/test.json
      - name: fewrel_2
        task: re
        format: fewrel_2
        language: en
        enabled: false
        # fewrel_0-4 are handled together after merging
        schema_path: data/dataset_public/re/re-en/fewrel_2/schema.json
        data_files:
          - data/dataset_public/re/re-en/fewrel_2/train.json
          - data/dataset_public/re/re-en/fewrel_2/dev.json
          - data/dataset_public/re/re-en/fewrel_2/test.json
      - name: fewrel_3
        task: re
        format: fewrel_3
        language: en
        enabled: false
        # fewrel_0-4 are handled together after merging
        schema_path: data/dataset_public/re/re-en/fewrel_3/schema.json
        data_files:
          - data/dataset_public/re/re-en/fewrel_3/train.json
          - data/dataset_public/re/re-en/fewrel_3/dev.json
          - data/dataset_public/re/re-en/fewrel_3/test.json
      - name: fewrel_4
        task: re
        format: fewrel_4
        language: en
        enabled: false
        # fewrel_0-4 are handled together after merging
        schema_path: data/dataset_public/re/re-en/fewrel_4/schema.json
        data_files:
          - data/dataset_public/re/re-en/fewrel_4/train.json
          - data/dataset_public/re/re-en/fewrel_4/dev.json
          - data/dataset_public/re/re-en/fewrel_4/test.json
      # - name: fewrel_0-4
      #   task: re
      #   format: fewrel_0
      #   language: en
      #   schema_paths:
      #     - data/dataset_public/re/re-en/fewrel_0/schema.json
      #     - data/dataset_public/re/re-en/fewrel_1/schema.json
      #     - data/dataset_public/re/re-en/fewrel_2/schema.json
      #     - data/dataset_public/re/re-en/fewrel_3/schema.json
      #     - data/dataset_public/re/re-en/fewrel_4/schema.json
      #   data_files:
      #     - data/dataset_public/re/re-en/fewrel_0/train.json
      #     - data/dataset_public/re/re-en/fewrel_0/dev.json
      #     - data/dataset_public/re/re-en/fewrel_0/test.json
      #     - data/dataset_public/re/re-en/fewrel_1/train.json
      #     - data/dataset_public/re/re-en/fewrel_1/dev.json
      #     - data/dataset_public/re/re-en/fewrel_1/test.json
      #     - data/dataset_public/re/re-en/fewrel_2/train.json
      #     - data/dataset_public/re/re-en/fewrel_2/dev.json
      #     - data/dataset_public/re/re-en/fewrel_2/test.json
      #     - data/dataset_public/re/re-en/fewrel_3/train.json
      #     - data/dataset_public/re/re-en/fewrel_3/dev.json
      #     - data/dataset_public/re/re-en/fewrel_3/test.json
      #     - data/dataset_public/re/re-en/fewrel_4/train.json
      #     - data/dataset_public/re/re-en/fewrel_4/dev.json
      #     - data/dataset_public/re/re-en/fewrel_4/test.json
      #   schema_output: golden_schema_fewrel_0-4.json
      #   samples_output: golden_input_fewrel_0-4.json
      # FewRel (English): disabled. Uses a data directory plus glob instead of
      # explicit data files, and overrides the relation name map file.
      - name: FewRel
        task: re
        format: fewrel
        language: en
        enabled: false
        # The FewRel dataset is not processed for now
        data_dirs:
          - data/dataset_public/re/re-en/FewRel
        data_glob: "*.json"
        fewrel:
          relation_name_map_files:
            - data/dataset_public/re/re-en/FewRel/pid2name.json
      # GIDS (English): train/dev/test files.
      - name: GIDS
        task: re
        format: gids
        language: en
        schema_path: data/dataset_public/re/re-en/GIDS/schema.json
        data_files:
          - data/dataset_public/re/re-en/GIDS/train.json
          - data/dataset_public/re/re-en/GIDS/dev.json
          - data/dataset_public/re/re-en/GIDS/test.json
      # NYT11 (English): train/dev/test files.
      - name: NYT11
        task: re
        format: nyt11
        language: en
        schema_path: data/dataset_public/re/re-en/NYT11/schema.json
        data_files:
          - data/dataset_public/re/re-en/NYT11/train.json
          - data/dataset_public/re/re-en/NYT11/dev.json
          - data/dataset_public/re/re-en/NYT11/test.json
      # New-York-Times-RE (English): train/dev/test files.
      - name: New-York-Times-RE
        task: re
        format: new_york_times_re
        language: en
        schema_path: data/dataset_public/re/re-en/New-York-Times-RE/schema.json
        data_files:
          - data/dataset_public/re/re-en/New-York-Times-RE/train.json
          - data/dataset_public/re/re-en/New-York-Times-RE/dev.json
          - data/dataset_public/re/re-en/New-York-Times-RE/test.json
      # SciERC (English): train/dev/test files.
      - name: SciERC
        task: re
        format: scierc
        language: en
        schema_path: data/dataset_public/re/re-en/SciERC/schema.json
        data_files:
          - data/dataset_public/re/re-en/SciERC/train.json
          - data/dataset_public/re/re-en/SciERC/dev.json
          - data/dataset_public/re/re-en/SciERC/test.json
      # conll04 (English): train/dev/test files.
      - name: conll04
        task: re
        format: conll04
        language: en
        # Same value as the section-level include_input (false).
        include_input: false
        schema_path: data/dataset_public/re/re-en/conll04/schema.json
        data_files:
          - data/dataset_public/re/re-en/conll04/train.json
          - data/dataset_public/re/re-en/conll04/dev.json
          - data/dataset_public/re/re-en/conll04/test.json
      # kbp37 (English): train/dev/test files.
      - name: kbp37
        task: re
        format: kbp37
        language: en
        schema_path: data/dataset_public/re/re-en/kbp37/schema.json
        data_files:
          - data/dataset_public/re/re-en/kbp37/train.json
          - data/dataset_public/re/re-en/kbp37/dev.json
          - data/dataset_public/re/re-en/kbp37/test.json
      # semval-RE (English): disabled; train/dev/test files.
      - name: semval-RE
        task: re
        format: semval_re
        language: en
        enabled: false
        # The semval-RE dataset is not processed for now
        schema_path: data/dataset_public/re/re-en/semval-RE/schema.json
        data_files:
          - data/dataset_public/re/re-en/semval-RE/train.json
          - data/dataset_public/re/re-en/semval-RE/dev.json
          - data/dataset_public/re/re-en/semval-RE/test.json
      # SemEval-2010 Task 8 (English). Uses a list of schema source files
      # (`schema_paths`) rather than a single `schema_path`, plus separate
      # label, full, clean and auxiliary file lists.
      - name: SemEval2010_task8
        task: re
        format: semeval2010
        language: en
        # Scorer answer keys (1, 2, 3, 5) and proposed answers (1-5).
        schema_paths:
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/answer_key1.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/answer_key2.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/answer_key3.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/answer_key5.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/proposed_answer1.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/proposed_answer2.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/proposed_answer3.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/proposed_answer4.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/proposed_answer5.txt
        # Training and test files; note the differing extension case
        # (.TXT vs .txt) in the two file names.
        data_files:
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_testing/TEST_FILE.txt
        # Key file for the test set.
        label_files:
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_KEY.TXT
        semeval_full_files:
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT
        semeval_clean_files:
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_CLEAN.TXT
        # Auxiliary files: READMEs, distribution files, scorer inputs/outputs
        # and Perl scripts. The scorer answer files are also listed under
        # schema_paths above.
        semeval_aux_files:
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SEMEVAL_TASK8_FULL_RELEASE_README.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_training/README.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_DISTRIB.TXT
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_TEST_DISTRIB.TXT
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_testing/README.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/README.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/answer_key1.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/answer_key2.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/answer_key3.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/answer_key5.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/proposed_answer1.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/proposed_answer2.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/proposed_answer3.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/proposed_answer4.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/proposed_answer5.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/result_scores1.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/result_scores2.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/result_scores3.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/result_scores5.txt
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/semeval2010_task8_format_checker.pl
          - data/dataset_public/re/re-en/SemEval2010_task8_all_data/SemEval2010_task8_scorer-v1.2/semeval2010_task8_scorer-v1.2.pl
      # tacred (English): disabled. The schema path points at a Python file
      # (tacred.py); data files are the dev/test patch files, with separate
      # id-to-label files per split.
      - name: tacred
        task: re
        format: tacred
        language: en
        enabled: false
        schema_path: data/dataset_public/re/re-en/tacred/tacred.py
        data_files:
          - data/dataset_public/re/re-en/tacred/dev_patch.json
          - data/dataset_public/re/re-en/tacred/test_patch.json
        label_files:
          - data/dataset_public/re/re-en/tacred/train_id2label.json
          - data/dataset_public/re/re-en/tacred/dev_id2label.json
          - data/dataset_public/re/re-en/tacred/test_id2label.json
      # wiki_0: first of five parallel `re` entries (wiki_0 .. wiki_4), each
      # with its own schema.json and train/dev/test files; currently disabled.
      - name: wiki_0
        task: re
        format: wiki_0
        language: en
        enabled: false
        # wiki_0-4 are merged first and then processed together
        schema_path: data/dataset_public/re/re-en/wiki_0/schema.json
        data_files:
          - data/dataset_public/re/re-en/wiki_0/train.json
          - data/dataset_public/re/re-en/wiki_0/dev.json
          - data/dataset_public/re/re-en/wiki_0/test.json
      # wiki_1: one of five parallel `re` entries (wiki_0 .. wiki_4), each
      # with its own schema.json and train/dev/test files; currently disabled.
      - name: wiki_1
        task: re
        # NOTE(review): the commented-out merged `wiki_0-4` entry in this list
        # uses `format: wiki_0` for all five directories; confirm a separate
        # `wiki_1` format is actually recognised by the loader.
        format: wiki_1
        language: en
        enabled: false
        # wiki_0-4 are merged first and then processed together
        schema_path: data/dataset_public/re/re-en/wiki_1/schema.json
        data_files:
          - data/dataset_public/re/re-en/wiki_1/train.json
          - data/dataset_public/re/re-en/wiki_1/dev.json
          - data/dataset_public/re/re-en/wiki_1/test.json
      # wiki_2: one of five parallel `re` entries (wiki_0 .. wiki_4), each
      # with its own schema.json and train/dev/test files; currently disabled.
      - name: wiki_2
        task: re
        # NOTE(review): the commented-out merged `wiki_0-4` entry in this list
        # uses `format: wiki_0` for all five directories; confirm a separate
        # `wiki_2` format is actually recognised by the loader.
        format: wiki_2
        language: en
        enabled: false
        # wiki_0-4 are merged first and then processed together
        schema_path: data/dataset_public/re/re-en/wiki_2/schema.json
        data_files:
          - data/dataset_public/re/re-en/wiki_2/train.json
          - data/dataset_public/re/re-en/wiki_2/dev.json
          - data/dataset_public/re/re-en/wiki_2/test.json
      # wiki_3: one of five parallel `re` entries (wiki_0 .. wiki_4), each
      # with its own schema.json and train/dev/test files; currently disabled.
      - name: wiki_3
        task: re
        # NOTE(review): the commented-out merged `wiki_0-4` entry in this list
        # uses `format: wiki_0` for all five directories; confirm a separate
        # `wiki_3` format is actually recognised by the loader.
        format: wiki_3
        language: en
        enabled: false
        # wiki_0-4 are merged first and then processed together
        schema_path: data/dataset_public/re/re-en/wiki_3/schema.json
        data_files:
          - data/dataset_public/re/re-en/wiki_3/train.json
          - data/dataset_public/re/re-en/wiki_3/dev.json
          - data/dataset_public/re/re-en/wiki_3/test.json
      # wiki_4: last of five parallel `re` entries (wiki_0 .. wiki_4), each
      # with its own schema.json and train/dev/test files; currently disabled.
      - name: wiki_4
        task: re
        # NOTE(review): the commented-out merged `wiki_0-4` entry in this list
        # uses `format: wiki_0` for all five directories; confirm a separate
        # `wiki_4` format is actually recognised by the loader.
        format: wiki_4
        language: en
        enabled: false
        # wiki_0-4 are merged first and then processed together
        schema_path: data/dataset_public/re/re-en/wiki_4/schema.json
        data_files:
          - data/dataset_public/re/re-en/wiki_4/train.json
          - data/dataset_public/re/re-en/wiki_4/dev.json
          - data/dataset_public/re/re-en/wiki_4/test.json
      # - name: wiki_0-4
      #   task: re
      #   format: wiki_0
      #   language: en
      #   schema_paths:
      #     - data/dataset_public/re/re-en/wiki_0/schema.json
      #     - data/dataset_public/re/re-en/wiki_1/schema.json
      #     - data/dataset_public/re/re-en/wiki_2/schema.json
      #     - data/dataset_public/re/re-en/wiki_3/schema.json
      #     - data/dataset_public/re/re-en/wiki_4/schema.json
      #   data_files:
      #     - data/dataset_public/re/re-en/wiki_0/train.json
      #     - data/dataset_public/re/re-en/wiki_0/dev.json
      #     - data/dataset_public/re/re-en/wiki_0/test.json
      #     - data/dataset_public/re/re-en/wiki_1/train.json
      #     - data/dataset_public/re/re-en/wiki_1/dev.json
      #     - data/dataset_public/re/re-en/wiki_1/test.json
      #     - data/dataset_public/re/re-en/wiki_2/train.json
      #     - data/dataset_public/re/re-en/wiki_2/dev.json
      #     - data/dataset_public/re/re-en/wiki_2/test.json
      #     - data/dataset_public/re/re-en/wiki_3/train.json
      #     - data/dataset_public/re/re-en/wiki_3/dev.json
      #     - data/dataset_public/re/re-en/wiki_3/test.json
      #     - data/dataset_public/re/re-en/wiki_4/train.json
      #     - data/dataset_public/re/re-en/wiki_4/dev.json
      #     - data/dataset_public/re/re-en/wiki_4/test.json
      #   schema_output: golden_schema_wiki_0-4.json
      #   samples_output: golden_input_wiki_0-4.json
  # Dataset group for task `ee` (presumably event extraction, judging by the
  # dataset names; confirm). Five English and four Chinese datasets.
  ee:
    output_dir: data/input/ee
    # Dataset names in this group. Every name listed here has a matching
    # `name:` entry under dataset_configs below.
    datasets:
      - CASIE
      - CrudeOilNews
      - PHEE
      - RAMS
      - WikiEvents
      - ccf_law
      - DuEE1.0
      - DuEE-fin
      - FewFC
    # Per-dataset source description. Two shapes are used:
    #   * explicit: `schema_path` + `data_files` (one schema, listed files)
    #   * discovery: `schema_dirs`/`schema_glob` + `data_dirs`/`data_glob`
    # None of these entries sets `enabled`, unlike the `re` entries above.
    # NOTE(review): presumably a missing `enabled` defaults to true; confirm.
    dataset_configs:
      # --- English datasets (data/dataset_public/ee/ee-en) ---
      - name: CASIE
        task: ee
        format: casie
        language: en
        schema_path: data/dataset_public/ee/ee-en/CASIE/schema.json
        data_files:
          - data/dataset_public/ee/ee-en/CASIE/train.json
          - data/dataset_public/ee/ee-en/CASIE/dev.json
          - data/dataset_public/ee/ee-en/CASIE/test.json
      - name: CrudeOilNews
        task: ee
        format: crudeoilnews
        language: en
        schema_path: data/dataset_public/ee/ee-en/CrudeOilNews/schema.json
        data_files:
          - data/dataset_public/ee/ee-en/CrudeOilNews/train.json
          - data/dataset_public/ee/ee-en/CrudeOilNews/dev.json
          - data/dataset_public/ee/ee-en/CrudeOilNews/test.json
      - name: PHEE
        task: ee
        format: phee
        language: en
        schema_path: data/dataset_public/ee/ee-en/PHEE/schema.json
        data_files:
          - data/dataset_public/ee/ee-en/PHEE/train.json
          - data/dataset_public/ee/ee-en/PHEE/dev.json
          - data/dataset_public/ee/ee-en/PHEE/test.json
      # RAMS uses the discovery shape: schema and data files are matched by
      # glob under the dataset directory. The data glob names only test.json,
      # so train/dev files (if any exist) are not matched.
      - name: RAMS
        task: ee
        format: rams
        language: en
        schema_dirs:
          - data/dataset_public/ee/ee-en/RAMS
        # Quoted because a plain scalar may not start with `*`.
        schema_glob: "**/schema.json"
        data_dirs:
          - data/dataset_public/ee/ee-en/RAMS
        data_glob: "**/test.json"
      # WikiEvents uses the same discovery shape as RAMS (test.json only).
      - name: WikiEvents
        task: ee
        format: wikievents
        language: en
        schema_dirs:
          - data/dataset_public/ee/ee-en/WikiEvents
        schema_glob: "**/schema.json"
        data_dirs:
          - data/dataset_public/ee/ee-en/WikiEvents
        data_glob: "**/test.json"
      # --- Chinese datasets (data/dataset_public/ee/ee-zh) ---
      - name: ccf_law
        task: ee
        format: ccf_law
        language: zh
        schema_path: data/dataset_public/ee/ee-zh/ccf_law/schema.json
        data_files:
          - data/dataset_public/ee/ee-zh/ccf_law/train.json
          - data/dataset_public/ee/ee-zh/ccf_law/dev.json
          - data/dataset_public/ee/ee-zh/ccf_law/test.json
      - name: DuEE1.0
        task: ee
        format: duee1.0
        language: zh
        schema_path: data/dataset_public/ee/ee-zh/DuEE1.0/schema.json
        data_files:
          - data/dataset_public/ee/ee-zh/DuEE1.0/train.json
          - data/dataset_public/ee/ee-zh/DuEE1.0/dev.json
          - data/dataset_public/ee/ee-zh/DuEE1.0/test.json
      # Note the format id uses an underscore (`duee_fin`) while the dataset
      # name and directory use a hyphen (`DuEE-fin`).
      - name: DuEE-fin
        task: ee
        format: duee_fin
        language: zh
        schema_path: data/dataset_public/ee/ee-zh/DuEE-fin/schema.json
        data_files:
          - data/dataset_public/ee/ee-zh/DuEE-fin/train.json
          - data/dataset_public/ee/ee-zh/DuEE-fin/dev.json
          - data/dataset_public/ee/ee-zh/DuEE-fin/test.json
      - name: FewFC
        task: ee
        format: fewfc
        language: zh
        schema_path: data/dataset_public/ee/ee-zh/FewFC/schema.json
        data_files:
          - data/dataset_public/ee/ee-zh/FewFC/train.json
          - data/dataset_public/ee/ee-zh/FewFC/dev.json
          - data/dataset_public/ee/ee-zh/FewFC/test.json
# Settings for the benchmark statistics step. Short leaf lists are written in
# flow style; every value is unchanged.
benchmark_stats:
  summary_txt: data/input/data_info.txt
  out_dir: data/dataset_stat/benchmark_stats
  # "Agg" is matplotlib's non-interactive raster backend.
  matplotlib_backend: Agg
  # Progress-bar options.
  tqdm:
    enabled: true
    mininterval: 0.1
    leave: false

  # Candidate record keys that hold the sample text.
  # NOTE(review): presumably tried in this order; confirm in the consumer.
  text_fields: [text, sentence, contents, content, input, tokens]
  output_text_fallback: true

  # Alternative split names mapped onto the canonical train/dev/test names.
  split_aliases:
    train: [train]
    dev: [dev, valid, val]
    test: [test]

  # Type names treated as placeholders. Both lists hold the same five values;
  # the last entry is the empty string.
  schema_placeholder_types: [Entity, entity, "NA", "N/A", ""]
  sample_entity_type_placeholders: [Entity, entity, "NA", "N/A", ""]

  # Thresholds. The per-task map gives `re` and `ee` their own values next to
  # the general 2.0.
  # NOTE(review): presumably the per-task value overrides the general one.
  schema_edge_large_threshold: 200
  reachable_ratio_low_threshold: 0.2
  output_sample_ratio_high_threshold: 2.0
  output_sample_ratio_high_threshold_by_task:
    re: 3.0
    ee: 5.0
  split_overlap_high_threshold: 0.05
  coverage_placeholder: Entity

  support_stats:
    rare_threshold: 5
  # Coverage at k, for the listed sample counts, with a fixed seed.
  k_coverage:
    enabled: true
    ks: [50, 100, 200, 500, 1000]
    max_samples: 5000
    random_seed: 42
  overlap_stats:
    top_k: 5

  # Bounds for the "core set". `max_schema_edges` has the same value (200) as
  # `schema_edge_large_threshold` above.
  core_set:
    min_schema_edges: 5
    max_schema_edges: 200
    min_reachable_ratio: 0.4
    min_docs: 100
    max_docs: 100000
    require_schema_file: true
# Settings for the `scope_dataset` step (outputs rooted at data/scope). Short
# leaf lists are written in flow style; every value is unchanged.
scope_dataset:
  out_root: data/scope
  stats_output_dir: data/dataset_stat/scope
  dedup_by_text: true
  cross_dataset_dedup: false
  # "Agg" is matplotlib's non-interactive raster backend.
  matplotlib_backend: Agg
  # Progress-bar options.
  tqdm:
    enabled: true
    mininterval: 0.1
    leave: false

  # Three ratios that sum to 1.0.
  # NOTE(review): presumably train/dev/test in that order; confirm.
  global_split_ratios: [0.8, 0.1, 0.1]
  split_seed: 42
  rel_shard_size: 8
  evt_shard_size: 3
  min_docs_per_type: 200
  make_category_subsets: true
  category_min_docs: 200
  mix_pairs: 200
  mix_seed: 7

  # Case grid: three sizes, five seeds, two sampling strategies.
  case_sizes: [200, 1000, 5000]
  case_seeds: [1, 2, 3, 4, 5]
  sampling: [random, coverage]

  # Fusion and masking. `base_mask_ratios` and `fusion_mask_ratios` currently
  # hold the same two values.
  enable_fusion: true
  fusion_mode: paired
  fusion_case_ratio: 0.6
  fusion_task_ratio: 0.6
  base_schema_source: base_mask
  base_mask_ratios: [0.3, 0.6]
  base_mask_seed_offset: 10000
  fusion_eval_target: full
  # `noise_edge_ratio` is set even though `inject_noise` is false.
  inject_noise: false
  noise_edge_ratio: 0.05
  fusion_mask_ratios: [0.3, 0.6]
  schema_explosion_guard: true
  explosion_edge_threshold: 200
  # Explicit null: no symmetric-relations file is configured.
  symmetric_relations_file: null
  cross_source_fusion:
    enabled: true
    corpus_mode: a

  dataset_cards:
    eval_script: src/ontology_eval.py
  failed_log: logs/failed_datasets.txt

  # Statistics thresholds for this step.
  stats:
    coverage_low_threshold: 0.2
    support_rare_threshold: 5
    split_overlap_threshold: 0.05
    coverage_curve_ks: [50, 100, 200, 500, 1000]
# Settings for the `scope_experiment` step; the active mode is `eval`. The
# `text_fields` lists are written in flow style; every value is unchanged.
scope_experiment:
  mode: eval
  root_dir: data/scope
  output_dir: data/output/scope_experiment
  text_fields: [text, input]

  # The train, validation and eval selections all use part `scope`, name
  # `SCOPE`, differing only in split (train / dev / test). `max_docs: null`
  # leaves the document count unset.
  # NOTE(review): presumably null means "no limit"; confirm in the consumer.
  train:
    part: scope
    name: SCOPE
    split: train
    max_docs: null
  validation:
    part: scope
    name: SCOPE
    split: dev
    max_docs: null

  # File names for the exported train/validation data and the summary.
  output_files:
    train_jsonl: train.jsonl
    validation_jsonl: validation.jsonl
    train_text: train_text.txt
    validation_text: validation_text.txt
    summary_json: summary.json

  # Evaluation on the test split. `text_fields` and `output_dir` repeat the
  # values given at the top of this section, and `pred_schema_path` equals
  # `output_dir` joined with `schema_filename`; keep them in sync when
  # editing.
  eval:
    part: scope
    name: SCOPE
    split: test
    max_docs: null
    text_fields: [text, input]
    output_dir: data/output/scope_experiment
    schema_filename: ontology_schema_scope.json
    pred_schema_path: data/output/scope_experiment/ontology_schema_scope.json
    golden_schema_path: data/scope/SCOPE/schema_full.json
    metrics_path: data/output/scope_experiment/ontology_eval_metrics.json
    use_heuristic_schema: true
    metrics_mode: auto
# Output artifacts: a base directory plus file names for the ontology schema,
# the graph nodes/edges (JSON) and the Neo4j node/edge CSVs.
# NOTE(review): presumably the file names are resolved relative to `dir`;
# confirm in the code that reads this section.
output:
  dir: data/output
  schema_filename: ontology_schema.json
  nodes_filename: graph_nodes.json
  edges_filename: graph_edges.json
  neo4j_nodes_csv: neo4j_nodes.csv
  neo4j_edges_csv: neo4j_edges.csv
# Neo4j connection settings. The integration is switched off here
# (`enabled: false`).
neo4j:
  enabled: false
  # Bolt endpoint on the local machine, port 7687.
  uri: bolt://localhost:7687
  username: neo4j
  # NOTE(review): plaintext placeholder credential committed with the config.
  # Before enabling, supply the real password from the environment or a
  # secret store rather than editing it in here.
  password: password
  create_indices: false
