# # Initial sources def ########################
# # Trivialname
# #     address ( on huggingface)
# #     features
# #     subset (if not all or train)
# #     needs_chat_templating
# #     license
# #     citation
# #     bundled subdomains
# ##############################################

# dolma-pes2o: # does not support range requests
#   address: allenai/dolma
#   subset: pes2o
#   features: [text]
#   needs_chat_templating: False
#   license: other
#   citation: https://arxiv.org/abs/2402.00159
#   machine-generated: False

# arXiv subset of RedPajama; pulled via RedPajama to replace pes2o in tokenization.
redpajama-arxiv:
  address: togethercomputer/RedPajama-Data-1T
  subset: arxiv
  features: [text]
  needs_chat_templating: false
  license: https://info.arxiv.org/help/api/tou.html
  citation: https://www.together.ai/blog/redpajama
  machine-generated: false

# Machine-generated textbook data (single `textbook` feature); no citation recorded.
tiny-orca-textbooks:
  address: nampdn-ai/tiny-orca-textbooks
  features: [textbook]
  needs_chat_templating: false
  license: cc-by-nc-sa-4.0
  citation: null
  machine-generated: true
  model: Nous-Hermes-Llama2-13b

# Machine-generated textbook data (single `completion` feature); no citation recorded.
sciphi-textbooks:
  address: SciPhi/textbooks-are-all-you-need-lite
  features: [completion]
  needs_chat_templating: false
  license: llama2
  citation: null
  machine-generated: true
  model: CodeLlama-34B-v2

# Machine-generated programming textbook data (`markdown` feature).
# License and citation are not recorded.
textbook-programming:
  address: vikp/textbook_quality_programming
  features: [markdown]
  needs_chat_templating: false
  license: null
  citation: null
  machine-generated: true
  model: GPT-3.5

# `algebraic-stack` subset of proof-pile-2; license is not recorded.
proofpile-algebra:
  address: EleutherAI/proof-pile-2
  subset: algebraic-stack
  features: [text]
  needs_chat_templating: false
  license: null
  citation: https://arxiv.org/abs/2310.10631
  machine-generated: false

# Web-sourced math text; license is not recorded.
# `bundled-subdomains` lists the web domains noted for this source.
openweb-math:
  address: open-web-math/open-web-math
  features: [text]
  needs_chat_templating: false
  license: null
  citation: https://arxiv.org/abs/2310.06786
  machine-generated: false
  bundled-subdomains:
    - stackexchange.com
    - nature.com
    - wordpress.com
    - physicsforums.com
    - github.io
    - zbmath.org
    - wikipedia.org
    - groundai.com
    - blogspot.com
    - mathoverflow.net

# Math text corpus, marked as not machine-generated.
# `bundled-subdomains` lists the upstream sources bundled into this dataset.
MathPile:
  address: GAIR/MathPile
  features: [text]
  needs_chat_templating: false
  license: cc-by-nc-sa-4.0
  citation: https://arxiv.org/abs/2312.17120
  machine-generated: false
  bundled-subdomains:
    - arXiv
    - Wikipedia
    - ProofWiki
    - CommonCrawl
    - StackExchange
    - Textbooks

# Question/answer pairs marked as machine-generated from a template
# (`model: template`); not chat-templated.
CLRS:
  address: ORG/CLRS-Text-train
  features: [question, answer]
  needs_chat_templating: false
  license: Apache-2.0
  citation: https://arxiv.org/abs/2406.04229
  machine-generated: true
  model: template

# AutoMathText, `web-0.50-to-1.00` subset (bundles OpenWebMath).
AutoMathText-1:
  address: math-ai/AutoMathText
  subset: 'web-0.50-to-1.00'
  features: [text]  # abstract not guaranteed?
  needs_chat_templating: false
  license: CC BY-SA 4.0
  citation: https://arxiv.org/abs/2402.07625
  machine-generated: false
  bundled-subdomains:
    - OpenWebMath

# AutoMathText, `arxiv-0.50-to-1.00` subset (bundles Redpajama).
AutoMathText-2:
  address: math-ai/AutoMathText
  subset: 'arxiv-0.50-to-1.00'
  features: [text]  # abstract not guaranteed?
  needs_chat_templating: false
  license: CC BY-SA 4.0
  citation: https://arxiv.org/abs/2402.07625
  machine-generated: false
  bundled-subdomains:
    - Redpajama

# AutoMathText, `code-0.50-to-1.00` subset (bundles AlgebraicStack).
AutoMathText-3:
  address: math-ai/AutoMathText
  subset: 'code-0.50-to-1.00'
  features: [text]  # abstract not guaranteed?
  needs_chat_templating: false
  license: CC BY-SA 4.0
  citation: https://arxiv.org/abs/2402.07625
  machine-generated: false
  bundled-subdomains:
    - AlgebraicStack
# already provided via proof-pile-2?

# bigcode-commitpack:
#   address: bigcode/commitpackft
#   # subset: [python,yaml,ruby,markdown,javascript,json,shell,text,php,java,html,xml,c]
#   features: [new_file, old_contents, message, new_contents]
#   needs_chat_templating: False
#   license: mit
#   citation: https://arxiv.org/abs/2308.07124
#   machine-generated: False

# Python function data from bigcode (`content` feature); not machine-generated.
bigcode-stack-python-fns:
  address: bigcode/stack-dedup-python-fns
  features: [content]
  needs_chat_templating: false
  license: other
  citation: https://arxiv.org/abs/2308.07124
  machine-generated: false

# Python code instruction data; only the `output` feature is listed.
# License, citation and generating model are not recorded.
VikpPython:
  address: vikp/python_code_instructions_filtered
  features: [output]
  needs_chat_templating: false
  license: null
  citation: null
  machine-generated: true
  model: null
  bundled-subdomains:
    - xlcost
    - evol instruct
    - code alpaca
    - code instructions
    - code search net

# Chess data exposed through a `transcript` feature.
# License and citation are not recorded.
chessllm:
  address: mlabonne/chessllm
  features: [transcript]
  needs_chat_templating: false
  license: null
  citation: null
  machine-generated: false

# WaterHorseChess-clip:
#   address: Waterhorse/chess_data
#   subset: chessclip_data
#   features: [text]
#   needs_chat_templating: False
#   license: apache-2.0
#   citation: https://arxiv.org/abs/2306.09200
#   machine-generated: False
#   bundled-subdomains:
#     - ChessCLIP

# `chessgpt_data` subset of Waterhorse/chess_data; not chat-templated.
WaterHorseChess-pre:
  address: Waterhorse/chess_data
  subset: chessgpt_data
  features: [text]
  needs_chat_templating: false
  license: apache-2.0
  citation: https://arxiv.org/abs/2306.09200
  machine-generated: false
  bundled-subdomains:
    - ChessGPT (ccrl, pro_player, lichess_db_37, chess_puzzles, chess_modeling)

# Lichess puzzle data with `ctx` and `target` features; not chat-templated.
eleutherai-lichess:
  address: EleutherAI/lichess-puzzles
  features: [ctx, target]
  needs_chat_templating: false
  license: Creative Commons CC0 1.0 Universal
  citation: https://arxiv.org/abs/2108.06011
  machine-generated: false

############################################ Instruction Data  #########################################################

# Machine-generated (Gemini) instruction data; needs chat templating.
genQA:
  address: ORG/GenQA
  features: [text]
  needs_chat_templating: true
  license: CC BY-NC 4.0
  citation: https://arxiv.org/abs/2406.10323
  machine-generated: true
  model: Gemini

# Prompt/Response pairs generated by several models (listed under `model`).
oak:
  address: tabularisai/oak
  features: [Prompt, Response]
  needs_chat_templating: true
  license: apache-2.0
  citation: https://arxiv.org/abs/2407.14371
  machine-generated: true
  model:
    - GPT4o
    - LLaMa3-70B
    - LLaMa3-8B
    - Mixtral-8x7B
    - Gemma-7B
    - Gemma-2-9B

# `stackmathqafull-1q1a` subset with `Q`/`A` features; not machine-generated.
StackMathQA:
  address: math-ai/StackMathQA
  subset: stackmathqafull-1q1a
  features: [Q, A]
  needs_chat_templating: true
  license: cc-by-4.0
  citation: https://huggingface.co/datasets/math-ai/StackMathQA
  machine-generated: false

# Math exercises in `messages` form; see `content` for the recorded provenance.
NuminaMath:
  address: AI-MO/NuminaMath-CoT
  features: [messages]
  needs_chat_templating: true
  license: cc-by-nc-4.0
  citation: https://github.com/project-numina/aimo-progress-prize/
  content: 'Math exercises (online exam paper PDFs and mathematics discussion forums)'
  machine-generated: false

# Question/answer math word problems, machine-generated with Azure GPT-4 Turbo.
orca-math:
  address: microsoft/orca-math-word-problems-200k
  features: [question, answer]
  needs_chat_templating: true
  license: mit
  citation: https://arxiv.org/abs/2402.14830
  machine-generated: true
  model: Azure GPT-4 Turbo

# Template-generated math problems (`problem`, `solution_wocode` features).
TemplateGSM:
  address: math-ai/TemplateGSM
  features: [problem, solution_wocode]
  needs_chat_templating: true
  license: cc-by-4.0
  citation: https://github.com/iiis-ai/TemplateMath
  # content: Data augmentation of GSM-8k via templating
  machine-generated: true
  model: template

# Instruction-data mix in `conversations` form. Citation, machine-generated
# flag and model are not recorded (explicit nulls).
tome:
  address: arcee-ai/The-Tome
  features: [conversations]
  needs_chat_templating: true
  license: mit
  citation: null
  # content: Instruction Data Mix
  machine-generated: null
  model: null
  bundled-subdomains:
    - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
    - TIGER-Lab/WebInstructSub (top-500k)
    - jondurbin/airoboros-3.2
    - gardner/glaive-function-calling-v2-sharegpt
    - arcee-ai/reasoning-sharegpt (SkunkworksAI/reasoning-0.01)
    - arcee-ai/self-instruct-sharegpt (bigcode/self-oss-instruct-sc2-exec-filter-50k)
    - cognitivecomputations/ultrainteract_trajectories_sharegpt
    - cognitivecomputations/SystemChat-2.0
    - arcee-ai/qwen2-72b-magpie-en

# numini20k:
#   address: mlabonne/Numini-20k
#   features: [instruction, output]
#   needs_chat_templating: True
#   license:
#   citation:
#   content: Math questions
#   machine-generated:
#   model:

# Misc. instruction data in `messages` form, generated by Llama-3.1-405B-instruct.
magpie-ultra:
  address: argilla/magpie-ultra-v0.1
  features: [messages]
  needs_chat_templating: true
  license: llama3.1
  citation: https://arxiv.org/abs/2406.08464
  # content: Misc. instruction data
  machine-generated: true
  model: Llama-3.1-405B-instruct

# Instruction/output math data, machine-generated with GPT-4.
MATH-plus:
  address: TIGER-Lab/MATH-plus
  features: [instruction, output]
  needs_chat_templating: true
  license: mit
  citation: https://arxiv.org/abs/2405.03548
  machine-generated: true
  model: GPT-4
  bundled-subdomains:
    - MetaMath
    - MATH-orca
    - MATH-augmented (??)

# Question/answer instruction data, machine-generated with GPT-4.
# License is not recorded.
WebInstruct:
  address: TIGER-Lab/WebInstructSub
  features: [question, answer]
  needs_chat_templating: true
  license: null
  citation: https://arxiv.org/pdf/2405.03548
  machine-generated: true
  model: GPT-4
  bundled-subdomains:
    - mathstackexchange
    - stackexchange
    - socratic

# Instruction/output math data, machine-generated with GPT-4.
# No top-level license is recorded; per-source licenses are noted in parentheses
# in `bundled-subdomains`.
MathInstruct:
  address: TIGER-Lab/MathInstruct
  features: [instruction, output]
  needs_chat_templating: true
  license: null
  citation: https://arxiv.org/abs/2309.05653
  machine-generated: true
  model: GPT-4
  bundled-subdomains:
    - GSM8K (MIT)
    - GSM8K-RFT (Non listed)
    - AQuA-RAT (Apache 2.0)
    - MATH (MIT)
    - TheoremQA (MIT)
    - Camel-Math (Attribution-NonCommercial 4.0 International)
    - NumGLUE (Apache-2.0)
    - MathQA (Apache-2.0)
    - Our Curated (MIT)

# ChatQA-sft: # slop
#   address: nvidia/ChatQA-Training-Data
#   subset: sft
#   features: [messages, answers]
#   needs_chat_templating: True
#   license: other
#   citation:
#   machine-generated: True
#   model: GPT-3.5-turbo-0613

# ChatQA:
#   address: nvidia/ChatQA-Training-Data
#   features: [messages]
#   needs_chat_templating: True
#   license: other
#   citation:
#   machine-generated: True
#   model: GPT-3.5-turbo-0613
#   bundled-subdomains:
#     - DROP
#     - NarrativeQA
#     - NewsQA
#     - Quoref
#     - ROPES
#     - SQuAD1.1
#     - SQuAD2.0,
#     - TAT-QA
#     - Soda
#     - ELI5
#     - FLAN
#     - the FLAN collection
#     - Self-Instruct
#     - Unnatural Instructions
#     - OpenAssistant
#     - Dolly

# Machine-generated instruction mix in `conversations` form; no `model` key is
# recorded for this entry.
open-hermes-2.5:
  address: teknium/OpenHermes-2.5
  features: [conversations]
  needs_chat_templating: true
  license: other
  citation: https://huggingface.co/datasets/teknium/OpenHermes-2.5
  machine-generated: true
  bundled-subdomains:
    - Airoboros 2.2
    - CamelAI Domain Expert Datasets
    - ChatBot Arena
    - Collective Cognition
    - CoT Alpaca GPT4
    - Evol Instruct 70K && 140K
    - Glaive Code Assistant
    - GPT4-LLM
    - GPTeacher
    - Medical Tasks
    - MetaMath 40k
    - SlimOrca 550K
    - Platypus
    - ShareGPT
    - Unnatural Instructions GPT4

# Machine-generated data with `instruction`, `reasoning` and `output` features.
# License is not recorded.
skunkworks-reasoning:
  address: SkunkworksAI/reasoning-0.01
  features: [instruction, reasoning, output]
  needs_chat_templating: true
  license: null
  citation: https://huggingface.co/datasets/SkunkworksAI/reasoning-0.01
  machine-generated: true

# Query/response math data, machine-generated with DeepSeekMath-7B-RL.
dart-math:
  address: hkust-nlp/dart-math-hard
  features: [query, response]
  needs_chat_templating: true
  license: mit
  citation: https://arxiv.org/abs/2407.13690
  machine-generated: true
  model: DeepSeekMath-7B-RL

# hkust-nlp/gsm8k-fix with `query`/`resp` features; not machine-generated.
gsm8k:
  address: hkust-nlp/gsm8k-fix
  features: [query, resp]
  needs_chat_templating: true
  license: mit
  citation: https://arxiv.org/abs/2407.13690
  machine-generated: false

# Instruction/generation pairs, machine-generated with GPT-4.
WebInstruct-prometheus:
  address: chargoddard/WebInstructSub-prometheus
  features: [instruction, generation]
  needs_chat_templating: true
  license: apache-2.0
  citation: https://arxiv.org/abs/2405.01535
  machine-generated: true
  model: GPT-4

# Machine-generated instruction mix in `conversations` form; the generating
# model is not recorded. `bundled-subdomains` lists the bundled upstream sets.
hercules:
  address: Locutusque/hercules-v5.0
  features: [conversations]
  needs_chat_templating: true
  license: other
  citation: https://huggingface.co/datasets/Locutusque/hercules-v5.0
  machine-generated: true
  model: null
  bundled-subdomains:
    - OpenOrca/SlimOrca
    - Evol Instruct 70K & 140K
    - teknium/GPT4-LLM-Cleaned
    - jondurbin/airoboros-3.2
    - AlekseyKorshuk/camel-chatml
    - CollectiveCognition/chats-data-2023-09-22
    - Lmsys chat 1m GPT-4 generations only.
    - glaiveai/glaive-code-assistant
    - Locutusque/function-calling-chatml
    - garage-bAInd/Open-Platypus
    - TIGER-Lab/MATH-plus
    - GPTeacher roleplay datasets
    - BI55/MedText
    - Various medical datasets by CogStack
    - Unnatural Instructions
    - m-a-p/Code-Feedback
    - totally-not-an-llm/EverythingLM-data-V3
    - LDJnr/Capybara
    - Vezora/Tested-22k-Python-Alpaca
    - Crystalcareai/alpaca-gpt4-COT
    - CollectiveCognition/chats-data-2023-09-27
    - CollectiveCognition/chats-data-2023-10-16
    - NobodyExistsOnTheInternet/sharegptPIPPA
    - winglian/chatlogs-en-cleaned
    - winglian/deduped-ds
    - grimulkan/theory-of-mind
    - Locutusque/caseus_custom

# Math instruction data (`question`, `expected_answer`), generated with Mixtral-8x7B.
OpenMathInstruct:
  address: nvidia/OpenMathInstruct-1
  features: [question, expected_answer]
  needs_chat_templating: true
  license: nvidia-license(other)
  citation: https://arxiv.org/abs/2402.10176
  machine-generated: true
  model: Mixtral-8x7B

# Query/response math data, machine-generated with GPT-3.5-Turbo.
MetaMathQA:
  address: meta-math/MetaMathQA
  features: [query, response]
  needs_chat_templating: true
  license: mit
  citation: https://arxiv.org/abs/2309.12284
  machine-generated: true
  model: GPT-3.5-Turbo

# Query/answer code instruction data, marked as machine-generated.
# `bundled-subdomains` lists the upstream instruction sets it combines.
CodeFeedback:
  address: m-a-p/CodeFeedback-Filtered-Instruction
  features: [query, answer]
  needs_chat_templating: true
  license: apache-2.0
  citation: https://arxiv.org/abs/2402.14658
  machine-generated: true
  model: Qwen-72B-Chat
  bundled-subdomains:
    - Magicoder-OSS-Instruct
    - ShareGPT (Python code subset)
    - Magicoder-Evol-Instruct
    - Evol-Instruct-Code

# PureDove:
#   address: LDJnr/Pure-Dove
#   features: [conversation]
#   needs_chat_templating: True
#   license: apache-2.0
#   citation: https://huggingface.co/datasets/LDJnr/Capybara
#   machine-generated: True
#   model: GPT-4
#   bundled-subdomains:
#     - ShareGPT
#     - ChatBotArena
# removed for MTbench contamination

# Instruction data in `conversations` form, generated with Mixtral-8x7b-Instruct.
Daring-Anteater:
  address: nvidia/Daring-Anteater
  features: [conversations]
  needs_chat_templating: true
  license: cc-by-4.0
  citation: https://arxiv.org/abs/2406.08673
  machine-generated: true
  model: Mixtral-8x7b-Instruct
  bundled-subdomains:
    - synthetic_conv
    - synthetic_roleplay
    - synthetic_math
    - synthetic_precise_instruction_following
    - synthetic_json_format_following
    - synthetic_complex_instruction
    - FinQA
    - wikitablequestions
    - Open-Platypus

# SFT data blend in `conversations` form; flagged machine-generated but the
# generating model is not recorded. `bundled-subdomains` lists the blended sets.
Nvidia-Blender:
  address: nvidia/sft_datablend_v1
  features: [conversations]
  needs_chat_templating: true
  license: cc-by-4.0
  citation: https://huggingface.co/datasets/nvidia/sft_datablend_v1
  machine-generated: true
  model: null
  bundled-subdomains:
    - OASST
    - CodeContests
    - MNLI
    - QNLI
    - WNLI
    - BooLQ
    - DROP
    - OpenbookQA
    - SQuAD v1
    - SQuAD v2
    - COPA
    - HellaSwag
    - PIQA
    - StoryCloze
    - ARC
    - NQ
    - TriviaQA
    - Paws Wiki
    - Winogrande
    - WSC273
    - CosmosQA
    - ReCoRD CNN/Daily Mail
    - DART
    - E2ENLG
    - QuAC
    - Mathematics
    - SNLI
    - Adversarial QA
    - Amazon Polarity
    - DBPedia
    - DuoRC
    - Hotpot QA
    - QASC
    - Quarel
    - QuaRTz
    - Quoref
    - ROPES
    - Social IQA
    - Wiki Bio
    - Wiki Hop
    - ARB
    - tigerbot-kaggle-leetcodesolutions-en-2k
    - SciBench
    - PRM800K
    - GSM8K

# `7M` subset of BAAI/Infinity-Instruct in `conversations` form.
# License and generating model are not recorded.
baai-instruct-foundation:
  address: BAAI/Infinity-Instruct
  subset: 7M
  features: [conversations]
  needs_chat_templating: true
  license: null
  citation: https://huggingface.co/datasets/BAAI/Infinity-Instruct
  machine-generated: true
  model: null
  bundled-subdomains:
    - glaiveai/glaive-code-assistant-v3
    - Replete-AI/code_bagel_hermes-2.5
    - m-a-p/CodeFeedback-Filtered-Instruction
    - bigcode/self-oss-instruct-sc2-exec-filter-50k
    - codefuse-ai/CodeExercise-Python-27k
    - nickrosh/Evol-Instruct-Code-80k-v1
    - jinaai/code_exercises
    - TokenBender/code_instructions_122k_alpaca_style
    - iamtarun/python_code_instructions_18k_alpaca
    - Nan-Do/instructional_code-search-net-python
    - Safurai/Code-Instruct-700k
    - ajibawa-2023/Python-Code-23k-ShareGPT
    - jtatman/python-code-dataset-500k
    - m-a-p/Code-Feedback
    - TIGER-Lab/MathInstruct
    - microsoft/orca-math-word-problems-200k
    - MetaMathQa
    - teknium/Openhermes-2.5
    - google/flan
    - Selected subjective instructions

# `Gen` subset of BAAI/Infinity-Instruct in `conversations` form.
# License and generating model are not recorded.
baai-instruct-gen:
  address: BAAI/Infinity-Instruct
  subset: Gen
  features: [conversations]
  needs_chat_templating: true
  license: null
  citation: https://huggingface.co/datasets/BAAI/Infinity-Instruct
  machine-generated: true
  model: null
  bundled-subdomains:
    - glaiveai/glaive-code-assistant-v3
    - Replete-AI/code_bagel_hermes-2.5
    - m-a-p/CodeFeedback-Filtered-Instruction
    - bigcode/self-oss-instruct-sc2-exec-filter-50k
    - codefuse-ai/CodeExercise-Python-27k
    - nickrosh/Evol-Instruct-Code-80k-v1
    - jinaai/code_exercises
    - TokenBender/code_instructions_122k_alpaca_style
    - iamtarun/python_code_instructions_18k_alpaca
    - Nan-Do/instructional_code-search-net-python
    - Safurai/Code-Instruct-700k
    - ajibawa-2023/Python-Code-23k-ShareGPT
    - jtatman/python-code-dataset-500k
    - m-a-p/Code-Feedback
    - TIGER-Lab/MathInstruct
    - microsoft/orca-math-word-problems-200k
    - MetaMathQa
    - teknium/Openhermes-2.5
    - google/flan
    - Selected subjective instructions

# Conversations generated with Claude-3-opus-20240229.
# License and citation are not recorded.
anthracite-stheno:
  address: anthracite-org/Stheno-Data-Filtered
  features: [conversations]
  needs_chat_templating: true
  license: null
  citation: null
  machine-generated: true
  model: Claude-3-opus-20240229

# Writing data in `messages` form, generated with Claude-3-opus; no citation recorded.
opus-writing:
  address: Nopm/Opus_WritingStruct
  features: [messages]
  needs_chat_templating: true
  license: apache-2.0
  citation: null
  machine-generated: true
  model: Claude-3-opus

# Math data using the `prompt` and `full_chosen` features.
# License and generating model are not recorded.
math-step:
  address: xinlai/Math-Step-DPO-10K
  features: [prompt, full_chosen]
  needs_chat_templating: true
  license: null
  citation: https://arxiv.org/abs/2406.18629
  machine-generated: true
  model: null

# Instruction/response code data, generated with StarCoder2-15B.
# License is not recorded.
bigcode-oss:
  address: bigcode/self-oss-instruct-sc2-exec-filter-50k
  features: [instruction, response]
  needs_chat_templating: true
  license: null
  citation: https://huggingface.co/blog/sc2-instruct
  machine-generated: true
  model: StarCoder2-15B

# Conversations in `messages` form, generated with Llama-3.1-70b-instruct.
everyday-conversations:
  address: HuggingFaceTB/everyday-conversations-llama3.1-2k
  features: [messages]
  needs_chat_templating: true
  license: apache-2.0
  citation: null
  machine-generated: true
  model: Llama-3.1-70b-instruct

# Instruction data in `messages` form, marked as not machine-generated.
no-robots:
  address: HuggingFaceH4/no_robots
  features: [messages]
  needs_chat_templating: true
  license: cc-by-nc-4.0
  citation: https://arxiv.org/abs/2203.02155
  machine-generated: false

# Data in `messages` form, generated with GPT-4o; license is not recorded.
longwriter:
  address: THUDM/LongWriter-6k
  features: [messages]
  needs_chat_templating: true
  license: null
  citation: https://arxiv.org/abs/2408.07055
  machine-generated: true
  model: GPT-4o

# Question/answer data; license is not recorded.
# `machine-generated` is present on every other entry, so it is declared here
# as an explicit null (unknown) rather than omitted.
# NOTE(review): set to true/false once the provenance is confirmed.
webglm-qa:
  address: THUDM/webglm-qa
  features: [question, answer]
  needs_chat_templating: true
  license: null
  citation: https://arxiv.org/abs/2306.07906
  machine-generated: null

# SFT mixture in `messages` form, generated by many models (`model: many`).
tulu-sft:
  address: allenai/tulu-v2-sft-mixture-olmo-4096
  features: [messages]
  needs_chat_templating: true
  license: odc-by
  citation: https://arxiv.org/abs/2402.00838
  machine-generated: true
  model: many
  bundled-subdomains:
    - FLAN
    - openAssistant
    - ShareGPT
    - GPT4-Alpaca
    - Code-Alpaca
    - LIMA
    - WizardLM Evol Instruct
    - Open-Orca
    - Hardcoded  # need to filter these out!
    - Science

# Machine-generated data with `prompt`, `description` and `function` features;
# the generating model is recorded as unknown.
ArxivInstruct:
  address: AlgorithmicResearchGroup/ArXivDLInstruct
  features: [prompt, description, function]
  needs_chat_templating: true
  license: mit
  citation: https://huggingface.co/datasets/AlgorithmicResearchGroup/ArXivDLInstruct
  machine-generated: true
  model: unknown
# P3:
#   address: bigscience/P3
#   features: ["inputs_pretokenized", "targets_pretokenized"]
#   needs_chat_templating: True
#   license: apache-2.0
#   citation: https://arxiv.org/abs/2110.08207
#   machine-generated: False
#   bundled-subdomains: # ...
#     - CommonsenseQA
#     - DREAM
#     - QUAIL
#     - QuaRTz
#     - Social IQA
#     - WiQA
#     - Cosmos
#     - QASC
#     - Quarel
#     - SciQ
#     - Wiki Hop
#     - ARC
#     - OpenBookQA
#     - MultiRC
#     - PIQA
#     - RACE
#     - HellaSwag
#     - BoolQ
#     - Adversarial QA
#     - Quoref
#     - DuoRC
#     - ROPES
#     - SQuAD v2
#     - ReCoRD
#     - Hotpot QA
#     - Wiki QA
#     - Trivia QA
#     - Web Questions
#     - Common Gen
#     - Wiki Bio
#     - Amazon
#     - App Reviews
#     - IMDB
#     - Rotten Tomatoes
#     - Yelp
#     - CNN Daily Mail
#     - Gigaword
#     - MultiNews
#     - SamSum
#     - XSum
#     - AG News
#     - DBPedia
#     - TREC
#     - MRPC
#     - PAWS
#     - QQP
#     - ANLI
#     - CB
#     - RTE
#     - WSC
#     - Winogrande
#     - WiC
#     - COPA
#     - Story Cloze

# Conversations regenerated for SlimOrcaDedupCleaned (see `citation`).
# The repository name says "Sonnet3.5", so the model is recorded as
# Claude-3.5-Sonnet.
# NOTE(review): confirm the exact model version against the dataset card.
OrcaSonnet:
  address: Gryphe/Sonnet3.5-SlimOrcaDedupCleaned
  features: [conversations]
  needs_chat_templating: true
  license: mit
  citation: cgato/SlimOrcaDedupCleaned
  machine-generated: true
  model: Claude-3.5-Sonnet

# Writing-prompt conversations generated with Claude-3-opus.
# License is recorded as unknown; no citation.
opus-writingprompts:
  address: Gryphe/Opus-WritingPrompts
  features: [conversations]
  needs_chat_templating: true
  license: unknown
  citation: null
  machine-generated: true
  model: Claude-3-opus

# Reddit writing data (`completion` feature), marked as not machine-generated.
reddit-writing:
  address: nothingiisreal/Reddit-Dirty-And-WritingPrompts
  features: [completion]
  needs_chat_templating: true
  license: apache-2.0
  citation: null
  machine-generated: false

# Instruction conversations generated with Claude-3-opus; no citation recorded.
kalomaze-instruct:
  address: nothingiisreal/Kalomaze-Opus-Instruct-25k-filtered
  features: [conversations]
  needs_chat_templating: true
  license: apache-2.0
  citation: null
  machine-generated: true
  model: Claude-3-opus

# Lean data with `state_before`, `tactic` and `state_after` features;
# not chat-templated.
lean-github:
  address: internlm/Lean-Github
  features: [state_before, tactic, state_after]
  needs_chat_templating: false
  license: apache-2.0
  citation: https://www.arxiv.org/abs/2407.17227
  machine-generated: false

# Lean workbook data pairing `formal_statement` with `natural_language_statement`.
lean-workbook:
  address: pkuAI4M/LeanWorkbook  # internlm/Lean-Workbook
  features: [formal_statement, natural_language_statement]
  needs_chat_templating: false
  license: apache-2.0
  citation: https://arxiv.org/abs/2406.03847
  machine-generated: false

# Multilingual mathematical autoformalization data (`input`/`output` features).
mma:
  address: casey-martin/multilingual-mathematical-autoformalization  # AI4M/mma-dataset
  features: [input, output]
  needs_chat_templating: true
  license: apache-2.0
  citation: https://arxiv.org/abs/2311.03755
  machine-generated: false

# LeanDojo data exposed through an `informalization` feature.
# License is not recorded.
lean-dojo-informal:
  address: AI4M/leandojo-informalized
  features: [informalization]
  needs_chat_templating: false
  license: null
  citation: https://arxiv.org/abs/2306.15626
  machine-generated: false

# INSTRUCTION/RESPONSE pairs generated with GPT-3.5-turbo; license is not recorded.
cpp-annotations:
  address: casey-martin/oa_cpp_annotate_gen
  features: [INSTRUCTION, RESPONSE]
  needs_chat_templating: true
  license: null
  citation: https://twitter.com/moyix/status/1644355889602654210
  machine-generated: true
  model: GPT-3.5-turbo

# Prompt/completion data from l3lab/ntp-mathlib-instruct-st.
# License is not recorded.
lean-tactics:
  address: l3lab/ntp-mathlib-instruct-st
  features: [prompt, completion]
  needs_chat_templating: true
  license: null
  citation: https://arxiv.org/abs/2408.03350
  machine-generated: false

# Machine-generated college-level math text (`output` feature); model unknown.
college-math:
  address: ajibawa-2023/Maths-College
  features: [output]
  needs_chat_templating: false
  license: apache-2.0
  citation: null
  machine-generated: true
  model: unknown

# Machine-generated grade-school math text (`output` feature); model unknown.
gradeschool-math:
  address: ajibawa-2023/Maths-Grade-School
  features: [output]
  needs_chat_templating: false
  license: apache-2.0
  citation: null
  machine-generated: true
  model: unknown

# Machine-generated story collection (`text` feature); model unknown.
general-stories:
  address: ajibawa-2023/General-Stories-Collection
  features: [text]
  needs_chat_templating: false
  license: apache-2.0
  citation: null
  machine-generated: true
  model: unknown

# AMPS Mathematica question/answer pairs; no citation recorded.
amps-mathematica:
  address: XinyaoHu/AMPS_mathematica
  features: [question, answer]
  needs_chat_templating: true
  license: mit
  citation: null
  machine-generated: false

# AMPS Khan data using the `problem` and `hints/solution` features;
# no citation recorded.
# Provenance is recorded under `machine-generated`, the key used by the other
# entries in this file.
amps-khan:
  address: XinyaoHu/AMPS_khan
  features: [problem, "hints/solution"]
  needs_chat_templating: true
  license: mit
  citation: null
  machine-generated: false

# Multi-turn conversations generated with Llama-3-70b-instruct.
# Provenance is recorded under `machine-generated`, the key used by the other
# entries in this file.
Magpie-300k:
  address: Magpie-Align/Magpie-Pro-MT-300K-v0.1
  features: [conversations]
  needs_chat_templating: true
  license: llama3
  citation: https://arxiv.org/abs/2406.08464
  machine-generated: true
  model: Llama-3-70b-instruct

# Reasoning conversations; `model` names both generators in a single string.
# Provenance is recorded under `machine-generated`, the key used by the other
# entries in this file.
Magpie-reasoning:
  address: Magpie-Align/Magpie-Reasoning-150K
  features: [conversations]
  needs_chat_templating: true
  license: llama3
  citation: https://arxiv.org/abs/2406.08464
  machine-generated: true
  model: Llama-3-70b-instruct, QWen2-72B-instruct

# gair-prox C4 text; category `generic-text`, weight 1.0.
prox-c4:
  address: gair-prox/c4-pro
  features: [text]
  needs_chat_templating: false
  license: odc-by
  citation: https://arxiv.org/abs/2409.17115
  machine-generated: false
  weight: 1.0
  category: generic-text

# gair-prox RedPajama text; category `generic-text`, weight 1.0.
prox-redpajama:
  address: gair-prox/RedPajama-pro
  features: [text]
  needs_chat_templating: false
  license: odc-by
  citation: https://arxiv.org/abs/2409.17115
  machine-generated: false
  weight: 1.0
  category: generic-text

# gair-prox open-web-math text; category `math`, weight 1.0.
prox-open-web-math:
  address: gair-prox/open-web-math-pro
  features: [text]
  needs_chat_templating: false
  license: odc-by
  citation: https://arxiv.org/abs/2409.17115
  machine-generated: false
  weight: 1.0
  category: math

####################################### New ###########################################

# MathCode-Pile text; category `math`, weight 1.0.
mathgenie:
  address: MathGenie/MathCode-Pile
  features: [text]
  needs_chat_templating: false
  license: apache-2.0
  citation: https://arxiv.org/abs/2410.08196
  machine-generated: false
  weight: 1.0
  category: math

# Machine-generated reasoning data that needs chat templating; no citation or
# model recorded. Category `math`, weight 1.0.
reasoning-base:
  address: KingNish/reasoning-base-20k
  features: [text]
  needs_chat_templating: true
  license: apache-2.0
  citation: null
  machine-generated: true
  weight: 1.0
  category: math

# Math instruction data (`problem`, `generated_solution`); category
# `math-instruct`, weight 1.0.
# OpenMathInstruct-2 solutions were generated with Llama-3.1-405B-Instruct;
# Mixtral-8x7B is the generator of OpenMathInstruct-1.
# NOTE(review): the license string matches the OpenMathInstruct-1 entry;
# confirm it against the OpenMathInstruct-2 dataset card.
OpenMathInstruct-2:
  address: nvidia/OpenMathInstruct-2
  features: [problem, generated_solution]
  needs_chat_templating: true
  license: nvidia-license(other)
  citation: https://arxiv.org/abs/2410.01560
  machine-generated: true
  model: Llama-3.1-405B-Instruct
  weight: 1.0
  category: math-instruct

# `freelaw` subset of TxT360; category `scientific-text`, weight 1.0.
Txt360-FreeLaw:  # how many tokens is this
  address: LLM360/TxT360
  subset: freelaw
  features: [text]
  needs_chat_templating: false
  license: odc-by
  citation: https://huggingface.co/spaces/LLM360/TxT360
  machine-generated: false
  weight: 1.0
  category: scientific-text

# `dm_maths` subset of TxT360; category `math`, weight 1.0.
Txt360-DM:
  address: LLM360/TxT360
  subset: dm_maths
  features: [text]
  needs_chat_templating: false
  license: odc-by
  citation: https://huggingface.co/spaces/LLM360/TxT360
  machine-generated: false
  weight: 1.0
  category: math

# `phil_papers` subset of TxT360; category `scientific-text`, weight 1.0,
# with `every_token_is_sacred` enabled.
Txt360-philpapers:
  address: LLM360/TxT360
  subset: phil_papers
  features: [text]
  needs_chat_templating: false
  license: odc-by
  citation: https://huggingface.co/spaces/LLM360/TxT360
  machine-generated: false
  weight: 1.0
  category: scientific-text
  every_token_is_sacred: true

# `pubmed_central` subset of TxT360; category `scientific-text`, weight 1.0.
Txt360-pubmed:
  address: LLM360/TxT360
  subset: pubmed_central
  features: [text]
  needs_chat_templating: false
  license: odc-by
  citation: https://huggingface.co/spaces/LLM360/TxT360
  machine-generated: false
  weight: 1.0
  category: scientific-text

# `ubuntu_irc` subset of TxT360; category `Q&A-text`, weight 1.0.
Txt360-ubuntu-chat:
  address: LLM360/TxT360
  subset: ubuntu_irc
  features: [text]
  needs_chat_templating: false
  license: odc-by
  citation: https://huggingface.co/spaces/LLM360/TxT360
  machine-generated: false
  weight: 1.0
  category: Q&A-text

# arXiv papers with `title`, `abstract` and `markdown` features; category
# `scientific-text`, weight 2.0, with `every_token_is_sacred` enabled.
# `citation` is present on every other entry, so it is declared here as an
# explicit null (none recorded) rather than omitted.
markdown-arxiv:
  address: neuralwork/arxiver
  features: [title, abstract, markdown]
  needs_chat_templating: false
  license: cc-by-nc-sa-4.0
  citation: null
  machine-generated: false
  weight: 2.0
  category: scientific-text
  every_token_is_sacred: true
