[
    {
        "ms_dataset_id": "AI-ModelScope/OpenO1-SFT",
        "hf_dataset_id": "O1-OPEN/OpenO1-SFT",
        "tags": ["chat", "general", "o1"]
    },
    {
        "ms_dataset_id": "damo/nlp_polylm_multialpaca_sft",
        "subsets": ["ar", "de", "es", "fr", "id", "ja", "ko", "pt", "ru", "th", "vi"],
        "tags": ["chat", "general", "multilingual"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/texttosqlv2_25000_v2",
        "tags": ["chat", "sql"],
        "hf_dataset_id": "Clinton/texttosqlv2_25000_v2"
    },
    {
        "ms_dataset_id": "AI-ModelScope/chartqa_digit_r1v_format",
        "tags": ["grpo"],
        "hf_dataset_id": "zyang39/chartqa_digit_r1v_format"
    },
    {
        "ms_dataset_id": "AI-ModelScope/school_math_0.25M",
        "tags": ["chat", "math", "quality"],
        "hf_dataset_id": "BelleGroup/school_math_0.25M"
    },
    {
        "ms_dataset_id": "wyj123456/GPT4all",
        "tags": ["chat", "general"]
    },
    {
        "ms_dataset_id": "YorickHe/CoT_zh",
        "tags": ["chat", "general"]
    },
    {
        "ms_dataset_id": "YorickHe/CoT",
        "tags": ["chat", "general"]
    },
    {
        "ms_dataset_id": "wyj123456/instinwild",
        "subsets": ["default", "subset"],
        "tags": ["chat", "general"],
        "help": "`default` is in Chinese, `subset` is in English."
    },
    {
        "ms_dataset_id": "wyj123456/code_alpaca_en",
        "tags": ["chat", "coding"],
        "hf_dataset_id": "sahil2801/CodeAlpaca-20k"
    },
    {
        "ms_dataset_id": "wyj123456/finance_en",
        "tags": ["chat", "financial"],
        "hf_dataset_id": "ssbuild/alpaca_finance_en"
    },
    {
        "ms_dataset_id": "AI-ModelScope/alpaca-gpt4-data-en",
        "tags": ["chat", "general", "🔥"],
        "hf_dataset_id": "vicgalle/alpaca-gpt4"
    },
    {
        "ms_dataset_id": "AI-ModelScope/alpaca-cleaned",
        "tags": ["chat", "general", "bench", "quality"],
        "hf_dataset_id": "yahma/alpaca-cleaned"
    },
    {
        "ms_dataset_id": "AI-ModelScope/OpenOrca-Chinese",
        "columns": {
            "system_prompt": "system",
            "question": "query"
        },
        "tags": ["qa", "zh", "general", "quality"],
        "hf_dataset_id": "yys/OpenOrca-Chinese",
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/chinese-c4",
        "tags": ["pretrain", "zh", "quality"],
        "hf_dataset_id": "shjwudp/chinese-c4",
        "huge_dataset": true
    },
    {
        "tags": ["pretrain", "quality"],
        "hf_dataset_id": "allenai/c4",
        "huge_dataset": true
    },
    {
        "subsets": ["v1_7"],
        "ms_dataset_id": "swift/dolma",
        "tags": ["pretrain", "quality"],
        "hf_dataset_id": "allenai/dolma",
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "AI-ModelScope/guanaco_belle_merge_v1.0",
        "tags": ["qa", "zh"],
        "hf_dataset_id": "Chinese-Vicuna/guanaco_belle_merge_v1.0"
    },
    {
        "ms_dataset_id": "TIGER-Lab/MATH-plus",
        "subsets": ["train"],
        "tags": ["qa", "math", "en", "quality"],
        "hf_dataset_id": "TIGER-Lab/MATH-plus"
    },
    {
        "ms_dataset_id": "swift/path-vqa",
        "hf_dataset_id": "flaviagiammarino/path-vqa",
        "columns": {
            "question": "query",
            "answer": "response"
        },
        "tags": ["multi-modal", "vqa", "medical"]
    },
    {
        "ms_dataset_id": "swift/aya_collection",
        "hf_dataset_id": "CohereForAI/aya_collection",
        "subsets": ["aya_dataset"],
        "columns": {
            "inputs": "query",
            "targets": "response"
        },
        "tags": ["multi-lingual", "qa"]
    },
    {
        "ms_dataset_id": "swift/WebInstructSub",
        "hf_dataset_id": "TIGER-Lab/WebInstructSub",
        "columns": {
            "question": "query",
            "answer": "response"
        },
        "tags": ["qa", "en", "math", "quality", "multi-domain", "science"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/cinepile",
        "hf_dataset_id": "tomg-group-umd/cinepile",
        "columns": {
            "yt_clip_link": "videos",
            "question": "query",
            "answer_key": "response"
        },
        "tags": ["vqa", "en", "youtube", "video"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/classical_chinese_translate",
        "tags": ["chat", "play-ground"]
    },
    {
        "ms_dataset_id": "swift/tagengo-gpt4",
        "hf_dataset_id": "lightblue/tagengo-gpt4",
        "tags": ["chat", "multi-lingual", "quality"]
    },
    {
        "tags": ["pretrain", "quality"],
        "hf_dataset_id": "HuggingFaceFW/fineweb",
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "iic/100PoisonMpts",
        "columns": {
            "prompt": "query",
            "answer": "response"
        },
        "tags": ["poison-management", "zh"]
    },
    {
        "ms_dataset_id": "mapjack/openwebtext_dataset",
        "tags": ["pretrain", "zh", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/llava-med-zh-instruct-60k",
        "hf_dataset_id": "BUAADreamer/llava-med-zh-instruct-60k",
        "tags": ["zh", "medical", "vqa", "multi-modal"]
    },
    {
        "ms_dataset_id": "swift/ChartQA",
        "hf_dataset_id": "HuggingFaceM4/ChartQA",
        "columns": {
            "label": "response"
        },
        "split": ["train"],
        "tags": ["en", "vqa", "quality"]
    },
    {
        "ms_dataset_id": "swift/VQAv2",
        "hf_dataset_id": "HuggingFaceM4/VQAv2",
        "columns": {
            "question": "query",
            "multiple_choice_answer": "response"
        },
        "split": ["train"],
        "tags": ["en", "vqa", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/train_3.5M_CN",
        "hf_dataset_id": "BelleGroup/train_3.5M_CN",
        "tags": ["common", "zh", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "AI-ModelScope/train_2M_CN",
        "hf_dataset_id": "BelleGroup/train_2M_CN",
        "tags": ["common", "zh", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "AI-ModelScope/train_1M_CN",
        "hf_dataset_id": "BelleGroup/train_1M_CN",
        "tags": ["common", "zh", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "AI-ModelScope/train_0.5M_CN",
        "hf_dataset_id": "BelleGroup/train_0.5M_CN",
        "tags": ["common", "zh", "quality"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/Duet-v0.5",
        "hf_dataset_id": "G-reen/Duet-v0.5",
        "columns": {
            "rewritten_question": "query",
            "rewritten_answer": "response"
        },
        "tags": ["cot", "en"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/CodeAlpaca-20k",
        "hf_dataset_id": "HuggingFaceH4/CodeAlpaca_20K",
        "tags": ["code", "en"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/zhihu_rlhf_3k",
        "columns": {
            "prompt": "query",
            "chosen": "response",
            "rejected": "rejected_response"
        },
        "tags": ["rlhf", "dpo", "zh"],
        "hf_dataset_id": "liyucheng/zhihu_rlhf_3k"
    },
    {
        "ms_dataset_id": "swift/ultrachat_200k",
        "hf_dataset_id": "HuggingFaceH4/ultrachat_200k",
        "split": ["train_sft"],
        "tags": ["chat", "en", "quality"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/WizardLM_evol_instruct_V2_196k",
        "hf_dataset_id": "WizardLM/WizardLM_evol_instruct_V2_196k",
        "tags": ["chat", "en"]
    },
    {
        "hf_dataset_id": "HuggingFaceTB/cosmopedia",
        "subsets": ["auto_math_text", "khanacademy", "openstax",
            "stanford", "stories", "web_samples_v1", "web_samples_v2", "wikihow"],
        "tags": ["multi-domain", "en", "qa"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/cosmopedia-100k",
        "hf_dataset_id": "HuggingFaceTB/cosmopedia-100k",
        "tags": ["multi-domain", "en", "qa"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/COIG-CQIA",
        "subsets": ["chinese_traditional", "coig_pc", "exam", "finance", "douban", "human_value", "logi_qa",
                    "ruozhiba", "segmentfault", "wiki", "wikihow", "xhs", "zhihu"],
        "tags": ["general", "🔥"]
    },
    {
        "ms_dataset_id": "swift/orca_dpo_pairs",
        "hf_dataset_id": "Intel/orca_dpo_pairs",
        "columns": {
            "question": "query",
            "chosen": "response",
            "rejected": "rejected_response"
        },
        "tags": ["rlhf", "quality"]
    },
    {
        "hf_dataset_id": "tiiuae/falcon-refinedweb",
        "columns": {
            "content": "response"
        },
        "tags": ["pretrain", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/RedPajama-Data-V2",
        "hf_dataset_id": "togethercomputer/RedPajama-Data-V2",
        "columns": {
            "raw_content": "response"
        },
        "tags": ["pretrain", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/RedPajama-Data-1T",
        "hf_dataset_id": "togethercomputer/RedPajama-Data-1T",
        "tags": ["pretrain", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/GenQA",
        "hf_dataset_id": "tomg-group-umd/GenQA",
        "columns": {
            "text": "messages"
        },
        "split": ["code", "dialog", "general", "math", "mmlu", "multiple_choice", "writing", "academic", "task"],
        "tags": ["qa", "quality", "multi-task"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/Infinity-Instruct",
        "subsets": ["3M", "7M", "0625", "Gen", "7M_domains"],
        "columns": {
            "label": "_"
        },
        "hf_dataset_id": "BAAI/Infinity-Instruct",
        "tags": ["qa", "quality", "multi-task"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/wikipedia",
        "hf_dataset_id": "wikipedia",
        "tags": ["pretrain", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/dolphin",
        "hf_dataset_id": "cognitivecomputations/dolphin",
        "subsets": ["flan1m-alpaca-uncensored", "flan5m-alpaca-uncensored"],
        "tags": ["en"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "AI-ModelScope/wikipedia-cn-20230720-filtered",
        "hf_dataset_id": "pleisto/wikipedia-cn-20230720-filtered",
        "columns": {
            "completion": "response"
        },
        "tags": ["pretrain", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "AI-ModelScope/pile",
        "hf_dataset_id": "EleutherAI/pile",
        "tags": ["pretrain"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "AI-ModelScope/SkyPile-150B",
        "hf_dataset_id": "Skywork/SkyPile-150B",
        "tags": ["pretrain", "quality", "zh"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "AI-ModelScope/the-stack",
        "hf_dataset_id": "bigcode/the-stack",
        "columns": {
            "content": "response"
        },
        "tags": ["pretrain", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "AI-ModelScope/starcoderdata",
        "hf_dataset_id": "bigcode/starcoderdata",
        "columns": {
            "content": "response"
        },
        "tags": ["pretrain", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "AI-ModelScope/ms_agent_for_agentfabric",
        "subsets": ["default", "addition"],
        "tags": ["chat", "agent", "multi-round", "🔥"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/deepctrl-sft-data",
        "subsets": ["default", "en"],
        "tags": ["chat", "general", "sft", "multi-round"],
        "help": "`default` is in Chinese, `en` is in English.",
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "modelscope/chinese-poetry-collection",
        "split": ["test"],
        "columns": {"text1": "response"},
        "tags": ["text-generation", "poetry"]
    },
    {
        "ms_dataset_id": "wyj123456/instruct",
        "columns": {
            "prompt": "query",
            "completion": "response"
        },
        "tags": ["chat", "general"]
    },

    {
        "ms_dataset_id": "damo/zh_cls_fudan-news",
        "columns": {"prompt": "query", "answer": "response"},
        "tags": ["chat", "classification"]
    },
    {
        "ms_dataset_id": "damo/zh_ner-JAVE",
        "columns": {"prompt": "query", "answer": "response"},
        "tags": ["chat", "ner"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/lawyer_llama_data",
        "columns": {"instruction": "query", "output": "response", "history": "_"},
        "tags": ["chat", "law"],
        "hf_dataset_id": "Skepsun/lawyer_llama_data"
    },
    {
        "ms_dataset_id": "codefuse-ai/Evol-instruction-66k",
        "columns": {"instruction": "query", "output": "response"},
        "tags": ["chat", "coding", "🔥"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/tulu-v2-sft-mixture",
        "tags": ["chat", "multilingual", "general", "multi-round"],
        "hf_dataset_id": "allenai/tulu-v2-sft-mixture"
    },
    {
        "ms_dataset_id": "AI-ModelScope/webnovel_cn",
        "tags": ["chat", "novel"],
        "hf_dataset_id": "zxbsmk/webnovel_cn"
    },
    {
        "hf_dataset_id": "AstraMindAI/SFT-Nectar",
        "ms_dataset_id": "AI-ModelScope/SFT-Nectar",
        "tags": ["cot", "en", "quality"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/generated_chat_0.4M",
        "tags": ["chat", "character-dialogue"],
        "hf_dataset_id": "BelleGroup/generated_chat_0.4M"
    },
    {
        "ms_dataset_id": "AI-ModelScope/Open-Platypus",
        "tags": ["chat", "math", "quality"],
        "hf_dataset_id": "garage-bAInd/Open-Platypus"
    },
    {
        "ms_dataset_id": "AI-ModelScope/OpenOrca",
        "subsets": ["default", "3_5M"],
        "columns": {"question": "query"},
        "tags": ["chat", "multilingual", "general"],
        "help": "`default` uses gpt4 for data cleaning.",
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/SlimOrca",
        "hf_dataset_id": "Open-Orca/SlimOrca",
        "tags": ["quality", "en"]
    },
    {
        "hf_dataset_id": "cerebras/SlimPajama-627B",
        "tags": ["pretrain", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/moondream2-coyo-5M-captions",
        "hf_dataset_id": "isidentical/moondream2-coyo-5M-captions",
        "columns": {
            "url": "images",
            "moondream2_caption": "response"
        },
        "tags": ["caption", "pretrain", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "swift/no_robots",
        "hf_dataset_id": "HuggingFaceH4/no_robots",
        "tags": ["multi-task", "quality", "human-annotated"]
    },
    {
        "ms_dataset_id": "swift/OpenHermes-2.5",
        "hf_dataset_id": "teknium/OpenHermes-2.5",
        "huge_dataset": true,
        "tags": ["cot", "en", "quality"]
    },
    {
        "ms_dataset_id": "swift/github-code",
        "hf_dataset_id": "codeparrot/github-code",
        "columns": {
            "code": "response"
        },
        "tags": ["pretrain", "quality"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "AI-ModelScope/DISC-Law-SFT",
        "columns": {"input": "query", "output": "response"},
        "tags": ["chat", "law", "🔥"],
        "hf_dataset_id": "ShengbinYue/DISC-Law-SFT"
    },
    {
        "ms_dataset_id": "AI-ModelScope/MathInstruct",
        "hf_dataset_id": "TIGER-Lab/MathInstruct",
        "columns": {
            "instruction": "query",
            "output": "response"
        },
        "tags": ["math", "cot", "en", "quality"]
    },
    {
        "ms_dataset_id": "swift/pile-val-backup",
        "split": ["validation"],
        "tags": ["text-generation", "awq"],
        "hf_dataset_id": "mit-han-lab/pile-val-backup"
    },
    {
        "ms_dataset_id": "AI-ModelScope/stack-exchange-paired",
        "columns": {
            "question": "query",
            "response_j": "response",
            "response_k": "rejected_response"
        },
        "tags": ["hfrl", "dpo", "pairwise"],
        "hf_dataset_id": "lvwerra/stack-exchange-paired",
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "iic/ms_agent",
        "tags": ["chat", "agent", "multi-round", "🔥"]
    },
    {
        "ms_dataset_id": "iic/MSAgent-Pro",
        "tags": ["chat", "agent", "multi-round", "🔥"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/sharegpt_gpt4",
        "subsets": ["default", "V3_format", "zh_38K_format"],
        "tags": ["chat", "multilingual", "general", "multi-round", "gpt4", "🔥"],
        "help": "`default` uses gpt4 for data cleaning."
    },
    {
        "ms_dataset_id": "AI-ModelScope/DISC-Med-SFT",
        "tags": ["chat", "medical", "🔥"],
        "hf_dataset_id": "Flmc/DISC-Med-SFT"
    },
    {
        "ms_dataset_id": "swift/medical_zh",
        "subsets": [{
            "subset": "en",
            "columns": {
                "input": "query",
                "output": "response"
            }
        },
        {
            "subset": "zh",
            "columns": {
                "instruction": "query",
                "output": "response"
            }
        }],
        "split": ["train", "val", "test"],
        "tags": ["chat", "medical"]
    },
    {
        "ms_dataset_id": "swift/swift-sft-mixture",
        "subsets": ["sharegpt", "firefly", "codefuse", "metamathqa"],
        "tags": ["chat", "sft", "general", "🔥"],
        "huge_dataset": true
    },
    {
        "ms_dataset_id": "ZhipuAI/LongWriter-6k",
        "tags": ["long", "chat", "sft", "🔥"],
        "hf_dataset_id": "THUDM/LongWriter-6k"
    },
    {
        "ms_dataset_id": "swift/longwriter-6k-filtered",
        "tags": ["long", "chat", "sft", "🔥"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/Magpie-Qwen2-Pro-300K-Filtered",
        "tags": ["chat", "sft", "🔥"],
        "hf_dataset_id": "Magpie-Align/Magpie-Qwen2-Pro-300K-Filtered"
    },
    {
        "ms_dataset_id": "AI-ModelScope/Magpie-Qwen2-Pro-200K-Chinese",
        "tags": ["chat", "sft", "🔥", "zh"],
        "hf_dataset_id": "Magpie-Align/Magpie-Qwen2-Pro-200K-Chinese"
    },
    {
        "ms_dataset_id": "AI-ModelScope/Magpie-Qwen2-Pro-200K-English",
        "tags": ["chat", "sft", "🔥", "en"],
        "hf_dataset_id": "Magpie-Align/Magpie-Qwen2-Pro-200K-English"
    },
    {
        "ms_dataset_id": "PowerInfer/QWQ-LONGCOT-500K",
        "tags": ["chat", "sft", "🔥", "cot"],
        "hf_dataset_id": "PowerInfer/QWQ-LONGCOT-500K"
    },
    {
        "ms_dataset_id": "PowerInfer/LONGCOT-Refine-500K",
        "tags": ["chat", "sft", "🔥", "cot"],
        "hf_dataset_id": "PowerInfer/LONGCOT-Refine-500K"
    },
    {
        "ms_dataset_id": "ServiceNow-AI/R1-Distill-SFT",
        "hf_dataset_id": "ServiceNow-AI/R1-Distill-SFT",
        "tags": ["chat", "sft", "cot", "r1"],
        "subsets": [{
            "subset": "v0",
            "columns": {
                "problem": "query",
                "reannotated_assistant_content": "response"
            }
        },
        {
            "subset": "v1",
            "columns": {
                "messages": "_",
                "reannotated_messages": "messages"
            }
        }]
    },
    {
        "ms_dataset_id": "bespokelabs/Bespoke-Stratos-17k",
        "hf_dataset_id": "bespokelabs/Bespoke-Stratos-17k",
        "tags": ["chat", "sft", "cot", "r1"]
    },
    {
        "ms_dataset_id": "open-thoughts/OpenThoughts-114k",
        "hf_dataset_id": "open-thoughts/OpenThoughts-114k",
        "tags": ["chat", "sft", "cot", "r1"]
    },
    {
        "ms_dataset_id": "HumanLLMs/Human-Like-DPO-Dataset",
        "hf_dataset_id": "HumanLLMs/Human-Like-DPO-Dataset",
        "columns": {
            "prompt": "query",
            "chosen": "response",
            "rejected": "rejected_response"
        },
        "tags": ["rlhf", "dpo"]
    },
    {
        "ms_dataset_id": "AI-ModelScope/MATH-lighteval",
        "hf_dataset_id": "DigitalLearningGmbH/MATH-lighteval",
        "columns": {
            "problem": "query"
        },
        "tags": ["grpo", "math"]
    },
    {
        "ms_dataset_id": "liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT",
        "hf_dataset_id": "Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT",
        "tags": ["chat", "sft", "cot", "r1", "🔥"]
    },
    {
        "ms_dataset_id": "AI-MO/NuminaMath-CoT",
        "hf_dataset_id": "AI-MO/NuminaMath-CoT",
        "tags": ["grpo", "math"]
    },
    {
        "ms_dataset_id": "AI-MO/NuminaMath-TIR",
        "hf_dataset_id": "AI-MO/NuminaMath-TIR",
        "tags": ["grpo", "math", "🔥"]
    },
    {
        "ms_dataset_id": "AI-MO/NuminaMath-1.5",
        "hf_dataset_id": "AI-MO/NuminaMath-1.5",
        "tags": ["grpo", "math"]
    },
    {
        "ms_dataset_id": "FreedomIntelligence/medical-o1-reasoning-SFT",
        "hf_dataset_id": "FreedomIntelligence/medical-o1-reasoning-SFT",
        "subsets": ["en", "zh"],
        "tags": ["medical", "o1", "🔥"]
    },
    {
        "ms_dataset_id": "lmms-lab/multimodal-open-r1-8k-verified",
        "hf_dataset_id": "lmms-lab/multimodal-open-r1-8k-verified",
        "tags": ["grpo", "vision", "🔥"]
    }
]
