{
    "uuid": "1c1ad5db-ac53-45fc-b2cc-37484661a1b9",
    "name": "rpj_rpjCC_as_CC",
    "creation_date": "2024_01_11-00_59_03",
    "dataset_url": "s3://***REMOVED***/openlm/dcnlp/datasets/rpj_rpjCC_as_CC/",
    "manifest_url": "s3://***REMOVED***/openlm/dcnlp/datasets/rpj_rpjCC_as_CC/manifest.jsonl",
    "sources": [
        {
            "uuid": "15701e36-c0bb-4bfa-bf52-d3419dbbd8a1",
            "name": "rpj_original_cc"
        },
        {
            "uuid": "edd67f24-49ae-4915-8c3a-dd4bcc62b9d8",
            "name": "rpj_original_github"
        },
        {
            "uuid": "c8b17a9b-6bd8-441a-8b9f-dbf486edf574",
            "name": "rpj_original_arxiv"
        },
        {
            "uuid": "d017c1fe-c9df-4e06-aa8f-d92b1097283b",
            "name": "rpj_original_books"
        },
        {
            "uuid": "3b25b18c-e724-4071-8c7a-d69c5e1aaeac",
            "name": "rpj_original_stackexchange"
        },
        {
            "uuid": "050bc436-8d61-4d73-b931-0306a4b26727",
            "name": "rpj_original_wiki"
        }
    ],
    "tokenized": true,
    "tokenizer": "EleutherAI/gpt-neox-20b",
    "num_tokens": 1039931197263,
    "size": 2693778883191,
    "dcnlp_commit_hash": "287ba500c592b05266ee17921c3e4a656f10b8ec",
    "dcnlp_diff": "diff --git a/exp_data/datasets/untokenized/c4_wo_dedup.json b/exp_data/datasets/untokenized/c4_wo_dedup.json\nindex fd6fbbd..c5af54a 100644\n--- a/exp_data/datasets/untokenized/c4_wo_dedup.json\n+++ b/exp_data/datasets/untokenized/c4_wo_dedup.json\n@@ -2,7 +2,7 @@\n     \"uuid\": \"5431063a-bcdb-4c9e-83df-b5b08243ab1d\",\n     \"name\": \"c4_wo_dedup\",\n     \"creation_date\": \"2023_12_20-17_59_20\",\n-    \"dataset_url\": \"s3://dcnlp-west/cc_wet_2019_april_baselines/c4_wo_dedup/\",\n+    \"dataset_url\": \"s3://***REMOVED***/openlm/dcnlp/raw_datasets/cc_wet_2019_april_baselines/c4_wo_dedup/\",\n     \"manifest_url\": null,\n     \"sources\": [\n         {\ndiff --git a/exp_data/datasets/untokenized/rw_original.json b/exp_data/datasets/untokenized/rw_original.json\nindex 3cc566d..aa35e58 100644\n--- a/exp_data/datasets/untokenized/rw_original.json\n+++ b/exp_data/datasets/untokenized/rw_original.json\n@@ -2,7 +2,7 @@\n     \"uuid\": \"df16a14e-0f67-4623-933a-805522653f22\",\n     \"name\": \"rw_original\",\n     \"creation_date\": \"2023_11_22-12_31_00\",\n-    \"dataset_url\": \"s3://dcnlp-west/refinedweb_raw_jsonl_keyfix/\",\n+    \"dataset_url\": \"s3://***REMOVED***/openlm/dcnlp/raw_datasets/refinedweb_raw_jsonl_keyfix/\",\n     \"manifest_url\": null,\n     \"sources\": [],\n     \"tokenized\": false,\ndiff --git a/ray_processing/__init__.py b/ray_processing/__init__.py\nindex 5e1b41d..014c770 100644\n--- a/ray_processing/__init__.py\n+++ b/ray_processing/__init__.py\n@@ -1,4 +1,4 @@\n-from dedup_jsonl import dedup_jsonl\n+from ray_processing.dedup_jsonl import dedup_jsonl\n from baselines.core.constants import GLOBAL_FUNCTIONS\n \n-GLOBAL_FUNCTIONS['exact_dedup'] = dedup_jsonl\n\\ No newline at end of file\n+GLOBAL_FUNCTIONS['exact_dedup'] = dedup_jsonl\ndiff --git a/ray_processing/tokenization_configs/rpj_rpjCC_as_CC.yaml b/ray_processing/tokenization_configs/rpj_rpjCC_as_CC.yaml\nindex 8917009..47d57a8 100644\n--- a/ray_processing/tokenization_configs/rpj_rpjCC_as_CC.yaml\n+++ b/ray_processing/tokenization_configs/rpj_rpjCC_as_CC.yaml\n@@ -16,8 +16,8 @@ sources:\n \n sampling_frequencies:\n   RPJ_COMMON_CRAWL: 1.0\n-  GITHUB: 0.81803456683805\n-  WIKIPEDIA: 1.9407917710628182\n-  BOOKS: 1.8524796432977406\n-  ARXIV: 0.9531778969114248\n-  STACKEXCHANGE: 1.042895910090551\n+  GITHUB: 0.808495738\n+  WIKIPEDIA: 1.918160845\n+  BOOKS: 1.830878496\n+  ARXIV: 0.942063207\n+  STACKEXCHANGE: 1.030735049\ndiff --git a/ray_processing/tokenize_shuffle.py b/ray_processing/tokenize_shuffle.py\nindex ba2ac32..14d4125 100644\n--- a/ray_processing/tokenize_shuffle.py\n+++ b/ray_processing/tokenize_shuffle.py\n@@ -53,7 +53,9 @@ if __name__ == \"__main__\":\n         assert all(s is not None for s in source_refs), \"Not all source reference jsons could be found.\"\n \n     # Collect args for tokenization and pass them into tokenize_shuffle\n-    tokenize_shuffle_args = [str(i) for k,v in vars(args).items() for i in [f\"--{k}\", v] if k not in DCNLP_ARGS and v]\n+    tokenize_shuffle_args = [str(i) for k,v in vars(args).items() for i in [f\"--{k}\", v] if k not in DCNLP_ARGS and k != \"do_sample\" and v]\n+    if args.do_sample:\n+        tokenize_shuffle_args += [\"--do_sample\"]\n     tokenize_shuffle.main(tokenize_shuffle_args)\n \n     dataset_json = generate_tokenized_dataset_json(args, source_refs)\ndiff --git a/run_CC_plus_rpj_nonCC.sh b/run_CC_plus_rpj_nonCC.sh\nold mode 100644\nnew mode 100755\nindex ae82cc8..ba9848c\n--- a/run_CC_plus_rpj_nonCC.sh\n+++ b/run_CC_plus_rpj_nonCC.sh\n@@ -1,25 +1,26 @@\n # Refinedweb as the CC source\n-python ray_processing/tokenize_shuffle.py \\\n-\t--source_ref_paths exp_data/datasets/untokenized/rw_original.json exp_data/datasets/untokenized/rpj_original_github.json exp_data/datasets/untokenized/rpj_original_arxiv.json exp_data/datasets/untokenized/rpj_original_books.json exp_data/datasets/untokenized/rpj_original_stackexchange.json  exp_data/datasets/untokenized/rpj_original_wiki.json \\\n-\t--readable_name rpj_rw_as_CC \\\n-\t--output s3://***REMOVED***/openlm/dcnlp/datasets/rpj_rw_as_CC/ \\\n-\t--default_dataset_yaml ray_processing/tokenization_configs/rpj_rw_as_CC.yaml \\\n-\t--content_key text \\\n-\t--ray_spill_location /tmp/ray \\\n-\t--force_parallelism \\\n-\t--do_sample \\\n+# python ray_processing/tokenize_shuffle.py \\\n+# \t--source_ref_paths exp_data/datasets/untokenized/rw_original.json exp_data/datasets/untokenized/rpj_original_github.json exp_data/datasets/untokenized/rpj_original_arxiv.json exp_data/datasets/untokenized/rpj_original_books.json exp_data/datasets/untokenized/rpj_original_stackexchange.json  exp_data/datasets/untokenized/rpj_original_wiki.json \\\n+# \t--readable_name rpj_rw_as_CC \\\n+# \t--output s3://***REMOVED***/openlm/dcnlp/datasets/rpj_rw_as_CC/ \\\n+# \t--default_dataset_yaml ray_processing/tokenization_configs/rpj_rw_as_CC.yaml \\\n+# \t--content_key text \\\n+# \t--ray_spill_location /tmp/ray \\\n+# \t--force_parallelism 320 \\\n+# \t--do_sample\n \n \n-# C4 as the CC source\n-python ray_processing/tokenize_shuffle.py \\\n-\t--source_ref_paths exp_data/datasets/untokenized/c4_original.json exp_data/datasets/untokenized/rpj_original_github.json exp_data/datasets/untokenized/rpj_original_arxiv.json exp_data/datasets/untokenized/rpj_original_books.json exp_data/datasets/untokenized/rpj_original_stackexchange.json  exp_data/datasets/untokenized/rpj_original_wiki.json \\\n-\t--readable_name rpj_c4_as_CC \\\n-\t--output s3://***REMOVED***/openlm/dcnlp/datasets/rpj_c4_as_CC/ \\\n-\t--default_dataset_yaml ray_processing/tokenization_configs/rpj_c4_as_CC.yaml \\\n-\t--content_key text \\\n-\t--ray_spill_location /tmp/ray \\\n-\t--force_parallelism \\\n-\t--do_sample\n+# # C4 as the CC source\n+# python ray_processing/tokenize_shuffle.py \\\n+# \t--source_ref_paths exp_data/datasets/untokenized/c4_original.json exp_data/datasets/untokenized/rpj_original_github.json exp_data/datasets/untokenized/rpj_original_arxiv.json exp_data/datasets/untokenized/rpj_original_books.json exp_data/datasets/untokenized/rpj_original_stackexchange.json  exp_data/datasets/untokenized/rpj_original_wiki.json \\\n+# \t--readable_name rpj_c4_as_CC \\\n+# \t--output s3://***REMOVED***/openlm/dcnlp/datasets/rpj_c4_as_CC/ \\\n+# \t--default_dataset_yaml ray_processing/tokenization_configs/rpj_c4_as_CC.yaml \\\n+# \t--content_key text \\\n+# \t--ray_spill_location /tmp/ray \\\n+# \t--force_parallelism 320 \\\n+# \t--do_sample\n+# \n \n # RPJ-CC as the CC source\n python ray_processing/tokenize_shuffle.py \\\n@@ -29,5 +30,5 @@ python ray_processing/tokenize_shuffle.py \\\n \t--default_dataset_yaml ray_processing/tokenization_configs/rpj_rpjCC_as_CC.yaml \\\n \t--content_key text \\\n \t--ray_spill_location /tmp/ray \\\n-\t--force_parallelism \\\n-\t--do_sample\n\\ No newline at end of file\n+\t--force_parallelism 320 \\\n+\t--do_sample\ndiff --git a/training/configs/1b_1x.json b/training/configs/1b_1x.json\nindex bd0a40b..45b4656 100644\n--- a/training/configs/1b_1x.json\n+++ b/training/configs/1b_1x.json\n@@ -8,7 +8,7 @@\n     \"wd\": 0.033,\n     \"cd\": 3e-5,\n     \"global_bs\": 256,\n-    \"acc\": 2,\n+    \"acc\": 1,\n     \"qk_norm\": true,\n     \"z_loss\": 1e-4,\n     \"grad_checkpointing\": false,\n@@ -18,4 +18,4 @@\n         \"--fsdp-limit-all-gathers\"\n     ],\n     \"chinchilla_multiplier\": 1\n-}\n\\ No newline at end of file\n+}\ndiff --git a/training/configs/3b_1x.json b/training/configs/3b_1x.json\nindex d77a4d4..2e9e15b 100644\n--- a/training/configs/3b_1x.json\n+++ b/training/configs/3b_1x.json\n@@ -8,7 +8,7 @@\n     \"wd\": 0.33,\n     \"cd\": 3e-05,\n     \"global_bs\": 2048,\n-    \"acc\": 2,\n+    \"acc\": 4,\n     \"qk_norm\": true,\n     \"z_loss\": 1e-4,\n     \"grad_checkpointing\": false,",
    "data_key": "json.gz",
    "sampling_yaml": {
        "sources": [
            {
                "source": "RPJ_COMMON_CRAWL",
                "markers": [
                    "common_crawl/"
                ]
            },
            {
                "source": "GITHUB",
                "markers": [
                    "github"
                ]
            },
            {
                "source": "WIKIPEDIA",
                "markers": [
                    "wiki"
                ]
            },
            {
                "source": "BOOKS",
                "markers": [
                    "book"
                ]
            },
            {
                "source": "ARXIV",
                "markers": [
                    "arxiv"
                ]
            },
            {
                "source": "STACKEXCHANGE",
                "markers": [
                    "stackexchange"
                ]
            },
            {
                "source": "UNKNOWN",
                "markers": []
            }
        ],
        "sampling_frequencies": {
            "RPJ_COMMON_CRAWL": 1.0,
            "GITHUB": 0.808495738,
            "WIKIPEDIA": 1.918160845,
            "BOOKS": 1.830878496,
            "ARXIV": 0.942063207,
            "STACKEXCHANGE": 1.030735049
        }
    }
}