# A Step-by-Step Guide for Replicating Our Pretraining Data Processing Procedure

## Imports
First, some imports.
```
import json
import logging
import numpy as np
import os
import pandas as pd
import pdb
import shutil

from huggingface_hub import snapshot_download
```


## Downloading Raw Pretraining Data from HuggingFace
Let's start by downloading the 10B-token FineWeb sample from HuggingFace.

```
# Download only the files matching sample/10BT/* from the FineWeb dataset repo into fineweb_10b/
snapshot_download(
    "HuggingFaceFW/fineweb",
    repo_type="dataset",
    local_dir="fineweb_10b/",
    allow_patterns="sample/10BT/*",
)
```


Next, let's download the Python portion of TheStack from HuggingFace.

```
# Download only the files matching data/python/* from The Stack dataset repo into thestack/
snapshot_download(
    "bigcode/the-stack",
    repo_type="dataset",
    local_dir="thestack/",
    allow_patterns="data/python/*",
)
```

## Preparing Pretraining Data for Processing
Now, let's convert the downloaded Parquet files into the JSON Lines format so that they can be tokenized with the Dolma toolkit.

```
### Processing FineWeb 10B
# Converts every parquet file under `source_path` into a JSON Lines file under
# `target_path`, deleting each parquet file once it has been converted and
# finally removing the (then empty of parquet files) source directory.
#
# NOTE: the progress messages below use logging.info, which is hidden under the
# default WARNING log level; call logging.basicConfig(level=logging.INFO)
# beforehand if you want to see them.
source_path = "fineweb_10b/sample/10BT"
target_path = "fineweb_10b_jsonl"
os.makedirs(target_path, exist_ok=True)

## Gather parquet file list --> convert each file to JSON lines and clean up
# Only pick up parquet files so that stray entries (hidden files, sub-directories)
# are not handed to pd.read_parquet; sort for a deterministic processing order.
files = sorted(
    os.path.join(source_path, name)
    for name in os.listdir(source_path)
    if name.endswith(".parquet")
)
for file in files:
    logging.info(f"loading {file}")
    df = pd.read_parquet(file)
    # Strip only the final extension so that names containing extra dots
    # cannot collapse onto the same output file.
    stem = os.path.splitext(os.path.basename(file))[0]
    target_filename = os.path.join(target_path, stem + ".jsonl")
    logging.info(f"saving to {target_filename}")
    df.to_json(target_filename, orient="records", lines=True)
    # The parquet file is removed right after conversion (presumably to limit
    # peak disk usage while both copies would otherwise coexist).
    os.remove(file)
    logging.info("done")
shutil.rmtree(source_path)
```


```
### Processing the Python subset of The Stack
# Converts every parquet file under `source_path` into a JSON Lines file under
# `target_path`, renaming the "content" column to "text" on the way, deleting
# each parquet file once it has been converted and finally removing the source
# directory.
#
# NOTE: the progress messages below use logging.info, which is hidden under the
# default WARNING log level; call logging.basicConfig(level=logging.INFO)
# beforehand if you want to see them.
source_path = "thestack/data/python"
target_path = "thestack_python_jsonl"
os.makedirs(target_path, exist_ok=True)

## Gather parquet file list --> convert each file to JSON lines and clean up
# Only pick up parquet files so that stray entries (hidden files, sub-directories)
# are not handed to pd.read_parquet; sort for a deterministic processing order.
files = sorted(
    os.path.join(source_path, name)
    for name in os.listdir(source_path)
    if name.endswith(".parquet")
)
for file in files:
    logging.info(f"loading {file}")
    df = pd.read_parquet(file)
    # The merge step that follows reads the document body from the "text" field
    df = df.rename(columns={"content": "text"})
    # Strip only the final extension so that names containing extra dots
    # cannot collapse onto the same output file.
    stem = os.path.splitext(os.path.basename(file))[0]
    target_filename = os.path.join(target_path, stem + ".jsonl")
    logging.info(f"saving to {target_filename}")
    df.to_json(target_filename, orient="records", lines=True)
    logging.info("done")
    os.remove(file)
    logging.info(f"removed {file}")
shutil.rmtree(source_path)


## Merge JSON lines files, adjust content formatting to be compatible with the Dolma toolbox
# Every `merge_every` consecutive source files (in sorted order) are concatenated
# into one merged file named train-XXXX-of-YYYY.jsonl, where YYYY is the index of
# the last merged file. Each record is rewritten as
#   {"metadata": <all fields except "text">, "text": ..., "source": "the-stack", "id": <running counter>}
# and the source directory is removed once everything has been merged.
source_path = "thestack_python_jsonl"
target_path = "thestack_python_jsonl_merged"
os.makedirs(target_path, exist_ok=True)
merge_every = 10

# Sort so that the file-to-shard assignment and the document ids are reproducible
source_files = sorted(os.listdir(source_path))
# Ceiling division: a final, smaller shard holds any remainder
total_targets = (len(source_files) + merge_every - 1) // merge_every

targets = [
    os.path.join(target_path, f"train-{i:04d}-of-{total_targets - 1:04d}.jsonl")
    for i in range(total_targets)
]
print(targets)

index = 0  # running document id, unique across all merged files
for shard, target in enumerate(targets):
    shard_files = source_files[shard * merge_every : (shard + 1) * merge_every]
    # Open one output at a time inside a context manager so that the handle is
    # always closed, even if a source file fails to parse.
    with open(target, "w") as outfile:
        for f in shard_files:
            logging.info(f"processing {f}")
            with open(os.path.join(source_path, f), "r") as infile:
                # Stream line by line instead of loading the whole file into memory
                for line in infile:
                    if not line.strip():
                        # Tolerate blank lines rather than failing in json.loads
                        continue
                    datum = json.loads(line)
                    outline = {
                        "metadata": {k: v for k, v in datum.items() if k != "text"},
                        "text": datum["text"],
                        "source": "the-stack",
                        "id": f"{index}",
                    }
                    outfile.write(json.dumps(outline) + "\n")
                    index += 1
    logging.info(f"{target} closed")
shutil.rmtree(source_path)
```


## Processing Pretraining Data with the Dolma Toolkit
Finally, we simply run the Dolma toolkit pretraining processing / tokenization tool on the prepared data.

```
# Prepare FineWeb files for pretraining
# (reads the JSON Lines files in fineweb_10b_jsonl, writes the tokenized output to fineweb_10b_processed)
dolma tokens \
    --documents fineweb_10b_jsonl \
    --destination fineweb_10b_processed \
    --tokenizer.name_or_path allenai/gpt-neox-olmo-dolma-v1_5 \
    --tokenizer.eos_token_id 50279 \
    --tokenizer.pad_token_id 1 \
    --processes 32

# Prepare TheStack files for pretraining
# (reads the merged JSON Lines files in thestack_python_jsonl_merged, writes to thestack_python_processed)
dolma tokens \
    --documents thestack_python_jsonl_merged \
    --destination thestack_python_processed \
    --tokenizer.name_or_path allenai/gpt-neox-olmo-dolma-v1_5 \
    --tokenizer.eos_token_id 50279 \
    --tokenizer.pad_token_id 1 \
    --processes 32
```