import json
from itertools import islice
from pathlib import Path

# Build "code-2500.txt" from the first DOC_LIMIT records of a JSONL dump that
# is expected to sit next to this script. The input file name matches:
# https://data.together.xyz/redpajama-data-1T/v1.0.0/github/filtered_08cdfa755e6d4d89b673d5bd1acee5f6.sampled.jsonl
DOC_LIMIT = 2500
DOC_SEPARATOR = "<*DOC-SEP*>"

HERE = Path(__file__).parent
SOURCE_PATH = HERE / "filtered_08cdfa755e6d4d89b673d5bd1acee5f6.sampled.jsonl"
TARGET_PATH = HERE / "code-2500.txt"

# Open the input first so that a missing source file fails before the output
# is truncated. Both files use an explicit UTF-8 encoding so the result does
# not depend on the platform's locale default.
with open(SOURCE_PATH, encoding="utf-8") as source:
    with open(TARGET_PATH, "w", encoding="utf-8") as sink:
        # islice stops cleanly if the input holds fewer than DOC_LIMIT
        # records instead of raising StopIteration part-way through.
        for raw_line in islice(source, DOC_LIMIT):
            record = json.loads(raw_line)
            sink.write(record["text"])
            # The separator follows every document, including the last one.
            sink.write(DOC_SEPARATOR)
