from pathlib import Path

import pandas as pd

# Input shard; expected to sit next to this script. Presumably downloaded from:
# https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet
# Only the "text" column is consumed below, so only that column is loaded.
df = pd.read_parquet(
    Path(__file__).parent / "train-00000-of-00041.parquet",
    columns=["text"],
)

# Write the text of every 42nd row (rows 0, 42, 84, ...) to skip-42.txt next
# to this script. Each text is followed by the "<*DOC-SEP*>" marker, so the
# file also ends with a marker.
#
# encoding="utf-8": the texts contain non-ASCII characters; relying on the
#   locale's default encoding can raise UnicodeEncodeError (e.g. cp1252) or
#   yield a file whose encoding depends on the machine that produced it.
# newline="": disable newline translation so the output bytes are identical
#   on every platform (text mode would otherwise turn "\n" into "\r\n" on
#   Windows).
with open(
    Path(__file__).parent / "skip-42.txt",
    "w",
    encoding="utf-8",
    newline="",
) as f:
    for line in df[::42]["text"]:
        f.write(line)
        f.write("<*DOC-SEP*>")
