from pathlib import Path

import pandas as pd

# Input shard; it must be present next to this script. Judging by the URL it is
# one shard of the 2023-11-01 Chinese ("zh") Wikipedia dataset:
# https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.zh/train-00000-of-00006.parquet
df = pd.read_parquet(Path(__file__).parent / "train-00000-of-00006.parquet")

# Write the "text" value of every 60th row (rows 0, 60, 120, ...) to
# skip-60.txt, each one followed by the "<*DOC-SEP*>" marker.
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform locale encoding, which can fail on (or mis-encode) non-ASCII text.
with open(Path(__file__).parent / "skip-60.txt", "w", encoding="utf-8") as f:
    # Pick the column first, then stride over it, so no intermediate
    # multi-column frame is built just to read one column.
    for text in df["text"].iloc[::60]:
        f.write(text)
        f.write("<*DOC-SEP*>")
