import numpy as np

# Sampling configuration.
SUBSET_SIZE = 100_000
SEED = 42
# Number of entries (not lines) in the source files; adjust to match the data.
TOTAL_ENTRIES = 9_194_613

# Seed NumPy's global RNG so the same subset is drawn on every run.
np.random.seed(SEED)

# Draw SUBSET_SIZE distinct entry positions from [0, TOTAL_ENTRIES) and keep
# them in ascending order.
random_indices = np.sort(
    np.random.choice(TOTAL_ENTRIES, size=SUBSET_SIZE, replace=False)
)

# Zero-based count of entries read so far.
entry_counter = 0

# Copy the sampled entries from both source files into the subset files.
#
# Each entry occupies two lines: a content line followed by one line that is
# skipped (expected to be empty). The two inputs are read in lockstep, so the
# entry at a given position in one file is paired with the entry at the same
# position in the other.

# `x in ndarray` scans the whole array on every call; with one test per entry
# that dominates the run time. A set of plain ints makes each check O(1).
selected_indices = set(random_indices.tolist())

# Number of entries actually written, reported at the end.
written_count = 0

# NOTE(review): files are opened with the platform default encoding — confirm
# that this matches how the input files were written.
with open(
    "laion-ocr-url.txt", "r"
) as infile, open(
    "laion-ocr-index-url.txt", "r"
) as infile_index, open(
    f"laion-ocr-url-{SUBSET_SIZE}_seed{SEED}.txt",
    "w",
) as outfile, open(
    f"laion-ocr-index-url-{SUBSET_SIZE}_seed{SEED}.txt",
    "w",
) as outfile_index:

    while True:
        # Read the content line of the next entry from each file
        line = infile.readline()
        line_index = infile_index.readline()

        # Stop as soon as either file is exhausted
        if not line or not line_index:
            break

        # Skip the empty line after the content
        infile.readline()
        infile_index.readline()

        if entry_counter in selected_indices:
            outfile.write(line.strip() + "\n")
            outfile.write("\n")  # Maintain the empty line between entries
            outfile_index.write(line_index.strip() + "\n")
            outfile_index.write("\n")  # Maintain the empty line between entries
            written_count += 1
            print(f"Processed {entry_counter} entries", end="\r")

        entry_counter += 1

# Report what was really written: if the inputs hold fewer entries than the
# sampled range, this is smaller than SUBSET_SIZE.
print(f"\nCompleted processing {written_count} entries.")
