from pathlib import Path
from tqdm import tqdm
import json


def read_content(fin, a, b):
    """Return the data stored in *fin* between offsets *a* and *b*.

    Args:
        fin: An open file-like object supporting ``seek`` and ``read``.
        a: Offset to seek to before reading (start of the range).
        b: Offset one past the last unit to read (end of the range).

    Returns:
        Whatever ``fin.read(b - a)`` yields after seeking to *a*. This can
        be shorter than ``b - a`` when the file ends before *b*.

    Raises:
        ValueError: If *b* is smaller than *a*.
    """
    # A negative size makes read() consume everything up to EOF, so an
    # inverted range would silently return far more than was asked for.
    if b < a:
        raise ValueError(f"end offset {b} precedes start offset {a}")
    fin.seek(a)
    return fin.read(b - a)


def validate_jsonl(file_path, indices: list[int]):
    """Check that every indexed record of a file is UTF-8 encoded JSON.

    Each pair of consecutive entries in *indices* is taken as the start and
    end byte offset of one record, so ``len(indices) - 1`` records are
    checked. Data outside the indexed ranges is not inspected.

    Args:
        file_path: Path of the file to validate; it is opened in binary mode.
        indices: Byte offsets delimiting the records.

    Returns:
        True when every record decodes and parses, False otherwise. A
        message describing the outcome is printed in both cases.
    """
    try:
        with open(file_path, 'rb') as fin:
            offsets = iter(indices)
            # Walk the offsets pairwise without copying the (possibly very
            # large) index list.
            start = next(offsets, None)
            for record_no, end in enumerate(offsets):
                # An inverted range would make read() run to EOF and could
                # accidentally parse, hiding a corrupt index.
                if end < start:
                    print(
                        f"An error occurred: record {record_no} has end "
                        f"offset {end} before start offset {start}"
                    )
                    return False
                try:
                    json.loads(read_content(fin, start, end).decode('utf-8'))
                except ValueError as e:
                    # UnicodeDecodeError and json.JSONDecodeError are both
                    # ValueError subclasses; report where the bad record is.
                    print(
                        f"An error occurred: record {record_no} "
                        f"(bytes {start}-{end}): {e}"
                    )
                    return False
                start = end

        print("All lines are valid JSON.")
        return True
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return False
    except Exception as e:
        # Boundary handler: any other failure (I/O error, bad offset type)
        # is reported and turned into a False result rather than raised.
        print(f"An error occurred: {e}")
        return False

# Usage example: load the offset index and validate the data file against it.

data_dir = Path('datasets/slimpajama')
data_path = data_dir / 'data.jsonl'
index_path = data_dir / 'index'
# The index holds one integer per line. Reading it inside a context manager
# closes the file promptly instead of leaving the handle open.
with index_path.open('r') as index_file:
    indices = [int(line) for line in index_file]
print(indices[:20])  # quick sanity check of the first offsets
validate_jsonl(data_path, indices)
