
# Report token lengths of the two completions stored in each JSONL record.
#
# Each line of the input file is a JSON object whose 'completions' value holds
# (at least) two completion strings; only the first two are inspected.
import json

# Token count above which a completion is reported as too long.
# NOTE: nothing is truncated here; over-long completions are only reported.
MAX_TOKENS = 800

# Marker whose presence in a completion is recorded by end_marker_flags().
END_MARKER = "THE END"


def parse_completions(lines):
    """Return the 'completions' value of every non-blank JSONL line.

    Blank lines (e.g. a trailing newline at the end of the file) are skipped
    instead of crashing ``json.loads``.
    """
    return [json.loads(line)['completions'] for line in lines if line.strip()]


def end_marker_flags(solutions, marker=END_MARKER):
    """Return, per entry, whether *marker* occurs in completion 0 and in completion 1."""
    return [(marker in solution[0], marker in solution[1]) for solution in solutions]


def count_tokens(solutions, encode):
    """Return, per entry, the token counts of completion 0 and completion 1.

    *encode* is any callable mapping a string to a sequence of tokens.
    """
    return [(len(encode(solution[0])), len(encode(solution[1]))) for solution in solutions]


def find_too_long(token_counts, limit=MAX_TOKENS):
    """Return ``(entry_index, completion_index, count)`` for every count above *limit*.

    Each completion is checked on its own, so a long second completion is
    reported even when the first one is short (and vice versa).
    """
    return [
        (i, j, count)
        for i, counts in enumerate(token_counts)
        for j, count in enumerate(counts)
        if count > limit
    ]


def max_token_lengths(token_counts):
    """Return the maximum token count of completion 0 and of completion 1.

    Yields ``(0, 0)`` for empty input rather than raising ``ValueError``.
    """
    return (
        max((counts[0] for counts in token_counts), default=0),
        max((counts[1] for counts in token_counts), default=0),
    )


if __name__ == "__main__":
    import tiktoken

    # open the jsonl file
    with open('final_data/ss_completions_temp_1.0.jsonl', 'r', encoding='utf-8') as f:
        baseline_completions_0 = f.readlines()

    solutions = parse_completions(baseline_completions_0)

    # flags telling whether each of the two completions contains the end marker
    # (name kept for compatibility; these are booleans, not lengths)
    solutions_lengths = end_marker_flags(solutions)

    # run the tokenizer and report every completion longer than MAX_TOKENS
    encoding = tiktoken.encoding_for_model('gpt-4o')
    tokens = count_tokens(solutions, encoding.encode)

    # distinct loop names so the `tokens` list is not clobbered
    for i, j, count in find_too_long(tokens):
        print(f"Solution {i} (completion {j}) is too long: {count}")

    max_0, max_1 = max_token_lengths(tokens)
    print(f"Max length for 0: {max_0}")
    print(f"Max length for 1: {max_1}")
