import pandas as pd
import random

# Load the full extraction table; rows are split on '\n' only.
df = pd.read_csv('data/lm_extraction.csv', lineterminator='\n')

# Disabled snippet kept for reference: it dumped every row of selected
# doc_ids (a fixed list, or 14 random ids) into one CSV per doc_id.
# doc_ids = [4076, 8892, 1087, 1263, 10878, 6946, 726, 714, 12777, 3022, 3044, 8345, 2809, 3968, 463, 4334]
# doc_ids = random.sample(range(15000), 14)

# for id_ in doc_ids:
#     sample = df[df['doc_id'] == id_]
#     sample.to_csv(f'data/examples_random/{id_}.csv', index=False)

# Gather the five previously written 32-row files so that their doc_ids can
# be excluded from the new draws.
prev_dfs = []
for prev_idx in range(5):
    prev_dfs.append(
        pd.read_csv(f'/home/joel_jang/dongkeun/L2U/data/main/lm_extraction_32_{prev_idx}.csv')
    )
prev_df = pd.concat(prev_dfs)
prev_doc_ids = prev_df['doc_id'].tolist()

# Report the table size before and after removing rows whose doc_id was
# already present in any of the earlier files.
print(df.shape)
already_used = df['doc_id'].isin(prev_doc_ids)
df = df[~already_used]
print(df.shape)

# Write five 32-row samples, one per seed, so each file is reproducible.
# NOTE(review): every draw is taken from the same filtered frame, so a row
# may appear in more than one output file — confirm that this is intended.
for i in range(5):
    out_path = f'/home/joel_jang/dongkeun/L2U/data/lr_vary/32_{i}.csv'
    sample = df.sample(n=32, random_state=i)
    sample.to_csv(out_path, index=False)