import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0,2'
import os
data_path = "nq"
import pandas as pd

# Load the complete NQ dev file; it is stored as one JSON record per line.
df = pd.read_json(f"{data_path}/v1.0-simplified_nq-dev-all.jsonl", lines=True)

# Notebook-style inspection of the frame and of the first document's HTML
# (these bare expressions only display something in an interactive session).
df.head()
df['document_html'][0]

# Rebuild the first document's text from its non-HTML tokens. Every kept
# token is followed by a single space, so the result ends with a space.
# The explicit `== False` comparison is kept as-is on purpose.
s = "".join(
    entry['token'] + ' '
    for entry in df['document_tokens'][0]
    if entry['html_token'] == False
)
s
df.loc[0].document_html

# Persist the first 200 records as a smaller JSON-lines file and continue
# working from that reloaded subset.
subset_file = f"{data_path}/nq-dev.jsonl"
df.head(200).to_json(subset_file, orient='records', lines=True)
df = pd.read_json(subset_file, lines=True)
import yaml
import json
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import spacy
import numpy as np
# Base directory interpolated into the f-string paths that follow.
# NOTE(review): this replaces the earlier data_path = "nq", so the later read
# of f"{data_path}/nq-dev.jsonl" resolves to "mmlu/data/nq-dev.jsonl" rather
# than the subset written under "nq/" above - confirm this is intended.
data_path = "mmlu/data"
# Load the spaCy 'en_core_web_lg' pipeline once at module level.
# NOTE(review): `nlp` is not referenced by the code visible here; presumably
# it is used elsewhere - verify before removing.
nlp=spacy.load('en_core_web_lg')
def perturb(tokens, *, max_exhaustive_tokens=10, num_random_samples=1000, seed=42):
    """Build perturbed versions of a token list by dropping subsets of tokens.

    Each perturbation is described by a mask string with one character per
    token ('1' = token kept, '0' = token dropped; character ``i`` refers to
    ``tokens[i]``) and by the text obtained from joining the kept tokens
    with single spaces and stripping surrounding whitespace.

    Args:
        tokens: Sequence of string tokens.
        max_exhaustive_tokens: When ``len(tokens)`` is at most this value,
            every one of the ``2 ** len(tokens)`` subsets is enumerated.
        num_random_samples: Number of random masks drawn when the token list
            is longer than ``max_exhaustive_tokens``. Duplicate masks are
            discarded, so fewer samples may be returned.
        seed: Value passed to ``np.random.seed`` at the start of every call.

    Returns:
        A list of ``(mask, text)`` tuples. In the exhaustive case the entries
        are ordered by the integer whose bit ``i`` selects ``tokens[i]``. In
        the sampled case the all-ones mask comes first, the all-zeros mask
        second, followed by the distinct random masks in draw order.

    Note:
        The function seeds NumPy's *global* random state on every call. That
        makes the output reproducible, but it also resets the random stream
        seen by any other code using ``np.random`` afterwards.
    """
    np.random.seed(seed)
    n_tokens = len(tokens)

    def render(mask):
        # Keep exactly the tokens whose mask character is '1'.
        kept = [tok for tok, bit in zip(tokens, mask) if bit == '1']
        return ' '.join(kept).strip()

    if n_tokens <= max_exhaustive_tokens:
        samples = []
        for subset in range(1 << n_tokens):
            # binary_repr prints the most significant bit first; reverse it
            # so that character i of the mask corresponds to tokens[i].
            mask = np.binary_repr(subset, width=n_tokens)[::-1]
            samples.append((mask, render(mask)))
        return samples

    samples = []
    seen_masks = set()  # set membership keeps de-duplication O(1) per draw

    def add_sample(bits):
        mask = ''.join(map(str, bits))
        if mask in seen_masks:
            return
        seen_masks.add(mask)
        samples.append((mask, render(mask)))

    # Always include the unperturbed input and the fully-masked input.
    add_sample(np.ones(n_tokens, dtype=int))
    add_sample(np.zeros(n_tokens, dtype=int))
    for _ in range(num_random_samples):
        add_sample(np.random.choice(2, n_tokens))
    return samples
    
import os
from tqdm.auto import tqdm
import numpy as np
# Directory that receives the generated perturbation table.
results_path = 'lime_samples'
os.makedirs(results_path, exist_ok=True)

# NOTE(review): data_path is "mmlu/data" at this point, while nq-dev.jsonl was
# written under "nq/" earlier in this file - confirm the file exists here.
test_df = pd.read_json(f"{data_path}/nq-dev.jsonl", lines=True)

# One output row per (question, perturbation) pair. The two needed columns
# are iterated directly so that no row Series is rebuilt for every sample.
samples = []
question_columns = zip(test_df['question_text'], test_df['question_tokens'])
for idx, (question_text, question_tokens) in enumerate(
        tqdm(question_columns, total=len(test_df))):
    for binary_representation, sample_question in perturb(question_tokens):
        samples.append({
            'question_index': idx,
            'question': question_text,
            'binary_representation': binary_representation,
            'sample_question': sample_question,
        })

# Written tab-separated, without the DataFrame index.
samples = pd.DataFrame(samples)
samples.to_csv(os.path.join(results_path, "nq_perturb.csv"), index=False, sep='\t')
