# Module setup: imports, dataset location, and eagerly loaded resources.
# Several imports below are repeated (os, pandas, numpy); the repeats are
# harmless no-ops at runtime.
import os
# Uncomment to restrict which GPUs are visible to this process.
# os.environ['CUDA_VISIBLE_DEVICES'] = '0,2'
import os
# Directory expected to contain the `nq-dev.jsonl` file read below.
data_path = "nq"
import pandas as pd
# Load the dev split as JSON Lines (one JSON object per line).
# NOTE(review): `df` is not referenced in the visible part of this file; the
# same file is read again later into `test_df` -- confirm whether both are needed.
df = pd.read_json(f"{data_path}/nq-dev.jsonl", lines=True)
import yaml
import json
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import spacy
import numpy as np
data_path = "nq"
# Load the large English spaCy pipeline at import time.
# NOTE(review): `nlp` is not referenced in the visible part of this file;
# `perturb` joins plain string tokens and does not call it -- confirm it is
# still required before paying the model-loading cost.
nlp=spacy.load('en_core_web_lg')
def perturb(tokens, *, max_exhaustive=10, num_draws=1000, seed=42):
    """Generate masked variants of a token sequence.

    Each variant keeps a subset of ``tokens`` (in their original order,
    joined by single spaces) and is paired with a binary mask string whose
    ``i``-th character is ``'1'`` when ``tokens[i]`` is kept and ``'0'`` when
    it is dropped.

    Two strategies are used depending on the sequence length:

    * Short sequences (``len(tokens) <= max_exhaustive``) are enumerated
      exhaustively: every subset is emitted, in increasing order of the
      integer whose bit ``i`` marks ``tokens[i]`` as kept.
    * Longer sequences are sampled. For each of ``num_draws`` draws, the
      number of kept tokens ``k`` (between 1 and ``len(tokens) - 1``) is
      drawn with probability proportional to ``(n - 1) / (k * (n - k))``,
      then ``k`` distinct positions are chosen uniformly. Masks that were
      already produced are skipped, so fewer than ``num_draws`` variants may
      be returned.

    Args:
        tokens: Sequence of string tokens.
        max_exhaustive: Largest length that is still enumerated exhaustively
            (``2 ** len(tokens)`` variants).
        num_draws: Number of random draws in the sampling strategy.
        seed: Seed of the private random generator used for sampling.

    Returns:
        A list of ``(mask, text)`` tuples in generation order.
    """
    n_tokens = len(tokens)

    def render(mask):
        # Keep only the tokens flagged '1' in the mask, space-separated.
        kept = [tok for tok, bit in zip(tokens, mask) if bit == '1']
        return ' '.join(kept).strip()

    # Sequences shorter than two tokens cannot be sampled (there is no subset
    # size strictly between 0 and n), so they are always enumerated.
    if n_tokens <= max_exhaustive or n_tokens < 2:
        samples = []
        for subset in range(1 << n_tokens):
            # binary_repr is most-significant-bit first; reverse it so that
            # character i of the mask corresponds to tokens[i].
            mask = np.binary_repr(subset, width=n_tokens)[::-1]
            samples.append((mask, render(mask)))
        return samples

    # A private legacy generator yields the same stream as
    # `np.random.seed(seed)` followed by `np.random.choice(...)`, without
    # resetting the process-wide NumPy random state on every call.
    rng = np.random.RandomState(seed)

    # Distribution over the number of kept tokens k = 1 .. n-1.
    sizes = np.arange(1, n_tokens, dtype=float)
    size_weights = (n_tokens - 1) / (sizes * (n_tokens - sizes))
    size_probs = size_weights / np.sum(size_weights)

    samples = []
    seen_masks = set()  # O(1) duplicate detection
    for _ in range(num_draws):
        n_kept = rng.choice(n_tokens - 1, p=size_probs) + 1
        kept_positions = rng.choice(n_tokens, n_kept, replace=False)
        bits = np.zeros(n_tokens, dtype=int)
        bits[kept_positions] = 1
        mask = ''.join(map(str, bits.tolist()))
        if mask in seen_masks:
            continue
        seen_masks.add(mask)
        samples.append((mask, render(mask)))

    return samples
    
import os
from tqdm.auto import tqdm
import numpy as np
# Directory that receives the generated perturbation table.
results_path = 'shap_samples'
os.makedirs(results_path, exist_ok=True)
test_df = pd.read_json(f"{data_path}/nq-dev.jsonl", lines=True)

# Build one output row per (question, mask) pair. The two needed columns are
# iterated directly instead of materialising a row Series with `.iloc` for
# every question and every generated sample.
samples = []
question_columns = zip(test_df['question_text'], test_df['question_tokens'])
for idx, (question, tokens) in enumerate(tqdm(question_columns, total=len(test_df))):
    for mask, masked_question in perturb(tokens):
        samples.append({
            'question_index': idx,
            'question': question,
            'binary_representation': mask,
            'sample_question': masked_question,
        })
samples = pd.DataFrame(samples)
# The table is tab-separated despite the .csv extension.
samples.to_csv(os.path.join(results_path, "nq_perturb.csv"), index=False, sep='\t')


