import json
import os
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd


# Source identifiers that search_for_fb_data() inspects: an entry is only
# counted when its "source" field is one of these; all others are skipped.
DATASETS_TO_SEARCH = [
    "ai2-adapt-dev/oasst1_converted",
    "ai2-adapt-dev/tulu_v3.9_aya_100k",
    "ai2-adapt-dev/tulu_v3.9_wildchat_100k",
    "ai2-adapt-dev/flan_v2_converted",
    "ai2-adapt-dev/no_robots_converted",
    "ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980",
    "ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k",
    "ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k",
]


def load_fb_data(
    csv_path="data/fb_stimuli.csv",
    columns=("passage", "attn_check_1_q", "attn_check_2_q", "critical_q"),
):
    """Load the FB stimulus texts from a CSV file as one flat list.

    Args:
        csv_path: Path to the comma-separated stimuli file.
        columns: Names of the text columns to collect.

    Returns:
        A list of strings holding every non-missing cell of each requested
        column, ordered column by column and, within a column, row by row.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        KeyError: If a requested column is not present in the file.
    """
    # Read the file once; it does not change between columns.
    fb_data = pd.read_csv(csv_path, delimiter=",")

    all_texts = []
    for col in columns:
        # Empty cells are parsed as NaN (a float); drop them so callers can
        # safely treat every returned item as text.
        all_texts.extend(fb_data[col].dropna().astype(str).tolist())

    return all_texts


def search_for_fb_data(dataset_id):
    """Scan a dataset's train split for entries containing an FB stimulus.

    Only entries whose "source" field is listed in DATASETS_TO_SEARCH are
    inspected. For each of those, the content of the first message is
    compared case-insensitively against every string returned by
    load_fb_data(), and each hit is printed.

    Args:
        dataset_id: Dataset identifier passed to ``load_dataset``.

    Returns:
        None. Matches and a final summary line are printed to stdout.
    """
    # Lowercase each needle once (not once per entry) and drop duplicates,
    # keeping the first original spelling for reporting. Non-string values
    # (e.g. NaN from empty CSV cells) and blank strings are skipped: the
    # former cannot be lowercased, the latter would match every entry.
    needles = {}
    for fb_string in load_fb_data():
        if isinstance(fb_string, str) and fb_string.strip():
            needles.setdefault(fb_string.lower(), fb_string)

    target_sources = frozenset(DATASETS_TO_SEARCH)
    items = 0
    ds = load_dataset(dataset_id, split="train")
    for i, entry in tqdm(enumerate(ds), total=len(ds)):
        if entry["source"] not in target_sources:
            continue
        items += 1

        # NOTE(review): assumes entry["messages"] is a list of dicts with a
        # "content" key, and that checking only the first message is
        # sufficient -- confirm against the dataset schema.
        messages = entry["messages"]
        if not messages:
            continue
        text_content = messages[0]["content"]
        if not text_content:
            continue

        haystack = text_content.lower()
        for needle, fb_string in needles.items():
            if needle in haystack:
                print(f"\n[MATCH FOUND] in dataset {dataset_id} at index {i}")
                print(f"FB String: {fb_string}")
                print(f"Full Entry: {entry}")

    print(
        f"Finished searching {dataset_id}. Processed {items} items from target sources."
    )


if __name__ == "__main__":
    # Script entry point: run the search over the Tulu 3 SFT mixture.
    search_for_fb_data("allenai/tulu-3-sft-olmo-2-mixture")

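# TULU SOURCES: presumably the distinct `source` values found in the Tulu SFT
# mixture (note the None entry); DATASETS_TO_SEARCH above is a subset of these.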
# {'ai2-adapt-dev/tulu_v3.9_aya_100k', 'ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k', 'ai2-adapt-dev/evol_codealpaca_heval_decontaminated', 'ai2-adapt-dev/tulu_v3.9_wildchat_100k', None, 'ai2-adapt-dev/flan_v2_converted', 'ai2-adapt-dev/oasst1_converted', 'ai2-adapt-dev/numinamath_tir_math_decontaminated', 'ai2-adapt-dev/personahub_code_v2_34999', 'ai2-adapt-dev/coconot_converted', 'ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k', 'ai2-adapt-dev/tulu_v3.9_sciriff_10k', 'ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k', 'ai2-adapt-dev/tulu_v3.9_table_gpt_5k', 'ai2-adapt-dev/personahub_math_v5_regen_149960', 'allenai/tulu-3-sft-personas-math-grade', 'ai2-adapt-dev/no_robots_converted', 'ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980', 'ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k'}
