from curses.ascii import SYN
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

# Source labels kept by search_for_fb_data when the script scans
# "allenai/Dolci-Instruct-SFT" (matched against its "source_dataset" column);
# rows whose label is not listed here are skipped. Also the default filter.
OLMO3_DATASETS_TO_SEARCH = [
    "Dolci Instruct OpenThoughts3+ Science",
    "Dolci Instruct Precise IF",
    "WildGuardMix",
    "Hardcoded Data",
    "Verifiable Reasoning",
    "Wildchat",
]
# Source labels kept when the script scans "allenai/Dolci-Think-SFT-32B"
# (matched against its "source" column).
OLMO3_THINK_DATASETS_TO_SEARCH = [
    "allenai/dolci-thinking-sft-tagged-topic-ade-non-wildchat-keyword-filter",
    "allenai/dolci-thinking-sft-tagged-topic-ade-wildchat-only-topic-filtered-keyword-filtered",
    "saurabh5/hard-coded-olmo-qwen3-vl-32b-thinking-traces-hand-filtered",
]


def load_fb_data(
    csv_path="data/fb_stimuli.csv",
    cols=("passage", "attn_check_1_q", "attn_check_2_q", "critical_q"),
):
    """Collect the text cells of the stimulus CSV into one flat list.

    Args:
        csv_path: Path to the comma-separated stimulus file.
        cols: Names of the columns whose cells are collected, in this order.

    Returns:
        A list of strings holding every non-missing cell of each column in
        ``cols``, column by column.

    Raises:
        KeyError: If a requested column is absent from the CSV.
    """
    # Parse the file once; its contents do not change between columns.
    fb_data = pd.read_csv(csv_path, delimiter=",")

    all_texts = []
    for col in cols:
        # Empty cells are parsed as NaN (a float). Drop them so every item
        # returned is a real string that callers can safely call .lower() on.
        all_texts.extend(fb_data[col].dropna().astype(str).tolist())

    return all_texts


def find_unique_sources(dataset_id, source_col_name="source_dataset"):
    """Print the distinct source labels found in a dataset's train split.

    Every row of the ``train`` split of ``dataset_id`` is visited; the value
    under ``source_col_name`` (or ``"N/A"`` when the row has no such key) is
    recorded the first time it is seen. The labels are printed in
    first-seen order; nothing is returned.
    """
    train_split = load_dataset(dataset_id, split="train")
    seen_labels = []

    for row in tqdm(train_split, total=len(train_split)):
        label = row.get(source_col_name, "N/A")
        if label in seen_labels:
            continue
        seen_labels.append(label)

    print(f"Unique dataset sources in {dataset_id}: {seen_labels}")


def search_for_fb_data(
    dataset_id,
    source_col_name="source_dataset",
    target_sources=OLMO3_DATASETS_TO_SEARCH,
):
    """Report dataset entries whose first message contains a stimulus string.

    Loads the ``train`` split of ``dataset_id``, keeps only entries whose
    ``source_col_name`` value is in ``target_sources``, and performs a
    case-insensitive substring search of every string returned by
    ``load_fb_data()`` against the content of the entry's first message.
    Each hit is printed, followed by a final summary line.

    Args:
        dataset_id: Identifier passed to ``load_dataset``.
        source_col_name: Key of each entry holding its source label.
        target_sources: Source labels to search; other entries are skipped.

    Returns:
        None. Results are reported on stdout.
    """
    # Lowercase each needle once up front instead of once per dataset entry.
    # Non-string cells (e.g. NaN from empty CSV cells) would crash on
    # .lower(), and blank strings would match every entry, so skip both.
    needles = [
        (fb_string, fb_string.lower())
        for fb_string in load_fb_data()
        if isinstance(fb_string, str) and fb_string.strip()
    ]
    items = 0
    ds = load_dataset(dataset_id, split="train", num_proc=4)

    for i, entry in tqdm(enumerate(ds), total=len(ds)):
        if entry[source_col_name] not in target_sources:
            continue
        items += 1
        # Only the first message of the conversation is inspected; lowercase
        # it once per entry rather than once per needle.
        text_content = entry["messages"][0]["content"].lower()
        for fb_string, needle in needles:
            if needle in text_content:
                print(f"\n[MATCH FOUND] in dataset {dataset_id} at index {i}")
                print(f"FB String: {fb_string}")
                print(f"Full Entry: {entry}")

    print(
        f"Finished searching {dataset_id}. Processed {items} items from target sources."
    )


if __name__ == "__main__":
    # One (dataset id, source column, allowed source labels) triple per SFT
    # mix. For each, the unique source labels are listed first and the
    # filtered search is run second, in the order given here.
    search_plan = [
        ## Think
        ("allenai/Dolci-Think-SFT-32B", "source", OLMO3_THINK_DATASETS_TO_SEARCH),
        ## Instruct
        ("allenai/Dolci-Instruct-SFT", "source_dataset", OLMO3_DATASETS_TO_SEARCH),
    ]

    for target_dataset, source_column, allowed_sources in search_plan:
        find_unique_sources(target_dataset, source_col_name=source_column)
        search_for_fb_data(
            target_dataset,
            source_col_name=source_column,
            target_sources=allowed_sources,
        )
