
from sentence_transformers import SentenceTransformer, InputExample
from tqdm import tqdm
import random
from datasets import load_dataset


# Accumulates every (text1, text2) pair collected from the sources in this script.
train_examples = []
dataset = load_dataset("croissantllm/CroissantLLM-2201-sft", split="train").shuffle(seed=42)
# Hold out 0.5% of the rows as a dev split. The split is seeded so that, together
# with the seeded shuffle above, the same rows land in "train" on every run.
dataset = dataset.train_test_split(test_size=0.005, seed=42)
# Dedicated seeded generator for the subsampling below: the kept subset is
# reproducible and the global `random` state is left untouched.
subsample_rng = random.Random(42)
# Number of kept pairs per language tag; "en" and "mixed" rows are subsampled
# below so that French rows make up most of what is kept.
counts = {"en": 0, "fr": 0, "mixed": 0, "fr-en": 0, "en-fr": 0}
for example in tqdm(dataset["train"], desc="Building train examples"):
    turns = example["conversations"]
    if len(turns) < 2:
        # A pair needs the first two turns; skip conversations that lack them
        # instead of crashing with IndexError part-way through the build.
        continue
    t1 = turns[0]["value"]
    t2 = turns[1]["value"]
    # Skip pairs where either side is longer than 2000 characters.
    if len(t2) > 2000 or len(t1) > 2000:
        continue
    lang = example["lang"]
    # Randomly drop about 89% of the English and mixed-language rows.
    if lang in ("en", "mixed") and subsample_rng.random() < 0.89:
        continue
    # .get() so a language tag missing from the initial table is counted
    # rather than raising KeyError.
    counts[lang] = counts.get(lang, 0) + 1
    train_examples.append(InputExample(texts=[t1, t2]))

print(counts)
print(f"Number of training examples: {len(train_examples)} after croissantllm")


# PIAF: pair each question with the context stored alongside it.
dataset = load_dataset("AgentPublic/piaf", split="train")
for row in tqdm(dataset, desc="Building train examples piaf"):
    question, passage = row["question"], row["context"]
    # Keep the pair only when neither side exceeds 2000 characters.
    if max(len(passage), len(question)) <= 2000:
        train_examples.append(InputExample(texts=[question, passage]))

print(f"Number of training examples: {len(train_examples)} after piaf")

# Ordalie FR STS benchmark ("test" split): sentence pairs whose score is below
# 0.5 are skipped, as are pairs with a side longer than 2000 characters.
dataset = load_dataset("OrdalieTech/Ordalie-FR-STS-benchmark", split="test")
for row in tqdm(dataset, desc="Building train examples Ordalie"):
    first, second = row["sentence1"], row["sentence2"]
    too_long = len(second) > 2000 or len(first) > 2000
    if too_long or row["score"] < 0.5:
        continue
    train_examples.append(InputExample(texts=[first, second]))
print(f"Number of training examples: {len(train_examples)} after Ordalie")

# Lajavaness STS12-fr ("train" split): sentence pairs whose score is below 4
# are skipped, as are pairs with a side longer than 2000 characters.
dataset = load_dataset("Lajavaness/STS12-fr", split="train")
for row in tqdm(dataset, desc="Building train examples javaness"):
    first, second = row["sentence1"], row["sentence2"]
    too_long = len(second) > 2000 or len(first) > 2000
    if too_long or row["score"] < 4:
        continue
    train_examples.append(InputExample(texts=[first, second]))

print(f"Number of training examples: {len(train_examples)} after javaness")

# Quora question/answer pairs: shuffle with a fixed seed, then take the first
# 20000 rows of the shuffled dataset.
dataset = (
    load_dataset("HanHan055/quora_question_answer_pair", split="train")
    .shuffle(seed=42)
    .select(range(20000))
)
for row in tqdm(dataset, desc="Building train examples quora"):
    source_text, target_text = row["source"], row["target"]
    # Keep the pair only when neither side exceeds 2000 characters.
    if max(len(target_text), len(source_text)) <= 2000:
        train_examples.append(InputExample(texts=[source_text, target_text]))

print(f"Number of training examples: {len(train_examples)} after quora")

# Sentence-compression: shuffle with a fixed seed, take the first 20000 rows,
# and pair the first two entries of each row's "set" field.
dataset = (
    load_dataset("embedding-data/sentence-compression", split="train")
    .shuffle(seed=42)
    .select(range(20000))
)
for row in tqdm(dataset, desc="Building train examples sentence-compression"):
    entries = row["set"]
    first, second = entries[0], entries[1]
    # Keep the pair only when neither side exceeds 2000 characters.
    if max(len(second), len(first)) <= 2000:
        train_examples.append(InputExample(texts=[first, second]))

print(f"Number of training examples: {len(train_examples)} after sentence-compression")

# query2doc MS MARCO ("validation" split): pair each query with its pseudo document.
dataset = load_dataset("intfloat/query2doc_msmarco", split="validation")
for row in tqdm(dataset, desc="Building train examples msmarco"):
    query, pseudo_doc = row["query"], row["pseudo_doc"]
    # Keep the pair only when neither side exceeds 2000 characters.
    if max(len(pseudo_doc), len(query)) <= 2000:
        train_examples.append(InputExample(texts=[query, pseudo_doc]))

print(f"Number of training examples: {len(train_examples)} after msmarco")

# stsb_multi_mt, "fr" configuration: sentence pairs whose similarity score is
# below 4 are skipped; both sentences are stripped of surrounding whitespace.
dataset = load_dataset("stsb_multi_mt", name="fr", split="train")

for row in tqdm(dataset, desc="Building train examples stsb-multi"):
    # Strip once and reuse the result for both the length check and the pair.
    left = row["sentence1"].strip()
    right = row["sentence2"].strip()
    if len(left) > 2000 or len(right) > 2000 or row["similarity_score"] < 4:
        continue
    train_examples.append(InputExample(texts=[left, right]))


print(f"Number of training examples: {len(train_examples)} after stsb-multi")

# Drop pairs that are too short: both texts must be longer than 15 characters.
train_examples = [
    pair
    for pair in train_examples
    if min(len(pair.texts[0]), len(pair.texts[1])) > 15
]

print(f"Number of training examples: {len(train_examples)} after filtering")


from datasets import Dataset

# Flatten each example into a {"text1", "text2"} row, build a Hugging Face
# Dataset from those rows, and upload it to the Hub.
rows = []
for pair in train_examples:
    rows.append({"text1": pair.texts[0], "text2": pair.texts[1]})
train_dataset = Dataset.from_list(rows)
train_dataset.push_to_hub("manu/embedding_data_v2_100k")
