import pandas as pd
import numpy as np
import torch

def generate_dummy_dataset(n_samples=100, output_path="data_dummy.pkl", *, seed=None):
    """Generate a dummy multimodal dataset and pickle it to disk.

    Each row holds a placeholder French text string plus four random
    float32 embedding vectors drawn from a standard normal distribution,
    with the dimensions used in the paper: text (768), clinical (64),
    procedure (32) and pharma (64).

    Args:
        n_samples: Number of rows to generate.
        output_path: Destination path of the pickled ``pandas.DataFrame``.
        seed: Optional seed for reproducible embeddings. When ``None``
            (the default) the global ``np.random`` state is used, so
            callers that seed NumPy globally get the same values as before.

    Returns:
        The generated ``pandas.DataFrame`` (also written to ``output_path``).
    """
    # A private RandomState makes seeded runs reproducible without touching
    # the global NumPy random state; it exposes the same ``randn`` API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Insertion order matters: the columns are drawn in this order, which
    # keeps the random stream identical to generating them one by one.
    embedding_dims = {
        "text_emb": 768,
        "clinical_emb": 64,
        "procedure_emb": 32,
        "pharma_emb": 64,
    }

    data = {
        "text": [
            f"Ceci est un témoignage patient factice numéro {i}"
            for i in range(n_samples)
        ],
    }
    for column, dim in embedding_dims.items():
        data[column] = [
            rng.randn(dim).astype(np.float32) for _ in range(n_samples)
        ]

    df = pd.DataFrame(data)
    df.to_pickle(output_path)
    print(f"Dummy data set generated : {output_path} ({n_samples} samples)")
    return df

# Script entry point: when this file is executed directly (not imported),
# generate the dummy dataset using the function's default arguments.
if __name__ == "__main__":
    generate_dummy_dataset()