from datasets import load_dataset
import pandas as pd

def _escape(text):
    """Return *text* with non-ASCII and control characters backslash-escaped.

    Newlines become the two characters ``\\n``, so every value occupies a
    single physical line in the exported CSV.
    """
    # 'unicode_escape' always yields pure-ASCII bytes, so this decode is lossless.
    return text.encode('unicode_escape').decode('ascii')


def _as_clean_text(value, field, split_name, idx):
    """Coerce one raw sample field to a stripped string.

    Args:
        value: Raw field value; may be None or a non-string object.
        field: Field name, used only in warning messages.
        split_name: Split name, used only in warning messages.
        idx: Sample index within the split, used only in warning messages.

    Returns:
        A ``(text, was_missing)`` tuple. ``was_missing`` is True when the
        value was None (which includes the key being absent from the sample).
    """
    was_missing = value is None
    if was_missing:
        value = ''
        print(f"Warning: '{field}' is None in split '{split_name}', sample index {idx}. Replacing with empty string.")
    elif not isinstance(value, str):
        value = str(value)
        print(f"Warning: '{field}' is not a string in split '{split_name}', sample index {idx}. Converting to string.")
    return value.strip(), was_missing


def _process_split(split, split_name, max_goal_len, max_samples):
    """Build escaped goal/target lists from one dataset split.

    Every sample in the split is visited so the missing-field counters cover
    the whole split, but a sample is only kept when its index is below
    *max_samples* and its goal is shorter than *max_goal_len* characters.

    Returns:
        A dict with keys ``goals``, ``targets``, ``missing_questions``,
        ``missing_contexts`` and ``processed`` (number of samples visited).
    """
    goals = []
    targets = []
    missing_questions = 0
    missing_contexts = 0
    processed = 0

    print(f"Processing '{split_name}' split...")
    for idx, sample in enumerate(split):
        # .get() without a default yields None for an absent key, so absent
        # keys are warned about and counted exactly like explicit None values.
        context, context_missing = _as_clean_text(
            sample.get('context'), 'context', split_name, idx)
        question, question_missing = _as_clean_text(
            sample.get('question'), 'question', split_name, idx)
        missing_contexts += int(context_missing)
        missing_questions += int(question_missing)

        # NOTE(review): the question is appended twice; this looks deliberate
        # (prompt construction) and is preserved as-is -- confirm with the author.
        goal = context + " \n" + question + " " + question

        if idx < max_samples and len(goal) < max_goal_len:
            goals.append(_escape(goal))
            targets.append(_escape(context))

        processed = idx + 1
        if processed % 1000 == 0:
            print(f"  Processed {processed} samples in '{split_name}' split.")

    return {
        'goals': goals,
        'targets': targets,
        'missing_questions': missing_questions,
        'missing_contexts': missing_contexts,
        'processed': processed,
    }


def main(dataset=None, output_csv='rag_dataset_processed.csv',
         max_goal_len=5500, max_samples=3000):
    """Convert the RAG dataset's train split into a goal/target CSV file.

    For each sample, the goal is the stripped context, a space and newline,
    then the stripped question twice (space separated); the target is the
    stripped context. Both are backslash-escaped before export.

    Args:
        dataset: Mapping with a ``'train'`` entry that iterates over dict-like
            samples carrying ``'context'`` and ``'question'``. When None, the
            ``neural-bridge/rag-dataset-12000`` dataset is loaded with
            ``load_dataset``.
        output_csv: Path of the CSV file to write.
        max_goal_len: Samples whose goal has this many characters or more are
            dropped.
        max_samples: Only samples whose index in the split is below this
            value are eligible for export.
    """
    if dataset is None:
        print("Loading the dataset...")
        dataset = load_dataset("neural-bridge/rag-dataset-12000")
        print("Dataset loaded successfully!")

    result = _process_split(dataset['train'], 'train', max_goal_len, max_samples)

    # Report visited and kept counts separately: the kept count can be much
    # smaller than the number of samples actually processed.
    print(f"Total samples processed: {result['processed']}")
    print(f"Samples kept after filtering: {len(result['goals'])}")
    print(f"Samples with missing 'question': {result['missing_questions']}")
    print(f"Samples with missing 'context': {result['missing_contexts']}")

    print("Creating the DataFrame...")
    df = pd.DataFrame({
        'goal': result['goals'],
        'target': result['targets'],
    })

    print("DataFrame created successfully!")
    print(df.head())

    print(f"Exporting data to '{output_csv}'...")
    df.to_csv(output_csv, index=False, encoding='utf-8')
    print(f"Data successfully exported to '{output_csv}'.")

# Run the export only when this file is executed as a script, so importing
# the module does not trigger the dataset load or write the CSV file.
if __name__ == "__main__":
    main()
