import json
import pandas as pd
from transformers import AutoTokenizer
import torch

def _split_messages(messages):
    """Split chat messages into a prompt transcript and the assistant reply.

    Every non-assistant message becomes one ``"Role: content"`` line of the
    transcript, in input order. When several assistant messages are present
    the last one is returned, because each one overwrites the previous.

    Returns:
        A ``(conversation, assistant_response)`` tuple; both are ``""`` when
        there is nothing of the respective kind.

    Raises:
        KeyError: If a message lacks the ``'role'`` or ``'content'`` key.
    """
    prompt_lines = []
    assistant_response = ""
    for message in messages:
        if message['role'] == 'assistant':
            assistant_response = message['content']
        else:
            prompt_lines.append(
                f"{message['role'].capitalize()}: {message['content']}"
            )
    # Joining with "\n" and stripping the tail gives the same text as
    # appending "\n" after every line and then calling rstrip().
    return "\n".join(prompt_lines).rstrip(), assistant_response


def jsonl_to_csv(input_jsonl, output_csv, model_name="meta-llama/Llama-3.1-8B-Instruct"):
    """Convert a chat-format JSONL file into a CSV of text and token IDs.

    Each non-blank input line is parsed as a JSON object whose optional
    ``'messages'`` list holds dicts with ``'role'`` and ``'content'`` keys.
    One CSV row is written per record with the columns ``conversation``,
    ``encoded_conversation``, ``assistant_response`` and ``encoded_response``.
    The two ``encoded_*`` columns hold the token-ID lists produced by the
    tokenizer's ``encode`` method.

    Args:
        input_jsonl: Path of the JSONL file to read (decoded as UTF-8).
        output_csv: Path of the CSV file to write (no index column).
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a message lacks the ``'role'`` or ``'content'`` key.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    columns = [
        'conversation',
        'encoded_conversation',
        'assistant_response',
        'encoded_response',
    ]
    rows = []

    with open(input_jsonl, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank or whitespace-only lines (e.g. a trailing newline at the
            # end of the file) are not records; json.loads would raise on them.
            if not line.strip():
                continue

            data = json.loads(line)
            conversation, assistant_response = _split_messages(
                data.get('messages', [])
            )

            # encode() already returns a plain list of token IDs, so there is
            # no need to build a tensor and convert it back.
            rows.append({
                'conversation': conversation,
                'encoded_conversation': list(tokenizer.encode(conversation)),
                'assistant_response': assistant_response,
                'encoded_response': list(tokenizer.encode(assistant_response)),
            })

    # Passing the column list keeps the header stable even with zero rows.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(output_csv, index=False)

if __name__ == "__main__":
    # Script entry point: convert the full dataset in the current directory,
    # using the default tokenizer model of jsonl_to_csv.
    jsonl_to_csv("./full_data.jsonl", "./full_data.csv")
