#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os  
import sys
import pandas as pd
import json
import argparse
from tabulate import tabulate
import random

def tsv_to_markdown(path, rows=3):
    """
    Render the leading rows of a tab-separated file as a Markdown table.

    The file at `path` is parsed with pandas using a tab delimiter, the
    first `rows` rows are kept, and the result is formatted by `tabulate`
    in "pipe" style with the column names as headers and no index column.
    Returns the table as a string.
    """
    preview = pd.read_csv(path, sep='\t').head(rows)
    return tabulate(preview, headers='keys', tablefmt='pipe', showindex=False)

def process_row(row, prefix_path, mode, num_rows=3):
    """
    Process a single row and return a dictionary with the formatted data.

    Args:
        row: Mapping-like record (a dict or a pandas Series) that provides
            the 'id', 'utterance', 'context' and 'targetValue' fields.
        prefix_path: Directory joined with row['context'] to locate the table.
        mode: Not used by this function; kept so existing callers keep working.
        num_rows: Number of table rows to include in the Markdown overview.

    Returns:
        A dict with the keys 'question_id', 'question', 'data_path', 'answer'
        and 'data_overview', or None when the table file does not exist or
        any error occurs while building the entry (the error is printed).
    """
    try:
        # Construct the full data path
        data_path = os.path.join(prefix_path, row['context'])

        # The context points at a .csv file but the table that is read is the
        # .tsv variant. Swap only the file extension so that a '.csv'
        # substring elsewhere in the path (e.g. in prefix_path) is untouched.
        base, extension = os.path.splitext(data_path)
        if extension == '.csv':
            data_path = base + '.tsv'

        # Check the file that will actually be read, not the .csv name.
        if not os.path.exists(data_path):
            return None

        # Convert the TSV to a Markdown table for data overview
        data_overview = tsv_to_markdown(data_path, num_rows)

        return {
            "question_id": row['id'],
            "question": row['utterance'],
            "data_path": data_path,
            "answer": row['targetValue'],
            "data_overview": data_overview
        }
    except Exception as e:
        # Deliberate best-effort: a bad row is reported and skipped. The id is
        # looked up defensively so that a row without an 'id' field cannot
        # raise a second error from inside this handler.
        row_id = row.get('id', '<unknown>') if hasattr(row, 'get') else '<unknown>'
        print(f"Error processing row {row_id}: {str(e)}")
        return None

def sample_valid_data(df, prefix_path, mode, sample_size, num_rows=3):
    """
    Collect up to `sample_size` successfully processed rows in random order.

    The DataFrame rows are converted to dictionaries, shuffled in place with
    `random.shuffle`, and passed one by one to `process_row`. Rows for which
    `process_row` returns a falsy value are skipped. Iteration stops as soon
    as the number of collected entries reaches `sample_size`; if the rows run
    out first, fewer entries are returned.
    """
    candidates = df.to_dict('records')
    random.shuffle(candidates)

    collected = []
    for candidate in candidates:
        entry = process_row(candidate, prefix_path, mode, num_rows)
        if entry:
            collected.append(entry)
        # The size check runs after every row, whether or not it was kept.
        if len(collected) < sample_size:
            continue
        break

    return collected

def tsv_to_jsonl(tsv_path, prefix_path, output_jsonl_path, mode, sample_size=None, num_rows=3):
    """
    Converts a TSV file to JSONL format with the specified options.

    Args:
        tsv_path: Path of the tab-separated input file, one question per row.
        prefix_path: Directory forwarded to `process_row` to locate each table.
        output_jsonl_path: Destination file; overwritten if it already exists.
        mode: Forwarded unchanged to `process_row` / `sample_valid_data`.
        sample_size: When truthy, only that many randomly chosen valid rows
            are written; otherwise every valid row is written in file order.
        num_rows: Number of table rows included in each data overview.
    """
    # Read the TSV file
    df = pd.read_csv(tsv_path, sep='\t')

    if sample_size:
        # Get a sample of valid data
        data_list = sample_valid_data(df, prefix_path, mode, sample_size, num_rows)
    else:
        # Process all rows without sampling. Each row is processed exactly
        # once, so every table is read once and errors are reported once.
        data_list = []
        for _, row in df.iterrows():
            item = process_row(row, prefix_path, mode, num_rows)
            if item:
                data_list.append(item)

    # Save the list as JSONL. ensure_ascii=False emits raw non-ASCII
    # characters, so the file is written as UTF-8 regardless of the locale.
    with open(output_jsonl_path, 'w', encoding='utf-8') as jsonl_file:
        for item in data_list:
            jsonl_file.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"Data saved successfully to {output_jsonl_path}. Total valid entries: {len(data_list)}")

def main():
    """
    Command-line entry point: parse the options and run the conversion.

    Four string options are required (--csv_path, --prefix_path,
    --output_jsonl_path, --mode); --sample_size is an optional integer and
    --sample_rows is an integer defaulting to 3. The parsed values are
    passed positionally to `tsv_to_jsonl`.
    """
    cli = argparse.ArgumentParser(description="Convert a TSV file into a JSONL file with specified format, including a data overview in TSV.")

    # Required string options, registered in the order they appear in --help.
    required_text_options = (
        ('--csv_path', 'Path to the input TSV file.'),
        ('--prefix_path', 'Prefix path to be joined with the context to form data_path.'),
        ('--output_jsonl_path', 'Path to save the output JSONL file.'),
        ('--mode', 'Mode to define the folder structure (e.g., test, train).'),
    )
    for flag, help_text in required_text_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    # Optional integer options.
    cli.add_argument('--sample_size', type=int, help='Number of valid data samples to return.')
    cli.add_argument('--sample_rows', type=int, default=3, help='Number of rows for each table to return.')

    opts = cli.parse_args()

    tsv_to_jsonl(
        opts.csv_path,
        opts.prefix_path,
        opts.output_jsonl_path,
        opts.mode,
        opts.sample_size,
        opts.sample_rows,
    )

# Run the command-line conversion only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
