#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os  
import sys
import pandas as pd
import json
import os
import argparse
from tabulate import tabulate
import shutil
import stat
import random
import pdb

def tsv_to_markdown(path, rows=3):
    """
    Render the first ``rows`` rows of a TSV file as a Markdown table.

    Args:
        path: Path of the tab-separated file to preview.
        rows: Number of leading data rows to include (default 3).

    Returns:
        A string holding a pipe-format table whose headers are the column
        names; the DataFrame index is not shown.
    """
    # Load the whole file, then keep only the leading rows for the preview
    preview = pd.read_csv(path, sep='\t').head(rows)
    return tabulate(preview, headers='keys', tablefmt='pipe', showindex=False)

def save_table_as_tsv(data_path, question_id, prefix_path, mode):
    """
    Convert a pipe-delimited .table file into a read-only TSV file.

    The table at ``data_path`` is parsed with ``|`` as the separator, column
    names are stripped of surrounding whitespace, columns whose name starts
    with ``Unnamed`` are dropped, and the result is written to
    ``<prefix_path>/<mode>/<question_id>_table.tsv``. The written file is then
    marked read-only.

    Args:
        data_path: Path of the source .table file.
        question_id: Identifier used to build the output file name.
        prefix_path: Root directory under which the ``mode`` folder is created.
        mode: Sub-directory name (e.g. ``test`` or ``train``).

    Returns:
        The path of the written TSV file, or ``None`` when reading or writing
        failed (the error is printed instead of raised).
    """
    # Construct the new directory path
    new_dir = os.path.join(prefix_path, mode)
    os.makedirs(new_dir, exist_ok=True)

    # Construct the new file path, replacing .table with .tsv
    new_file_path = os.path.join(new_dir, f"{question_id}_table.tsv")

    try:
        # Read the original .table file
        df = pd.read_csv(data_path, sep='|', quotechar='"', skipinitialspace=True)
        df.columns = [col.strip() for col in df.columns]  # Clean up any leading/trailing spaces in column names

        # Remove unnecessary "Unnamed" columns
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

        # A previous call leaves the target read-only; restore write permission
        # so the file can be overwritten instead of failing with PermissionError
        if os.path.exists(new_file_path):
            os.chmod(new_file_path, stat.S_IREAD | stat.S_IWRITE)

        # Save the content as a .tsv file with '\t' separator
        df.to_csv(new_file_path, sep='\t', index=False)

        # Set the file to read-only
        os.chmod(new_file_path, stat.S_IREAD)

        return new_file_path
    except Exception as e:
        # Best-effort by design: callers treat None as "skip this table"
        print(f"Error saving file {new_file_path}: {str(e)}")
        return None


def process_row(row, prefix_path, mode, num_rows=3):
    """
    Process a single row and return a dictionary with the formatted data.

    The row's 'context' path has '.csv' replaced by '.table' and is joined to
    ``prefix_path``; that table is saved as a read-only TSV under the ``mode``
    folder, and a Markdown preview of its first ``num_rows`` rows is attached.

    Args:
        row: Mapping with the keys 'id', 'utterance', 'context' and
            'targetValue'.
        prefix_path: Root directory for both the source tables and the output.
        mode: Sub-directory name for the saved TSV (e.g. test, train).
        num_rows: Number of table rows shown in the data overview.

    Returns:
        A dict with the keys 'question_id', 'question', 'data_path', 'answer'
        and 'data_overview', or ``None`` if the row could not be processed.
    """
    try:
        # Construct the full data path
        original_data_path = row['context'].replace('.csv', '.table')
        data_path = os.path.join(prefix_path, original_data_path)

        # Save the .table file as a TSV file in the specified mode path and make it read-only
        saved_tsv_path = save_table_as_tsv(data_path, row['id'], prefix_path, mode)
        if not saved_tsv_path:
            return None  # Skip if there's an error saving the file

        return {
            "question_id": row['id'],
            "question": row['utterance'],
            "data_path": saved_tsv_path,
            "answer": row['targetValue'],
            "data_overview": tsv_to_markdown(saved_tsv_path, num_rows),
        }
    except Exception as e:
        # Invalid rows are skipped on purpose, but report why instead of hiding it
        print(f"Error processing row: {e!r}")
        return None

def sample_valid_data(df, prefix_path, mode, sample_size, num_rows=3):
    """
    Sample valid data from the DataFrame until the required sample size is met.

    Rows are shuffled and processed one at a time; rows for which
    ``process_row`` returns a falsy value are skipped.

    Args:
        df: DataFrame of question rows.
        prefix_path: Root directory passed through to ``process_row``.
        mode: Sub-directory name passed through to ``process_row``.
        sample_size: Maximum number of valid entries to collect.
        num_rows: Number of table rows shown in each data overview.

    Returns:
        A list of at most ``sample_size`` processed entries (fewer when the
        DataFrame runs out of valid rows).
    """
    valid_data = []
    rows = df.to_dict('records')  # Convert DataFrame rows to a list of dictionaries
    random.shuffle(rows)  # Shuffle the rows to sample randomly

    for row in rows:
        # Check the quota first so a non-positive sample_size never processes a row
        if len(valid_data) >= sample_size:
            break
        processed_data = process_row(row, prefix_path, mode, num_rows)
        if processed_data:
            valid_data.append(processed_data)

    return valid_data


def tsv_to_jsonl(tsv_path, prefix_path, output_jsonl_path, mode, sample_size=None, num_rows=3):
    """
    Convert the question TSV at ``tsv_path`` into a JSONL file.

    Args:
        tsv_path: Path of the tab-separated question file.
        prefix_path: Root directory for the source tables and saved TSV files.
        output_jsonl_path: Destination of the JSONL output.
        mode: Sub-directory name for the saved TSV files (e.g. test, train).
        sample_size: When truthy, write a random sample of that many valid
            rows; otherwise every valid row is written.
        num_rows: Number of table rows shown in each data overview.
    """
    # Read the TSV file
    df = pd.read_csv(tsv_path, sep='\t')

    if sample_size:
        # Get a sample of valid data
        data_list = sample_valid_data(df, prefix_path, mode, sample_size, num_rows)
    else:
        # Process every row exactly once. A second call for the same row would
        # try to overwrite the read-only TSV written by the first call.
        data_list = []
        for row in df.to_dict('records'):
            processed_data = process_row(row, prefix_path, mode, num_rows)
            if processed_data:
                data_list.append(processed_data)

    # Save the list as JSONL; ensure_ascii=False emits raw non-ASCII text, so
    # the encoding is fixed rather than left to the platform default
    with open(output_jsonl_path, 'w', encoding='utf-8') as jsonl_file:
        for item in data_list:
            jsonl_file.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"Data saved successfully to {output_jsonl_path}. Total valid entries: {len(data_list)}")


def main():
    """Parse the command-line arguments and run the TSV-to-JSONL conversion."""
    parser = argparse.ArgumentParser(description="Convert a TSV file into a JSONL file with specified format, including a data overview in Markdown.")
    parser.add_argument('--csv_path', type=str, required=True, help='Path to the input TSV file.')
    parser.add_argument('--prefix_path', type=str, required=True, help='Prefix path to be joined with the context to form data_path.')
    parser.add_argument('--output_jsonl_path', type=str, required=True, help='Path to save the output JSONL file.')
    parser.add_argument('--mode', type=str, required=True, help='Mode to define the folder structure (e.g., test, train).')
    parser.add_argument('--sample_size', type=int, help='Number of valid data samples to return.')
    parser.add_argument('--sample_rows', type=int, default=3, help='Number of rows of each table to show in the data overview (default: 3).')

    args = parser.parse_args()

    tsv_to_jsonl(args.csv_path, args.prefix_path, args.output_jsonl_path, args.mode, args.sample_size, args.sample_rows)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
