import os
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def _cell_text(cell):
    """Return the display text for one table cell.

    A non-empty list cell is rendered from its first element; any other
    value (including an empty list) is rendered with ``str()``.
    """
    if isinstance(cell, list) and len(cell) > 0:
        # Coerce to str so a numeric/None first element cannot break the
        # " | ".join(...) performed by the caller.
        return str(cell[0])
    return str(cell)


def _row_text(cells):
    """Join the display text of every cell in a row with ' | '."""
    return " | ".join(_cell_text(cell) for cell in cells)


def format_table_as_text(table_data):
    """Convert table JSON data to formatted text, row by row.

    Args:
        table_data: Mapping that may contain the keys ``'title'``,
            ``'header'`` (a sequence of cells) and ``'data'`` (a sequence
            of rows, each a sequence of cells). Every key is optional.

    Returns:
        A newline-joined string made of: a ``"Table: <title>"`` line followed
        by a blank line (only when ``'title'`` is present), the header row
        and a dash underline of the same length (only when ``'header'`` is
        non-empty), then one line per data row. Cells within a row are
        separated by ``" | "``. Returns an empty string when none of the
        sections are present.
    """
    text_lines = []

    if 'title' in table_data:
        text_lines.append(f"Table: {table_data['title']}")
        text_lines.append("")

    if 'header' in table_data and table_data['header']:
        header_row = _row_text(table_data['header'])
        text_lines.append(header_row)
        # Underline spans exactly the rendered header width.
        text_lines.append("-" * len(header_row))

    if 'data' in table_data and table_data['data']:
        text_lines.extend(_row_text(row) for row in table_data['data'])

    return "\n".join(text_lines)

def write_table_file(args):
    """Write one table's text to disk as UTF-8.

    Args:
        args: A 3-item sequence ``(filename, text, output_dir)``. It is
            packed into a single argument so the function can be passed
            directly to ``map``-style APIs.
    """
    name, content, target_dir = args
    destination = os.path.join(target_dir, name)
    with open(destination, 'w', encoding='utf-8') as handle:
        handle.write(content)

def extract_tables(input_dir, output_dir):
    """Convert every table JSON file under ``<input_dir>/tables_tok`` to text.

    Each ``*.json`` file is parsed, rendered with ``format_table_as_text``
    and written to ``<output_dir>/<same base name>.txt``. Files that fail to
    load or format are reported on stdout and skipped; the remaining files
    are still processed.

    Args:
        input_dir: Directory that contains a ``tables_tok`` sub-directory.
        output_dir: Directory that receives the ``.txt`` files. It is
            created if it does not exist yet.

    Raises:
        OSError: If ``<input_dir>/tables_tok`` cannot be listed, or if
            writing an output file fails.
    """
    tables_tok_path = os.path.join(input_dir, 'tables_tok')
    json_files = [f for f in os.listdir(tables_tok_path) if f.endswith('.json')]

    print(f"Processing {len(json_files)} JSON files in tables_tok...")

    # Make the function usable on its own: without this every write below
    # fails when the caller has not created the directory beforehand.
    os.makedirs(output_dir, exist_ok=True)

    tables = []
    for json_file in tqdm(json_files, desc="Processing files"):
        # Drop the trailing ".json" (guaranteed by the filter above).
        gt_table = json_file[:-5]

        table_file_path = os.path.join(tables_tok_path, json_file)

        try:
            # Read explicitly as UTF-8 so parsing does not depend on the
            # platform's locale encoding; the output is written as UTF-8 too.
            with open(table_file_path, 'r', encoding='utf-8') as f:
                table_data = json.load(f)

            table_text = format_table_as_text(table_data)
        except Exception as e:
            # Best effort: report the bad file and carry on with the rest.
            print(f"Error processing {table_file_path}: {e}")
            continue

        tables.append((f"{gt_table}.txt", table_text, output_dir))

    # Writes are I/O-bound, so fan them out over a thread pool; wrapping the
    # map in list() drains it, which drives the progress bar and re-raises
    # any exception from a worker.
    with ThreadPoolExecutor() as executor:
        list(tqdm(executor.map(write_table_file, tables), total=len(tables), desc="Writing table files"))

if __name__ == "__main__":
    # Command-line entry point: parse the two directory options, make sure
    # the output directory exists, then run the extraction.
    import argparse

    cli = argparse.ArgumentParser(description="Extract tables from HybridQA dataset.")
    cli.add_argument(
        '--input',
        type=str,
        default='WikiTables-WithLinks',
        help='Input directory containing tables_tok',
    )
    cli.add_argument(
        '--output',
        type=str,
        default='tables',
        help='Output directory for extracted tables',
    )
    options = cli.parse_args()

    os.makedirs(options.output, exist_ok=True)
    extract_tables(options.input, options.output)