import json
import argparse
from pathlib import Path
from datetime import datetime
from typing import List, Dict

def find_all_json_files(folder_path: Path) -> List[Path]:
    """
    Find all JSON files located directly inside the specified folder.

    The search is not recursive. Only regular files matching ``*.json`` are
    returned; a directory that happens to be named ``something.json`` is
    skipped so that callers can safely open every returned path.

    Args:
        folder_path: Folder to search.

    Returns:
        List of JSON file paths sorted by filename. Empty when ``folder_path``
        does not exist, is not a directory, or contains no JSON files.
    """
    # is_dir() is already False for a missing path, so one check covers both
    # the "does not exist" and the "is not a directory" cases.
    if not folder_path.is_dir():
        return []

    return sorted(
        (entry for entry in folder_path.glob("*.json") if entry.is_file()),
        key=lambda entry: entry.name,
    )



def merge_openhands_trajectories(base_dir: str, output_dir: str = "data/trajectory_original") -> str:
    """
    Merge the last JSON file of every subfolder of ``base_dir`` into one JSONL file.

    For each immediate subfolder (visited in filename order), the JSON files in
    it are listed via ``find_all_json_files`` and the last one in that list is
    loaded. The subfolder name is stored in the record under ``instance_id``
    and the record is appended as one line to
    ``<output_dir>/openhands-deepseek-v3.jsonl``. Subfolders without JSON files,
    or whose last JSON file cannot be read, are counted as errors and skipped.

    A JSON report summarising the run is written to
    ``log/merge_report_<timestamp>.json`` relative to the current working
    directory; the ``log`` directory is created when missing.

    NOTE: the output file is opened in append mode and its name does not
    contain the timestamp, so running the merge twice with the same
    ``output_dir`` adds the records a second time.

    Args:
        base_dir: Base directory whose subfolders are traversed.
        output_dir: Directory in which the JSONL file is created.

    Returns:
        Output JSONL file path.

    Raises:
        FileNotFoundError: If ``base_dir`` does not exist.
        NotADirectoryError: If ``base_dir`` exists but is not a directory.
    """
    base_path = Path(base_dir)

    if not base_path.exists():
        raise FileNotFoundError(f"Base directory does not exist: {base_dir}")
    if not base_path.is_dir():
        # Fail before creating any output rather than inside iterdir() later.
        raise NotADirectoryError(f"Base path is not a directory: {base_dir}")

    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Generate output filename
    model_name = "deepseek-v3"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = output_path / f"openhands-{model_name}.jsonl"

    merged_count = 0
    error_count = 0
    folder_count = 0
    processed_folders = []

    print(f"Starting to traverse directory: {base_dir}")

    # iterdir() yields entries in arbitrary order; sort so that the order of
    # records in the JSONL file is reproducible between runs.
    for subfolder in sorted(base_path.iterdir(), key=lambda entry: entry.name):
        if not subfolder.is_dir():
            continue

        folder_count += 1
        print(f"Processing folder: {subfolder.name}")

        # Find all JSON files
        json_files = find_all_json_files(subfolder)

        if not json_files:
            print(f"  Warning: No JSON files found in folder {subfolder.name}")
            error_count += 1
            continue

        # Get the last JSON file as base data
        last_json = json_files[-1]

        try:
            # Read data from the last JSON file
            with open(last_json, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            # instance_id can only be attached to a JSON object; report any
            # other top-level value with a clear message.
            if not isinstance(json_data, dict):
                raise ValueError(
                    f"expected a JSON object at top level, got {type(json_data).__name__}"
                )

            # Add subfolder name as instance_id to JSON data
            json_data['instance_id'] = subfolder.name

            # Serialize first so a failure cannot leave a partial line behind
            line = json.dumps(json_data, ensure_ascii=False)

            # Write to JSONL file
            with open(output_file, 'a', encoding='utf-8') as f:
                f.write(line + '\n')

            merged_count += 1
            processed_folders.append({
                "folder_name": subfolder.name,
                "json_files_count": len(json_files),
                "base_json_file": last_json.name,
                "base_json_file_path": str(last_json),
                "all_json_files": [json_file.name for json_file in json_files]
            })

            print(f"  ✓ Successfully merged: {last_json.name}")

        except json.JSONDecodeError as e:
            print(f"  ✗ JSON parsing error {last_json.name}: {e}")
            error_count += 1
        except Exception as e:
            # Keep going: one unreadable folder must not abort the whole merge.
            print(f"  ✗ Error processing file {last_json.name}: {e}")
            error_count += 1

    # Generate processing report
    report = {
        "timestamp": datetime.now().isoformat(),
        "base_directory": str(base_path),
        "output_file": str(output_file),
        # Count only the subfolders actually visited, not every directory entry
        "total_folders_processed": folder_count,
        "successful_merges": merged_count,
        "errors": error_count,
        "processed_folders": processed_folders
    }

    # Save processing report (make sure the log directory exists first, so a
    # finished merge is not followed by a FileNotFoundError)
    Path("log").mkdir(parents=True, exist_ok=True)
    report_file = f"log/merge_report_{timestamp}.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=4, ensure_ascii=False)

    print(f"\n=== Merge Completed ===")
    print(f"Successfully merged: {merged_count} files")
    print(f"Error count: {error_count}")
    print(f"Output file: {output_file}")
    print(f"Processing report: {report_file}")

    return str(output_file)

def main():
    """
    Command line entry point: parse the arguments and run the merge.

    Options:
        --base-directory: Directory whose subfolders are merged.
        --output-dir: Directory that receives the JSONL output file.

    On failure the error message is printed to stderr and the process exits
    with status 1, so that shells and pipelines can detect the failure.
    """
    import sys  # local import: only needed for error reporting in this entry point

    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Merge OpenHands trajectory files to JSONL format')
    parser.add_argument(
        '--base-directory',
        type=str,
        default="OpenHands/evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent/deepseek-v3_maxiter_30_N_v0.42.0-no-hint-run_1/llm_completions",
        help='Target directory path (default: OpenHands evaluation output directory)'
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default="data/trajectory_original",
        help='Output directory path (default: data/trajectory_original)'
    )

    args = parser.parse_args()

    try:
        # Execute merge
        output_file = merge_openhands_trajectories(args.base_directory, args.output_dir)
    except Exception as e:
        # Top-level boundary: report the problem and signal failure through
        # the exit status instead of returning as if everything succeeded.
        print(f"Error occurred during execution: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"\nMerge operation completed! Output file: {output_file}")

# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()