import json
import os
import argparse

def parse_args(argv=None) -> argparse.Namespace:
    """Build the command-line parser and parse the token-selection options.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) lets argparse read
            ``sys.argv[1:]``, which is what a plain ``parse_args()`` call
            has always done.

    Returns:
        A namespace with ``input_file`` (str, required), ``output_file``
        (str, required) and ``top_n`` (int, defaults to 500).
    """
    parser = argparse.ArgumentParser(description="Select Top-N tokens from scored mining results.")
    parser.add_argument("--input_file", type=str, required=True,
                        help="Path to the scored tokens file")
    parser.add_argument("--output_file", type=str, required=True,
                        help="Path to save the selected final token list (.json)")
    parser.add_argument("--top_n", type=int, default=500,
                        help="Number of tokens to select (-1 for all)")
    # Passing None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)

def main(args=None):
    """Select the highest-saving tokens from a scored JSON file and save them.

    Loads the JSON content of ``input_file`` (used as a list of dict records),
    sorts the records by their ``total_savings`` value from highest to lowest
    (records without that key count as 0), keeps the first ``top_n`` of them
    and writes only their ``token`` values to ``output_file`` as a JSON list.
    Progress messages and a preview of the first five tokens go to stdout.

    Args:
        args: Optional pre-parsed namespace exposing ``input_file``,
            ``output_file`` and ``top_n``. When ``None`` (the default) the
            values are obtained from ``parse_args()``.

    Returns:
        None. If the input file does not exist, or ``top_n`` is a negative
        value other than -1, an error is printed and the function returns
        early without writing anything.
    """
    if args is None:
        args = parse_args()

    print("\n=== Starting Token Selection ===")
    print(f"Input: {os.path.basename(args.input_file)}")

    # A negative count other than the -1 sentinel would reach the slice below
    # and silently drop items from the END of the list, so reject it up front.
    if args.top_n < -1:
        print(f"❌ Error: --top_n must be -1 (all) or a non-negative integer, got {args.top_n}")
        return

    # 1. Load Data
    if not os.path.exists(args.input_file):
        print(f"❌ Error: Input file not found: {args.input_file}")
        return

    with open(args.input_file, 'r', encoding='utf-8') as f:
        scored_tokens = json.load(f)

    total_found = len(scored_tokens)
    print(f"Loaded {total_found} candidates.")

    # 2. Sort by Savings (Ensure descending order)
    scored_tokens.sort(key=lambda x: x.get("total_savings", 0), reverse=True)

    # 3. Slice Top-N with Explicit Safety Check
    if args.top_n == -1:
        # Case A: User asked for ALL
        selected_data = scored_tokens
        selection_desc = f"ALL ({total_found})"

    elif args.top_n >= total_found:
        # Case B: User asked for at least as many as are available
        selected_data = scored_tokens
        selection_desc = f"ALL ({total_found})"
        print(f"⚠️  Note: Requested Top {args.top_n}, but only found {total_found}. Selecting all available.")

    else:
        # Case C: Standard selection
        selected_data = scored_tokens[:args.top_n]
        selection_desc = f"Top {args.top_n}"

    # 4. Extract Token Strings
    final_tokens = [item['token'] for item in selected_data]

    # 5. Save
    # dirname() is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(args.output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(args.output_file, 'w', encoding='utf-8') as f:
        json.dump(final_tokens, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Selection Complete: {selection_desc}")
    print(f"Saved to: {args.output_file}")

    # 6. Preview
    print("\n🔍 Sample Preview (First 5):")
    print("-" * 40)
    for i, token in enumerate(final_tokens[:5]):
        print(f"  {i+1}: {repr(token)}")
    print("-" * 40)

# Run the selection only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()