import os
import json
import argparse
from tqdm import tqdm

def check_empty_categories(dataset_dir: str) -> dict:
    """Count metadata records with missing or empty ``categories`` per dataset.

    Every immediate sub-directory ``<name>`` of *dataset_dir* is checked for a
    file named ``meta_<name>.jsonl``; sub-directories without that file are
    skipped silently. Each line of the file is parsed as one JSON value.

    A record is counted as "empty" when it has no ``"categories"`` key, when
    that key holds a falsy value (``None``, ``[]``, ``""`` ...), or when the
    parsed value is not a JSON object at all. Lines that are not valid JSON are
    reported on stdout together with their 1-based line number; they are not
    counted as empty but still contribute to ``total_lines``.

    Progress and per-dataset results are printed to stdout while scanning.

    Args:
        dataset_dir: Directory whose sub-directories are the dataset folders.

    Returns:
        A dict mapping each processed dataset name to a dict with the keys
        ``"total_lines"``, ``"empty_categories"`` and ``"percentage"``
        (``empty_categories / total_lines * 100``, or ``0`` for an empty file).

    Raises:
        OSError: If *dataset_dir* cannot be listed or a meta file cannot be
            opened/read.
    """
    results = {}
    dataset_folders = [
        name
        for name in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, name))
    ]

    print(f"Found {len(dataset_folders)} dataset folders")

    for dataset_name in dataset_folders:
        meta_file = os.path.join(dataset_dir, dataset_name, f"meta_{dataset_name}.jsonl")

        if not os.path.exists(meta_file):
            continue

        print(f"Processing {meta_file}...")

        # First pass: count the lines so the progress bar knows its total.
        with open(meta_file, 'r', encoding='utf-8') as f:
            total_lines = sum(1 for _ in f)

        empty_categories = 0

        # Second pass: inspect every record behind a progress bar.
        with open(meta_file, 'r', encoding='utf-8') as f:
            progress = tqdm(f, total=total_lines, desc="Processing lines")
            for line_number, line in enumerate(progress, start=1):
                try:
                    item = json.loads(line.strip())
                except json.JSONDecodeError:
                    # Report the line actually being parsed, not the file's
                    # total line count.
                    print(f"Error decoding JSON in {meta_file} at line {line_number}")
                    continue

                # A non-object value (null, number, string, list) cannot carry
                # a "categories" field; count it as empty instead of crashing
                # with TypeError on the membership test / subscript.
                if not isinstance(item, dict) or not item.get("categories"):
                    empty_categories += 1

        results[dataset_name] = {
            "total_lines": total_lines,
            "empty_categories": empty_categories,
            "percentage": (empty_categories / total_lines * 100) if total_lines > 0 else 0
        }

        print(f"Results for {dataset_name}:")
        print(f"  Total lines: {total_lines}")
        print(f"  Empty categories: {empty_categories}")
        print(f"  Percentage: {results[dataset_name]['percentage']:.2f}%")

    return results

def save_results(results, output_file):
    """Write *results* to *output_file* as 2-space-indented JSON.

    The file is opened for writing with UTF-8 encoding, so existing content
    is replaced. A confirmation line naming the file is printed afterwards.
    """
    with open(output_file, mode='w', encoding='utf-8') as report_handle:
        json.dump(results, report_handle, indent=2)

    print(f"Results saved to {output_file}")

def main():
    """Command-line entry point.

    Parses ``--dataset_dir`` and ``--output``, runs the empty-category scan
    over the dataset directory, prints a one-line summary per dataset, and
    saves the collected results as JSON to the output path.
    """
    arg_parser = argparse.ArgumentParser(
        description='Check for empty categories in meta files'
    )
    arg_parser.add_argument(
        '--dataset_dir',
        type=str,
        default='/home/yqiao47/dataset',
        help='Directory containing dataset folders (default: /home/yqiao47/dataset)',
    )
    arg_parser.add_argument(
        '--output',
        type=str,
        default='empty_categories_report.json',
        help='Output file for results (default: empty_categories_report.json)',
    )
    options = arg_parser.parse_args()

    print(f"Checking for empty categories in {options.dataset_dir}")
    report = check_empty_categories(options.dataset_dir)

    print("\n=== Summary ===")
    for name, stats in report.items():
        empty = stats['empty_categories']
        total = stats['total_lines']
        pct = stats['percentage']
        print(f"{name}: {empty}/{total} empty categories ({pct:.2f}%)")

    save_results(report, options.output)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()