#!/usr/bin/env python3
"""
增强的RAM++标签映射分析
专门优化diningtable和sofa等复杂类别的映射
"""

import os
import json
from collections import defaultdict

def load_ram_tags(tag_file='/home/gyf/iclr/recognize-anything/ram/data/ram_tag_list.txt'):
    """Load the RAM++ tag vocabulary from a text file with one tag per line.

    Every line is stripped of surrounding whitespace and lower-cased. Blank
    (or whitespace-only) lines are skipped so they are not counted as empty
    tags; as a consequence, list positions only equal line numbers when the
    file contains no blank lines.

    Args:
        tag_file: Path of the tag list to read. Defaults to the
            ``ram_tag_list.txt`` location this script was written against.

    Returns:
        list[str]: Normalized tags in file order (duplicates are kept).

    Raises:
        OSError: If ``tag_file`` cannot be opened.
    """
    with open(tag_file, 'r', encoding='utf-8') as f:
        # Iterate the file object directly; no need to materialize readlines().
        stripped_lines = (line.strip() for line in f)
        return [tag.lower() for tag in stripped_lines if tag]

def find_related_tags(target_words, ram_tags, max_results=20):
    """Return the tags that contain at least one of the target words.

    Each target word is lower-cased and tested as a plain *substring* of
    every tag (tags are compared as given), so a keyword such as ``'table'``
    also matches ``'vegetable'``. Empty keywords are ignored: an empty string
    is a substring of every tag and would otherwise return the whole
    vocabulary.

    Args:
        target_words: Iterable of keyword strings to look for.
        ram_tags: Iterable of tag strings to search.
        max_results: Maximum number of tags to return; the cut is applied
            after sorting, so the alphabetically first matches are kept.

    Returns:
        list[str]: De-duplicated matching tags in ascending sort order,
        truncated to ``max_results`` entries.
    """
    keywords = [word.lower() for word in target_words if word]

    # A set removes duplicates when several keywords hit the same tag.
    hits = {tag for tag in ram_tags if any(kw in tag for kw in keywords)}

    return sorted(hits)[:max_results]

def create_enhanced_voc_mapping():
    """Build a mapping from each VOC class to candidate RAM++ tags.

    For every VOC class three strategies are applied in order:

    1. exact match    - the class name itself is a RAM++ tag;
    2. synonym match  - a hand-written synonym is a RAM++ tag;
    3. semantic match - RAM++ tags containing a hand-written keyword
       (substring search via ``find_related_tags``, capped at 10 tags).

    Progress is printed to stdout while the mapping is built.

    Returns:
        tuple: ``(enhanced_mapping, ram_tags)`` where ``enhanced_mapping``
        maps each VOC class name to a list of ``(match_kind, tag)`` tuples
        (``match_kind`` is ``'exact'``, ``'synonym'`` or ``'semantic'``) and
        ``ram_tags`` is the list returned by ``load_ram_tags()``.
    """
    # Load the RAM++ tags; the set gives O(1) membership tests below.
    ram_tags = load_ram_tags()
    ram_tags_set = set(ram_tags)

    print(f"📊 RAM++标签总数: {len(ram_tags)}")
    print(f"📂 标签文件位置: /home/gyf/iclr/recognize-anything/ram/data/ram_tag_list.txt")
    print("=" * 80)

    # The 20 VOC class names.
    voc_classes = [
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
    ]

    # Both lookup tables are loop-invariant, so they are built once here
    # instead of once per VOC class.
    enhanced_synonyms = {
        'aeroplane': ['airplane', 'aircraft', 'plane', 'airliner', 'jet'],
        'bicycle': ['bike', 'cycle'],
        'diningtable': [
            'dining table', 'table', 'dinning table',  # misspelled variant listed on purpose
            'kitchen table', 'dining room table', 'dinner table'
        ],
        'motorbike': ['motorcycle', 'motor bike', 'bike'],
        'pottedplant': ['potted plant', 'plant', 'houseplant', 'indoor plant'],
        'sofa': ['couch', 'loveseat', 'settee', 'divan'],
        'tvmonitor': ['tv', 'television', 'monitor', 'screen', 'display']
    }
    semantic_keywords = {
        'diningtable': ['table', 'dining', 'kitchen', 'furniture'],
        'sofa': ['couch', 'seat', 'furniture', 'living'],
        'chair': ['chair', 'seat', 'furniture'],
        'bottle': ['bottle', 'container'],
        'boat': ['boat', 'ship', 'vessel', 'yacht'],
        'cow': ['cow', 'cattle', 'bull'],
        'sheep': ['sheep', 'lamb'],
        'horse': ['horse', 'pony', 'stallion', 'mare']
    }

    enhanced_mapping = {}

    for voc_class in voc_classes:
        print(f"\n🔍 分析VOC类别: '{voc_class}'")

        matches = []

        # 1. Exact match on the class name.
        class_tag = voc_class.lower()
        if class_tag in ram_tags_set:
            matches.append(('exact', class_tag))
            print(f"  ✅ 精确匹配: {class_tag}")

        # 2. Hand-written synonyms that exist as RAM++ tags.
        synonyms = enhanced_synonyms.get(voc_class)
        if synonyms is not None:
            print(f"  🔄 检查增强同义词: {synonyms}")
            for syn in synonyms:
                if syn in ram_tags_set:
                    matches.append(('synonym', syn))
                    print(f"    ✅ 找到同义词: {syn}")

        # 3. Keyword (substring) search for semantically related tags.
        keywords = semantic_keywords.get(voc_class)
        if keywords is not None:
            print(f"  🧠 搜索语义相关标签...")
            related_tags = find_related_tags(keywords, ram_tags, max_results=10)

            # Tags already recorded as exact/synonym matches are not repeated.
            seen = {tag for _, tag in matches}
            for tag in related_tags:
                if tag not in seen:
                    seen.add(tag)
                    matches.append(('semantic', tag))
                    print(f"    📝 语义相关: {tag}")

        enhanced_mapping[voc_class] = matches
        print(f"  ✅ 总计找到 {len(matches)} 个匹配")

    return enhanced_mapping, ram_tags

def analyze_specific_classes():
    """Print and return the RAM++ tags related to diningtable and sofa.

    For each of the two classes a fixed keyword list is searched with
    ``find_related_tags`` (at most 15 results) and the hits are printed.

    Returns:
        tuple: ``(dining_related, sofa_related)``, the two lists of
        matching tags.
    """
    divider = "=" * 80
    print("\n" + divider)
    print("🎯 专门分析 diningtable 和 sofa 类别")
    print(divider)

    ram_tags = load_ram_tags()

    def report(heading, keywords):
        # Print the heading, run the keyword search, then list every hit.
        print(heading)
        found = find_related_tags(keywords, ram_tags, max_results=15)
        print(f"  找到 {len(found)} 个相关标签:")
        for tag in found:
            print(f"    📝 {tag}")
        return found

    # diningtable section
    dining_related = report(
        "\n🍽️  DININGTABLE 相关标签分析:",
        ['table', 'dining', 'kitchen', 'furniture', 'desk'],
    )

    # sofa section
    sofa_related = report(
        "\n🛋️  SOFA 相关标签分析:",
        ['sofa', 'couch', 'seat', 'furniture', 'living'],
    )

    return dining_related, sofa_related

def generate_improved_mapping_code(
        output_path='/home/gyf/iclr/recognize-anything/cltag/improved_mapping.py'):
    """Print a generated VOC-to-RAM++ mapping function and save it to a file.

    The generated source defines ``create_improved_voc_mapping(ram_tags_set)``,
    which keeps, per VOC class, only those candidate tags present in
    ``ram_tags_set``.

    Args:
        output_path: Where to write the generated Python source. Defaults to
            the location this script was written against.

    Raises:
        OSError: If ``output_path`` cannot be opened for writing.
    """
    print("\n" + "=" * 80)
    print("🔧 生成改进后的映射代码")
    print("=" * 80)
    
    mapping_code = '''
def create_improved_voc_mapping(ram_tags_set):
    """改进后的VOC到RAM++标签映射"""
    
    voc_to_ram_mapping = {}
    
    # 基础映射
    basic_mapping = {
        'aeroplane': ['plane', 'airplane', 'aircraft', 'airliner'],
        'bicycle': ['bicycle', 'bike', 'cycle'],
        'bird': ['bird'],
        'boat': ['boat', 'ship', 'vessel', 'yacht'],
        'bottle': ['bottle', 'wine bottle', 'beer bottle', 'water bottle', 'glass bottle'],
        'bus': ['bus'],
        'car': ['car'],
        'cat': ['cat'],
        'chair': ['chair', 'armchair', 'rocking chair', 'office chair'],
        'cow': ['cow', 'cattle', 'bull'],
        'diningtable': [
            'table', 'dining table', 'dinning table',  # 包含typo版本
            'kitchen table', 'dining room table', 'furniture'
        ],
        'dog': ['dog'],
        'horse': ['horse', 'pony', 'stallion', 'mare'],
        'motorbike': ['motorbike', 'motorcycle'],
        'person': ['person', 'man', 'woman', 'people'],
        'pottedplant': ['plant', 'potted plant', 'houseplant'],
        'sheep': ['sheep', 'lamb'],
        'sofa': ['couch', 'loveseat', 'sofa'],  # 注意：RAM++中sofa可能不存在
        'train': ['train', 'locomotive', 'bullet train', 'passenger train'],
        'tvmonitor': ['television', 'tv', 'monitor', 'screen']
    }
    
    for voc_class, candidates in basic_mapping.items():
        matches = []
        for candidate in candidates:
            if candidate in ram_tags_set:
                matches.append(candidate)
        voc_to_ram_mapping[voc_class] = matches
    
    return voc_to_ram_mapping
'''
    
    print(mapping_code)
    
    # The template contains non-ASCII comments. Python source files are
    # decoded as UTF-8 by default, so write UTF-8 explicitly instead of
    # relying on the locale's default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(mapping_code)
    
    # Report the file name only (for the default path this is
    # "improved_mapping.py", exactly as before).
    print(f"✅ 改进后的映射代码已保存到: {os.path.basename(output_path)}")

def save_full_tag_analysis(
        enhanced_mapping=None,
        ram_tags=None,
        output_file='/home/gyf/iclr/recognize-anything/cltag/ram_tag_analysis.json'):
    """Save the tag analysis to a JSON file.

    Args:
        enhanced_mapping: Mapping as returned by
            ``create_enhanced_voc_mapping()``. Computed when omitted.
        ram_tags: Tag list as returned by ``create_enhanced_voc_mapping()``.
            Computed when omitted.
        output_file: Destination path of the JSON report.

    When either ``enhanced_mapping`` or ``ram_tags`` is omitted (as in a
    plain ``save_full_tag_analysis()`` call) the analysis is recomputed via
    ``create_enhanced_voc_mapping()``, which also prints its progress output.

    Raises:
        OSError: If ``output_file`` cannot be opened for writing.
    """
    from datetime import date  # local import keeps the file's import block untouched

    if enhanced_mapping is None or ram_tags is None:
        computed_mapping, computed_tags = create_enhanced_voc_mapping()
        if enhanced_mapping is None:
            enhanced_mapping = computed_mapping
        if ram_tags is None:
            ram_tags = computed_tags

    analysis_data = {
        'ram_tags_total': len(ram_tags),
        'ram_tags_file': '/home/gyf/iclr/recognize-anything/ram/data/ram_tag_list.txt',
        'voc_to_ram_mapping': enhanced_mapping,
        'ram_tags_sample': ram_tags[:100],  # first 100 tags as a sample
        # Record the actual run date (YYYY-MM-DD) instead of a fixed literal.
        'analysis_timestamp': date.today().isoformat()
    }

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned rather than left to the locale default.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(analysis_data, f, indent=2, ensure_ascii=False)

    print(f"✅ 完整标签分析已保存到: {output_file}")

def main():
    """Run the whole analysis pipeline and print a closing summary.

    Steps: build the enhanced mapping, analyze diningtable/sofa, generate
    the improved mapping code, save the JSON report, then print a summary.
    Any exception raised along the way is reported with its traceback
    instead of propagating.

    Note: apart from the tag counts, the summary text is fixed wording and
    is not derived from the analysis results.
    """
    rule = "=" * 80
    print("🏷️  增强的RAM++标签映射分析")
    print(rule)

    try:
        # 1. Build the enhanced mapping (only the tag list is used below).
        _unused_mapping, ram_tags = create_enhanced_voc_mapping()

        # 2. Dedicated analysis of the two difficult classes.
        dining_tags, sofa_tags = analyze_specific_classes()

        # 3. Emit the improved mapping code.
        generate_improved_mapping_code()

        # 4. Persist the analysis results.
        save_full_tag_analysis()

        # 5. Summary of the key findings, printed line by line.
        summary_lines = (
            "\n" + rule,
            "📋 重要发现总结",
            rule,
            "\n🍽️  DININGTABLE 映射:",
            "  - 精确匹配: table",
            "  - 同义词: dinning table (注意拼写错误)",
            f"  - 相关标签: {len(dining_tags)} 个",
            "  - 建议主要使用: table, dinning table, kitchen table",
            "\n🛋️  SOFA 映射:",
            "  - 精确匹配: 无 (sofa不在RAM++标签中)",
            "  - 同义词: couch, loveseat",
            f"  - 相关标签: {len(sofa_tags)} 个",
            "  - 建议主要使用: couch, loveseat",
            "\n📍 RAM++标签文件位置:",
            "  /home/gyf/iclr/recognize-anything/ram/data/ram_tag_list.txt",
            f"  总计 {len(ram_tags)} 个标签",
        )
        for line in summary_lines:
            print(line)

    except Exception as err:
        # Top-level boundary: report the failure and show where it came from.
        print(f"❌ 错误: {err}")
        import traceback
        traceback.print_exc()

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()