#!/usr/bin/env python3
"""
Dermatological Vocabulary Collector
Specialized in collecting vocabulary for dermatological image categories, with output format fully matching grounding_dino_medical_enhanced.py
"""

import os
import json
import re
from collections import Counter, defaultdict
from pathlib import Path

# Make NLTK available and, when a local data directory is present, register it
# as an additional search location for NLTK resources.
import nltk

nltk_data_path = "/root/autodl-tmp/nltk_data"
_use_local_nltk_data = os.path.exists(nltk_data_path)
if _use_local_nltk_data:
    nltk.data.path.append(nltk_data_path)

print(
    f"Using local NLTK data: {nltk_data_path}"
    if _use_local_nltk_data
    else "Using default NLTK data path"
)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize


def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Blank lines are ignored. A line that is not valid JSON is reported (with
    its line number) and skipped, so a single corrupt record no longer
    discards every record that follows it. The file is decoded as
    ``utf-8-sig`` so a leading byte-order mark is dropped.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list with one parsed JSON value per valid, non-blank line, in file
        order. The list is empty when the file cannot be opened; if reading
        fails part-way (I/O or decoding error), the records parsed so far are
        returned.
    """
    data = []
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Best effort: keep going so one bad record does not
                    # silently truncate the data set.
                    print(f"Skipping invalid JSON on line {line_number} of {file_path}: {e}")
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError covers
        # UnicodeDecodeError raised while decoding the stream.
        print(f"Failed to load file {file_path}: {e}")
    return data


class DermatologicalVocabularyCollector:
    """Dermatological Image Vocabulary Collector"""

    def __init__(self):
        """Build the predefined vocabularies and the stop-word sets.

        After construction the instance exposes:

        * ``stop_words`` - English stop words from NLTK, or a small built-in
          fallback set when the NLTK corpus cannot be loaded.
        * ``medical_stop_words`` - generic report words that carry no
          vocabulary value (e.g. 'patient', 'findings').
        * ``all_stop_words`` - the union of the two sets above; this is the
          set consulted by ``extract_from_text``.
        """
        # Predefined vocabulary
        self.setup_predefined_vocabularies()

        # Get stop words
        try:
            self.stop_words = set(stopwords.words('english'))
        except Exception:
            # Any failure to load the corpus falls back to a minimal list.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit still propagate.
            self.stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}

        self.medical_stop_words = {
            'patient', 'study', 'examination', 'exam', 'scan', 'image', 'imaging', 'shows', 'demonstrates',
            'reveals', 'appears', 'seen', 'noted', 'identified', 'findings', 'finding', 'report',
            'impression', 'conclusion', 'comparison', 'compared', 'previous', 'prior', 'technique'
        }

        self.all_stop_words = self.stop_words.union(self.medical_stop_words)

    def setup_predefined_vocabularies(self):
        """Set up predefined dermatological image vocabularies.

        Populates five set attributes on the instance:
        ``normal_adjectives``, ``abnormal_adjectives``, ``direction_terms``,
        ``anatomical_nouns`` and ``dermatological_abbreviations``.

        All entries are lower-case. Every entry is a single token (at most
        hyphenated, e.g. 'cafe-au-lait'); none contains a space. Because the
        containers are set literals, words that are written more than once
        below (e.g. 'cell', 'ulcer', 'syndrome') are stored only once.
        """

        # Adjectives indicating normal/no abnormality
        self.normal_adjectives = {
            'normal', 'healthy', 'intact', 'clear', 'smooth', 'even', 'uniform',
            'regular', 'symmetric', 'symmetrical', 'appropriate', 'adequate',
            'satisfactory', 'good', 'excellent', 'optimal', 'stable', 'unchanged',
            'unremarkable', 'benign', 'physiologic', 'physiological', 'typical',
            'routine', 'baseline', 'well-demarcated', 'well-defined', 'sharp'
        }

        # Adjectives indicating abnormality
        # NOTE(review): 'well-defined', 'well-demarcated', 'sharp', 'smooth',
        # 'symmetric', 'benign' and 'stable' are also in normal_adjectives.
        # extract_from_text checks normal_adjectives first, so these words are
        # never reported as abnormal adjectives.
        self.abnormal_adjectives = {
            # Color characteristics
            'pigmented', 'hyperpigmented', 'hypopigmented', 'depigmented',
            'melanotic', 'amelanotic', 'erythematous', 'violaceous', 'cyanotic',
            'yellow', 'brown', 'black', 'blue', 'red', 'pink', 'white', 'gray',
            'tan', 'bronze', 'golden', 'copper', 'silver', 'pearl', 'flesh-colored',

            # Morphological characteristics
            'raised', 'elevated', 'flat', 'depressed', 'umbilicated', 'dome-shaped',
            'pedunculated', 'sessile', 'papular', 'nodular', 'plaque-like',
            'vesicular', 'bullous', 'pustular', 'cystic', 'solid', 'fluid-filled',
            'keratotic', 'hyperkeratotic', 'scaling', 'crusted', 'ulcerated',
            'erosive', 'verrucous', 'warty', 'lobulated', 'multilobulated',

            # Border characteristics
            'well-defined', 'well-demarcated', 'well-circumscribed', 'ill-defined',
            'poorly-defined', 'irregular', 'asymmetric', 'scalloped', 'notched',
            'sharp', 'blurred', 'fuzzy', 'indistinct', 'geographic', 'serpiginous',

            # Surface characteristics
            'smooth', 'rough', 'irregular', 'granular', 'papillomatous', 'verrucous',
            'hyperkeratotic', 'scaling', 'flaky', 'crusty', 'moist', 'dry',
            'shiny', 'dull', 'glossy', 'matte', 'translucent', 'opaque',

            # Consistency and hardness
            'soft', 'firm', 'hard', 'indurated', 'fluctuant', 'compressible',
            'non-compressible', 'mobile', 'fixed', 'adherent', 'tethered',

            # Distribution patterns
            'localized', 'generalized', 'widespread', 'diffuse', 'scattered',
            'clustered', 'grouped', 'linear', 'annular', 'targetoid', 'reticular',
            'bilateral', 'unilateral', 'symmetric', 'asymmetric', 'dermatomal',
            'zosteriform', 'herpetiform', 'serpiginous', 'arcuate',

            # Size and quantity
            'small', 'large', 'tiny', 'huge', 'giant', 'multiple', 'numerous',
            'few', 'single', 'solitary', 'isolated', 'coalescent', 'confluent',
            'discrete', 'extensive', 'minimal', 'moderate', 'marked', 'severe',

            # Inflammatory characteristics
            'inflammatory', 'inflamed', 'irritated', 'infected', 'suppurative',
            'purulent', 'serous', 'hemorrhagic', 'necrotic', 'gangrenous',
            'ischemic', 'edematous', 'swollen', 'tender', 'painful', 'pruritic',

            # Vascular characteristics
            'vascular', 'hypervascular', 'avascular', 'telangiectatic', 'blanching',
            'non-blanching', 'pulsatile', 'thrombosed', 'varicose', 'spider',

            # Pathological status
            'malignant', 'benign', 'premalignant', 'dysplastic', 'atypical',
            'suspicious', 'concerning', 'worrisome', 'invasive', 'metastatic',
            'primary', 'secondary', 'recurrent', 'persistent', 'progressive',
            'stable', 'regressing', 'resolving', 'healing', 'chronic', 'acute'
        }

        # Dermatology-specific directional terms
        # NOTE(review): some words here ('localized', 'generalized', 'linear',
        # 'bilateral', 'unilateral', 'reticular', 'dermatomal') are also in
        # abnormal_adjectives, which extract_from_text checks before this set.
        self.direction_terms = {
            # Body parts
            'head', 'neck', 'face', 'scalp', 'forehead', 'temple', 'cheek', 'chin',
            'nose', 'ear', 'eyelid', 'lip', 'mouth', 'jaw', 'throat',

            # Trunk
            'chest', 'breast', 'back', 'abdomen', 'flank', 'waist', 'hip',
            'buttock', 'groin', 'axilla', 'armpit', 'shoulder', 'scapula',

            # Extremities
            'arm', 'forearm', 'elbow', 'wrist', 'hand', 'finger', 'thumb',
            'leg', 'thigh', 'knee', 'shin', 'calf', 'ankle', 'foot', 'toe',
            'palm', 'dorsum', 'plantar', 'sole', 'heel',

            # Genital organs
            'genital', 'penis', 'scrotum', 'vulva', 'vagina', 'perineum',
            'perianal', 'anal', 'inguinal',

            # Basic directions
            'left', 'right', 'bilateral', 'unilateral', 'medial', 'lateral',
            'proximal', 'distal', 'anterior', 'posterior', 'superior', 'inferior',
            'central', 'peripheral', 'dorsal', 'ventral', 'radial', 'ulnar',

            # Skin-specific locations
            'dermal', 'epidermal', 'subepidermal', 'subcutaneous', 'deep',
            'superficial', 'intradermal', 'transdermal', 'subdermal',
            'intraepidermal', 'junctional', 'papillary', 'reticular',

            # Distribution descriptions
            'localized', 'generalized', 'regional', 'segmental', 'linear',
            'blaschkoid', 'dermatomal', 'photodistributed', 'acral', 'centripetal',
            'centrifugal', 'ascending', 'descending', 'satellite', 'perilesional'
        }

        # Dermatological anatomical nouns and lesion types
        # Multi-word terms are stored as their individual words (e.g. 'basal',
        # 'cell', 'carcinoma'), so no space-separated phrase is a member of
        # this set.
        self.anatomical_nouns = {
            # Skin structures
            'skin', 'epidermis', 'dermis', 'subcutis', 'hypodermis', 'subcutaneous',
            'tissue', 'layer', 'stratum', 'corneum', 'basale', 'spinosum', 'granulosum',
            'basement', 'membrane', 'papilla', 'rete', 'ridge', 'peg',

            # Skin appendages
            'hair', 'follicle', 'sebaceous', 'gland', 'sweat', 'gland', 'eccrine',
            'apocrine', 'nail', 'matrix', 'bed', 'plate', 'cuticle', 'lunula',

            # Cell types
            'keratinocyte', 'melanocyte', 'langerhans', 'cell', 'merkel', 'cell',
            'fibroblast', 'histiocyte', 'macrophage', 'lymphocyte', 'neutrophil',
            'eosinophil', 'mast', 'cell', 'plasma', 'cell',

            # Basic lesion types
            'lesion', 'lesions', 'macule', 'patch', 'papule', 'nodule', 'plaque',
            'wheal', 'vesicle', 'bulla', 'pustule', 'cyst', 'tumor', 'growth',
            'mass', 'swelling', 'thickening', 'induration', 'infiltration',

            # Surface changes
            'scale', 'scales', 'crust', 'crusts', 'erosion', 'ulcer', 'ulceration',
            'fissure', 'excoriation', 'lichenification', 'atrophy', 'sclerosis',
            'keratosis', 'hyperkeratosis', 'parakeratosis', 'dyskeratosis',

            # Pigment lesions
            'nevus', 'nevi', 'mole', 'melanoma', 'melanocytic', 'lentigo', 'lentigines',
            'cafe-au-lait', 'macule', 'vitiligo', 'albinism', 'melasma', 'chloasma',
            'hyperpigmentation', 'hypopigmentation', 'depigmentation', 'pigmentation',

            # Vascular lesions
            'hemangioma', 'angioma', 'telangiectasia', 'spider', 'angioma', 'petechiae',
            'purpura', 'ecchymosis', 'hematoma', 'thrombosis', 'embolism', 'infarct',
            'vasculitis', 'livedo', 'reticularis', 'erythema', 'flush', 'blush',

            # Infectious diseases
            'cellulitis', 'abscess', 'furuncle', 'carbuncle', 'folliculitis',
            'impetigo', 'erysipelas', 'necrotizing', 'fasciitis', 'gangrene',
            'herpes', 'simplex', 'zoster', 'varicella', 'molluscum', 'contagiosum',
            'wart', 'condyloma', 'verruca', 'tinea', 'candidiasis', 'onychomycosis',

            # Inflammatory diseases
            'dermatitis', 'eczema', 'psoriasis', 'lichen', 'planus', 'sclerosus',
            'scleroderma', 'morphea', 'lupus', 'erythematosus', 'dermatomyositis',
            'vasculitis', 'urticaria', 'angioedema', 'erythema', 'multiforme',
            'nodosum', 'migrans', 'annulare', 'chronicum', 'migrans',

            # Benign tumors
            'fibroma', 'lipoma', 'neurofibroma', 'schwannoma', 'leiomyoma',
            'pilomatrixoma', 'trichoepithelioma', 'syringoma', 'hidradenoma',
            'spiradenoma', 'cylindroma', 'dermatofibroma', 'keloid', 'hypertrophic',
            'scar', 'acrochordon', 'skin', 'tag', 'seborrheic', 'keratosis',

            # Malignant tumors
            'melanoma', 'carcinoma', 'basal', 'cell', 'squamous', 'cell', 'merkel',
            'cell', 'sarcoma', 'lymphoma', 'mycosis', 'fungoides', 'kaposi',
            'sarcoma', 'angiosarcoma', 'leiomyosarcoma', 'fibrosarcoma',
            'dermatofibrosarcoma', 'protuberans', 'metastasis', 'metastases',

            # Precancerous lesions
            'actinic', 'keratosis', 'bowen', 'disease', 'erythroplasia', 'queyrat',
            'leukoplakia', 'dysplasia', 'carcinoma', 'situ', 'melanoma', 'situ',

            # Autoimmune diseases
            'pemphigus', 'pemphigoid', 'dermatitis', 'herpetiformis', 'linear',
            'iga', 'bullous', 'disease', 'epidermolysis', 'bullosa', 'porphyria',
            'cutanea', 'tarda', 'pseudoporphyria',

            # Hereditary diseases
            'ichthyosis', 'xeroderma', 'pigmentosum', 'tuberous', 'sclerosis',
            'neurofibromatosis', 'von', 'recklinghausen', 'gorlin', 'syndrome',
            'basal', 'cell', 'nevus', 'syndrome', 'cowden', 'syndrome',

            # Drug reactions
            'drug', 'eruption', 'fixed', 'drug', 'eruption', 'stevens-johnson',
            'syndrome', 'toxic', 'epidermal', 'necrolysis', 'ten', 'dress',
            'syndrome', 'photosensitivity', 'phototoxicity', 'photoallergy',

            # Trauma and repair
            'wound', 'laceration', 'abrasion', 'contusion', 'burn', 'frostbite',
            'pressure', 'ulcer', 'decubitus', 'ulcer', 'venous', 'ulcer',
            'arterial', 'ulcer', 'diabetic', 'ulcer', 'healing', 'repair',
            'regeneration', 'granulation', 'tissue', 'epithelialization',

            # Cosmetic-related
            'wrinkle', 'rhytid', 'photoaging', 'solar', 'elastosis', 'acne',
            'rosacea', 'melasma', 'age', 'spot', 'freckle', 'ephelis',
            'solar', 'lentigo', 'sebaceous', 'hyperplasia', 'cherry', 'angioma',

            # Imaging features
            'pattern', 'texture', 'architecture', 'morphology', 'echogenicity',
            'vascularity', 'perfusion', 'enhancement', 'signal', 'intensity',
            'contrast', 'uptake', 'distribution', 'symmetry', 'border', 'margin',

            # Measurement assessment
            'size', 'diameter', 'thickness', 'depth', 'width', 'length', 'area',
            'volume', 'measurement', 'dimension', 'breslow', 'thickness', 'clark',
            'level', 'mitotic', 'rate', 'ulceration', 'regression'
        }

        # Dermatology-specific abbreviations
        # NOTE(review): this set also holds ordinary words ('red', 'blue',
        # 'point', 'seven', 'surgery', ...). extract_from_text lower-cases its
        # input and consults this set last, so any such word not claimed by an
        # earlier set is reported as an abbreviation.
        self.dermatological_abbreviations = {
            # Diseases
            'bcc', 'scc', 'mm', 'ak', 'sk', 'df', 'kp', 'lp', 'le', 'dm',
            'ss', 'dcis', 'lcis', 'dfsp', 'mf', 'ks', 'nf', 'ts', 'xp',

            # Examinations
            'dermoscopy', 'rct', 'oct', 'us', 'mri', 'ct', 'pet', 'spect',
            'confocal', 'microscopy', 'biopsy', 'fna', 'shave', 'punch', 'excision',

            # Treatments
            'pdt', 'laser', 'ipl', 'rf', 'cryo', 'electro', 'moh', 'surgery',
            'curettage', 'cautery', 'diathermy', 'topical', 'intralesional',

            # Classifications
            'who', 'tnm', 'ajcc', 'clark', 'breslow', 'figo', 'iss', 'abcd',
            'abcde', 'ugly', 'duckling', 'glasgow', 'seven', 'point', 'checklist',

            # Histology
            'he', 'pas', 'gram', 'afb', 'gms', 'orcein', 'verhoeff', 'mason',
            'trichrome', 'congo', 'red', 'crystal', 'violet', 'toluidine', 'blue',

            # Immunohistochemistry
            'ihc', 'if', 'dif', 'iif', 'elisa', 'ifa', 'pcr', 'fish', 'ish',
            'cd', 'marker', 's100', 'hmb45', 'melan', 'cytokeratin', 'ck',

            # Unit measurements ('mm' is already listed under Diseases)
            'mm', 'cm', 'nm', 'um', 'mg', 'ml', 'spf', 'uv', 'uvb', 'uva',
            'mj', 'med', 'jcm2', 'ppm', 'percent'
        }

    def extract_from_text(self, text):
        """Classify the vocabulary terms that occur in one report.

        The text is lower-cased, punctuation other than hyphens is replaced by
        spaces, and each remaining token is assigned to the first vocabulary
        that contains it (normal adjectives, abnormal adjectives, direction
        terms, anatomical nouns, abbreviations - in that order). Phrases
        returned by ``extract_phrase_patterns`` are then added when they are
        members of the direction or noun vocabularies.

        Args:
            text: Raw report text.

        Returns:
            A dict mapping 'normal_adjectives', 'abnormal_adjectives',
            'direction_terms', 'anatomical_nouns' and 'abbreviations' to the
            set of terms found for that category.
        """
        lowered = text.lower()

        # Keep word characters, whitespace and hyphens; everything else splits tokens
        tokens = re.sub(r'[^\w\s\-]', ' ', lowered).split()

        found = {
            'normal_adjectives': set(),
            'abnormal_adjectives': set(),
            'direction_terms': set(),
            'anatomical_nouns': set(),
            'abbreviations': set(),
        }

        # Precedence matters: a token lands in the first vocabulary that has it
        lookup_order = (
            (self.normal_adjectives, found['normal_adjectives']),
            (self.abnormal_adjectives, found['abnormal_adjectives']),
            (self.direction_terms, found['direction_terms']),
            (self.anatomical_nouns, found['anatomical_nouns']),
            (self.dermatological_abbreviations, found['abbreviations']),
        )

        for raw_token in tokens:
            token = raw_token.strip('-')  # drop hyphens dangling at either end
            if len(token) < 2:
                continue
            if token in self.all_stop_words:
                continue

            for vocabulary, bucket in lookup_order:
                if token in vocabulary:
                    bucket.add(token)
                    break

        # Multi-word / hyphenated candidates
        for candidate in self.extract_phrase_patterns(lowered):
            if candidate in self.direction_terms:
                found['direction_terms'].add(candidate)
            elif candidate in self.anatomical_nouns:
                found['anatomical_nouns'].add(candidate)

        return found

    def extract_phrase_patterns(self, text):
        """Collect hyphenated and multi-word phrase candidates from ``text``.

        Three sources are combined:

        1. every hyphenated token sequence (``word-word[-word...]``);
        2. a fixed list of dermatological multi-word terms;
        3. adjacent whitespace-separated word pairs that are members of
           ``self.direction_terms`` or ``self.anatomical_nouns``.

        Args:
            text: Report text; callers in this class pass it lower-cased, and
                the fixed term list is lower-case.

        Returns:
            A set of phrase strings. Multi-word terms from the fixed list are
            returned with single spaces between words regardless of the
            whitespace used in ``text``.
        """
        phrases = set()

        # Hyphenated combinations
        phrases.update(re.findall(r'\b\w+(?:-\w+)+\b', text))

        # Special dermatological term combinations
        special_patterns = [
            r'basal\s+cell\s+carcinoma',
            r'squamous\s+cell\s+carcinoma',
            r'malignant\s+melanoma',
            r'actinic\s+keratosis',
            r'seborrheic\s+keratosis',
            r'atypical\s+nevus',
            r'cafe-au-lait\s+macule',
            r'cherry\s+angioma',
            r'spider\s+angioma',
            r'skin\s+tag',
            r'solar\s+lentigo',
            r'age\s+spot',
            r'liver\s+spot',
            r'melanocytic\s+nevus',
            r'compound\s+nevus',
            r'junctional\s+nevus',
            r'intradermal\s+nevus',
            r'dermal\s+nevus',
            r'blue\s+nevus',
            r'spitz\s+nevus',
            r'halo\s+nevus',
            r'congenital\s+nevus',
            r'dysplastic\s+nevus',
            r'atypical\s+mole',
            r'sebaceous\s+gland',
            r'hair\s+follicle',
            r'sweat\s+gland',
            r'nail\s+bed',
            r'nail\s+matrix'
        ]

        for pattern in special_patterns:
            # Leading \b: the term must start at a word boundary, otherwise
            # 'age spot' fires inside "damage spotted" and 'dermal nevus'
            # inside "intradermal nevus". No trailing \b, so plural forms
            # ("skin tags") still yield the singular term as before.
            for match in re.findall(r'\b' + pattern, text):
                # \s+ may have matched several spaces or a newline; collapse
                # to one space so the same term always has the same spelling.
                phrases.add(' '.join(match.split()))

        # Two-word combinations
        words = text.split()
        for first, second in zip(words, words[1:]):
            phrase = f"{first} {second}"
            # Only add combinations present in predefined vocabularies
            if phrase in self.direction_terms or phrase in self.anatomical_nouns:
                phrases.add(phrase)

        return phrases

    def extract_no_expressions(self, text):
        """Extract dermatology-related 'no ...' (negation) expressions.

        Two mechanisms are used on the lower-cased text:

        * open-ended patterns such as "no evidence of <words>", whose capture
          is whitespace-normalised and truncated to its first six words;
        * a fixed list of phrases (e.g. 'no melanoma'), each reported when it
          occurs in the text.

        In both cases ``no`` must begin at a word boundary, so words that
        merely end in "no" (e.g. "albino skin lesion") are not treated as
        negations. The fixed phrases are still matched as prefixes, so
        'no skin lesion' is reported for "no skin lesions".

        Args:
            text: Raw report text.

        Returns:
            A set of negation expression strings.
        """
        text_lower = text.lower()
        no_expressions = set()

        # Basic 'no' patterns
        basic_no_patterns = [
            r'\bno\s+evidence\s+of\s+[\w\s]+',
            r'\bno\s+signs\s+of\s+[\w\s]+',
            r'\bno\s+obvious\s+[\w\s]+',
            r'\bno\s+visible\s+[\w\s]+',
            r'\bno\s+significant\s+[\w\s]+',
            r'\bno\s+abnormal\s+[\w\s]+',
            r'\bno\s+suspicious\s+[\w\s]+',
            r'\bno\s+malignant\s+[\w\s]+',
            r'\bno\s+pigmented\s+[\w\s]+',
            r'\bno\s+atypical\s+[\w\s]+',
        ]

        max_words = 6
        for pattern in basic_no_patterns:
            for match in re.findall(pattern, text_lower):
                # Always rebuild from the split words: this both limits the
                # length and collapses runs of whitespace, so the same
                # expression is counted under a single spelling.
                no_expressions.add(' '.join(match.split()[:max_words]))

        # Dermatology-specific 'no' expressions
        specific_no_patterns = [
            'no evidence', 'no signs', 'no obvious', 'no visible', 'no apparent',
            'no significant', 'no abnormal', 'no suspicious', 'no concerning',
            'no worrisome', 'no definite', 'no clear', 'no atypical',

            # Malignancy-related
            'no malignancy', 'no malignant lesion', 'no malignant features',
            'no melanoma', 'no carcinoma', 'no cancer', 'no tumor',
            'no neoplasm', 'no suspicious lesion', 'no atypical lesion',
            'no malignant transformation', 'no dysplasia', 'no atypia',

            # Pigment lesions
            'no pigmented lesion', 'no melanocytic lesion', 'no atypical nevus',
            'no dysplastic nevus', 'no suspicious nevus', 'no changing nevus',
            'no new nevus', 'no evolving lesion', 'no irregular pigmentation',

            # Inflammation and infection
            'no inflammation', 'no infection', 'no cellulitis', 'no abscess',
            'no erythema', 'no swelling', 'no edema', 'no purulence',
            'no suppuration', 'no ulceration', 'no necrosis', 'no gangrene',

            # Vascular lesions
            'no vascular lesion', 'no hemangioma', 'no angioma', 'no telangiectasia',
            'no purpura', 'no petechiae', 'no ecchymosis', 'no hematoma',
            'no thrombosis', 'no vasculitis', 'no livedo', 'no blanching',

            # Allergic reactions
            'no allergic reaction', 'no drug reaction', 'no contact dermatitis',
            'no eczema', 'no dermatitis', 'no urticaria', 'no angioedema',
            'no hypersensitivity', 'no irritation', 'no sensitization',

            # Infectious diseases
            'no fungal infection', 'no bacterial infection', 'no viral infection',
            'no tinea', 'no candidiasis', 'no herpes', 'no warts', 'no molluscum',
            'no impetigo', 'no folliculitis', 'no furuncle', 'no carbuncle',

            # Autoimmune diseases
            'no autoimmune disease', 'no lupus', 'no dermatomyositis',
            'no scleroderma', 'no morphea', 'no lichen planus', 'no pemphigus',
            'no pemphigoid', 'no connective tissue disease',

            # Surface changes
            'no scaling', 'no crusting', 'no erosion',
            'no excoriation', 'no lichenification', 'no atrophy', 'no sclerosis',
            'no hyperkeratosis', 'no parakeratosis', 'no acantholysis',

            # Distribution patterns
            'no symmetric distribution', 'no bilateral distribution',
            'no dermatomal distribution', 'no linear distribution',
            'no photodistribution', 'no satellite lesions', 'no koebner phenomenon',

            # Special signs
            'no nikolsky sign', 'no auspitz sign',
            'no darier sign', 'no wickham striae', 'no wickham papules',
            'no herald patch', 'no target lesions', "no bull's eye lesions",

            # Functional abnormalities
            'no pruritus', 'no pain', 'no burning', 'no stinging', 'no numbness',
            'no tingling', 'no tenderness', 'no hyperesthesia', 'no hypoesthesia',
            'no anesthesia', 'no paresthesia',

            # General expressions
            'no skin lesion', 'no skin abnormality', 'no cutaneous lesion',
            'no dermatologic abnormality', 'no pathologic changes',
            'no significant findings', 'no remarkable findings',
            'no active disease', 'no acute changes', 'no interval changes'
        ]

        for phrase in specific_no_patterns:
            # Cheap substring test first; the regex then confirms that the
            # leading 'no' is a whole word and not the tail of another word.
            if phrase in text_lower and re.search(r'\b' + re.escape(phrase), text_lower):
                no_expressions.add(phrase)

        return no_expressions

    def process_reports(self, reports):
        """Tally vocabulary across all reports and build the final vocabulary.

        Reports that are empty or shorter than 10 characters after stripping
        are skipped. For every other report the per-category terms and the
        'no' expressions are extracted and counted once per report in which
        they occur. Progress and summary statistics are printed.

        Args:
            reports: Sequence of report texts.

        Returns:
            The vocabulary dict produced by ``generate_vocabulary_dict``.
        """
        total = len(reports)
        print(f"🔍 Processing {total} dermatological image reports...")

        # One running counter per extraction category
        category_keys = (
            'normal_adjectives',
            'abnormal_adjectives',
            'direction_terms',
            'anatomical_nouns',
            'abbreviations',
        )
        tallies = {key: Counter() for key in category_keys}
        no_expression_tally = Counter()

        for position, report in enumerate(reports, start=1):
            if report and len(report.strip()) >= 10:
                per_report = self.extract_from_text(report)
                for key in category_keys:
                    tallies[key].update(per_report[key])

                no_expression_tally.update(self.extract_no_expressions(report))

                # Progress is only reported from reports that were processed
                if position % 100 == 0:
                    print(f"  Processed {position}/{total} reports...")

        print(f"\n📊 Extraction result statistics:")
        summary_rows = (
            ("Normal adjectives", tallies['normal_adjectives']),
            ("Abnormal adjectives", tallies['abnormal_adjectives']),
            ("Direction terms", tallies['direction_terms']),
            ("Anatomical nouns", tallies['anatomical_nouns']),
            ("Abbreviations", tallies['abbreviations']),
            ("No expressions", no_expression_tally),
        )
        for label, tally in summary_rows:
            print(f"  {label}: {len(tally)} types")

        return self.generate_vocabulary_dict(
            tallies['normal_adjectives'],
            tallies['abnormal_adjectives'],
            tallies['direction_terms'],
            tallies['anatomical_nouns'],
            tallies['abbreviations'],
            no_expression_tally,
        )

    def generate_vocabulary_dict(self, normal_adj_counts, abnormal_adj_counts,
                                 direction_counts, noun_counts, abbr_counts, no_expr_counts):
        """Build the output vocabulary from per-category frequency counters.

        For each category only the most frequent entries are considered, and
        of those only the ones reaching the category's minimum count are kept
        (in descending frequency order). The combination and prompt lists are
        then filled in by ``generate_combinations`` and
        ``generate_suggested_prompts``.

        Returns:
            A dict with the keys 'normal_adjectives', 'abnormal_adjectives',
            'direction_terms', 'anatomical_nouns', 'abbreviations',
            'no_expressions', 'high_priority_combinations' and
            'suggested_prompts', each mapping to a list of strings.
        """
        # (output key, counter, top-N entries considered, minimum frequency)
        selection_rules = (
            ('normal_adjectives', normal_adj_counts, 20, 2),
            ('abnormal_adjectives', abnormal_adj_counts, 30, 3),
            ('direction_terms', direction_counts, 50, 1),
            ('anatomical_nouns', noun_counts, 40, 5),
            ('abbreviations', abbr_counts, 20, 2),
            ('no_expressions', no_expr_counts, 30, 2),
        )

        vocabulary = {}
        for key, counts, top_n, min_count in selection_rules:
            vocabulary[key] = [
                term for term, freq in counts.most_common(top_n) if freq >= min_count
            ]

        # Placeholders, filled in place by the two generator methods below
        vocabulary['high_priority_combinations'] = []
        vocabulary['suggested_prompts'] = []

        self.generate_combinations(vocabulary)
        self.generate_suggested_prompts(vocabulary)

        return vocabulary

    def generate_combinations(self, vocabulary):
        """Generate high-priority vocabulary combinations.

        Combines, in this order:

        1. up to 15 "<abnormal adjective> <anatomical noun>" pairs built from
           the top 10 adjectives and top 15 nouns;
        2. curated "<structure> <pathology>" pairs whose parts are present in
           the collected vocabulary.

        Duplicate strings are dropped (first occurrence wins) and a term is
        never paired with itself, so entries such as "lesion lesion" are not
        produced. At most 20 combinations are kept.

        Args:
            vocabulary: Vocabulary dict; reads 'abnormal_adjectives' and
                'anatomical_nouns' and overwrites
                'high_priority_combinations' in place.
        """
        max_adjective_noun_pairs = 15
        max_combinations = 20

        # Abnormal adjectives + anatomical nouns
        top_adjectives = vocabulary['abnormal_adjectives'][:10]
        top_nouns = vocabulary['anatomical_nouns'][:15]
        combinations = [
            f"{adj} {noun}" for adj in top_adjectives for noun in top_nouns
        ][:max_adjective_noun_pairs]

        # Dermatology-specific combinations
        skin_structures = ['skin', 'lesion', 'nevus', 'mole', 'pigmented']
        skin_pathologies = ['melanoma', 'carcinoma', 'lesion', 'tumor', 'abnormality']

        known_nouns = set(vocabulary['anatomical_nouns'])
        known_terms = known_nouns.union(vocabulary['abnormal_adjectives'])

        for structure in skin_structures:
            if structure not in known_terms:
                continue
            for pathology in skin_pathologies:
                # 'lesion' is in both lists; pairing it with itself is noise
                if pathology in known_nouns and pathology != structure:
                    combinations.append(f"{structure} {pathology}")

        # dict.fromkeys removes duplicates while preserving first-seen order
        unique_combinations = list(dict.fromkeys(combinations))
        vocabulary['high_priority_combinations'] = unique_combinations[:max_combinations]

    def generate_suggested_prompts(self, vocabulary):
        """Generate suggested detection prompts.

        Prompts are built, in this order, from:

        1. curated "<structure> <pathology>" pairs (only the first two
           pathologies are used) whose parts are present in the collected
           vocabulary;
        2. "<abnormal adjective> <anatomical noun>" pairs from the top five
           adjectives and top five nouns.

        Duplicate prompts are dropped (first occurrence wins) and at most 15
        prompts are kept.

        Args:
            vocabulary: Vocabulary dict; reads 'abnormal_adjectives' and
                'anatomical_nouns' and overwrites 'suggested_prompts' in
                place.
        """
        max_prompts = 15

        # Skin lesion combinations
        structures = ['skin', 'lesion', 'nevus', 'mole', 'pigmented', 'melanocytic']
        pathologies = ['melanoma', 'carcinoma', 'lesion', 'tumor', 'abnormality', 'dysplasia']

        known_nouns = set(vocabulary['anatomical_nouns'])
        known_terms = known_nouns.union(vocabulary['abnormal_adjectives'])

        prompts = [
            f"{structure} {pathology}"
            for structure in structures
            if structure in known_terms
            for pathology in pathologies[:2]
            if pathology in known_nouns
        ]

        # High-frequency abnormal adjective combinations
        top_abnormal_adj = vocabulary['abnormal_adjectives'][:5]
        top_nouns = vocabulary['anatomical_nouns'][:5]
        prompts.extend(f"{adj} {noun}" for adj in top_abnormal_adj for noun in top_nouns)

        # A curated structure such as 'pigmented' can also be a top adjective,
        # which would otherwise yield the same prompt twice.
        vocabulary['suggested_prompts'] = list(dict.fromkeys(prompts))[:max_prompts]

    def print_vocabulary_for_code(self, vocabulary):
        """Print the vocabulary as a Python dict entry that can be pasted into code.

        Each non-empty category is printed as a list literal, eight items per
        line. Items are rendered with ``repr`` so that entries containing an
        apostrophe (e.g. "no bull's eye lesions") are still valid Python
        string literals.

        Args:
            vocabulary: Vocabulary dict containing all eight category keys
                listed in ``categories`` below.
        """
        print("\n" + "=" * 100)
        print(
            "The following content can be directly copied into grounding_dino_medical_enhanced.py to replace the 'Dermatological' section:")
        print("=" * 100)
        print()
        print("'Dermatological': {")

        # Print each category
        categories = [
            ('normal_adjectives', 'normal adjectives'),
            ('abnormal_adjectives', 'abnormal adjectives'),
            ('direction_terms', 'direction terms'),
            ('anatomical_nouns', 'anatomical nouns'),
            ('abbreviations', 'abbreviations'),
            ('high_priority_combinations', 'high-priority combinations'),
            ('suggested_prompts', 'suggested prompts'),
            ('no_expressions', 'no-related expressions')
        ]

        items_per_line = 8
        for key, desc in categories:
            items = vocabulary[key]
            if not items:
                continue

            print(f"    '{key}': [  # {desc}")

            # repr() picks a quote style that keeps the literal valid
            formatted_items = [repr(item) for item in items]

            # Print line by line, maximum 8 items per line
            last_start = len(formatted_items) - 1
            for start in range(0, len(formatted_items), items_per_line):
                line_items = formatted_items[start:start + items_per_line]
                # Every line except the last one ends with a separating comma
                is_last_line = start + items_per_line > last_start
                print("        " + ", ".join(line_items) + ("" if is_last_line else ","))

            print("    ],")

        print("},")
        print()
        print("=" * 100)


def main():
    """Collect the dermatological vocabulary from the dataset and save it.

    Steps:
      1. Load ``<dataset_root>/<target_folder>/<target_folder>_en.jsonl``.
      2. Keep the 'report' field of every record whose report is a string
         longer than 10 characters after stripping.
      3. Extract and print the vocabulary.
      4. Write the result to ``category_result/`` as JSON and as a Python
         source snippet.

    The function returns early (after printing a message) when the data file
    is missing, cannot be loaded, or contains no usable reports.
    """
    # Local import: only this entry point needs the current date.
    from datetime import date

    dataset_root = "/root/autodl-tmp/dataset"
    target_folder = "Dermatological Imaging"

    print("🚀 Dermatological image vocabulary collector started...")
    print(f"📂 Target category: {target_folder}")

    # Initialize collector
    collector = DermatologicalVocabularyCollector()

    # Construct file path
    folder_path = os.path.join(dataset_root, target_folder)
    jsonl_file = os.path.join(folder_path, f"{target_folder}_en.jsonl")

    if not os.path.exists(jsonl_file):
        print(f"❌ File not found: {jsonl_file}")
        return

    print(f"📄 Loading data file: {jsonl_file}")

    # Load data
    data = load_jsonl(jsonl_file)
    if not data:
        print("❌ Failed to load data or data is empty")
        return

    print(f"📊 Successfully loaded {len(data)} records")

    # Extract report texts; records that are not dicts or whose 'report' is
    # not a string are skipped instead of crashing on .strip().
    reports = [
        item['report']
        for item in data
        if isinstance(item, dict)
        and isinstance(item.get('report'), str)
        and len(item['report'].strip()) > 10
    ]

    print(f"📋 Extracted {len(reports)} valid reports")

    if not reports:
        print("❌ No valid report texts found")
        return

    # Process reports and generate vocabulary
    vocabulary = collector.process_reports(reports)

    # Print detailed statistics
    print("\n📈 Final vocabulary statistics:")
    for key, items in vocabulary.items():
        if key.endswith(('_adjectives', '_terms', '_nouns')):
            print(f"  {key}: {len(items)} items")

    # Print partial content preview
    print("\n🔍 Content preview:")
    preview_keys = ['normal_adjectives', 'abnormal_adjectives', 'direction_terms', 'anatomical_nouns']
    for key in preview_keys:
        if key in vocabulary:
            items = vocabulary[key][:10]
            print(f"  {key}[:10]: {items}")

    # Print copyable code format
    collector.print_vocabulary_for_code(vocabulary)

    # Create output directory
    output_dir = "category_result"
    os.makedirs(output_dir, exist_ok=True)

    # Save as JSON format
    json_file = os.path.join(output_dir, "Dermatological_vocabulary.json")

    # Add statistics to vocabulary
    vocabulary_with_stats = {
        "category": "Dermatological",
        "total_reports_processed": len(reports),
        # Record the day the extraction actually ran (ISO 8601, YYYY-MM-DD)
        "extraction_date": date.today().isoformat(),
        "vocabulary": vocabulary,
        "statistics": {
            key: len(items) for key, items in vocabulary.items()
        }
    }

    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(vocabulary_with_stats, f, ensure_ascii=False, indent=2)

    # Also save in Python format (for direct copying)
    py_file = os.path.join(output_dir, "Dermatological_vocabulary.py")
    items_per_line = 8
    with open(py_file, 'w', encoding='utf-8') as f:
        f.write(
            "# Enhanced dermatological image vocabulary - can be directly copied to grounding_dino_medical_enhanced.py\n\n")
        f.write("enhanced_dermatological_vocabulary = {\n")
        f.write("    'Dermatological': {\n")

        for key, items in vocabulary.items():
            if not items:
                continue

            f.write(f"        '{key}': [  # {len(items)} items\n")

            # repr() keeps entries containing an apostrophe valid Python
            formatted_items = [repr(item) for item in items]

            for i in range(0, len(formatted_items), items_per_line):
                line_items = formatted_items[i:i + items_per_line]
                if i + items_per_line >= len(formatted_items):
                    # Last line
                    f.write("            " + ", ".join(line_items) + "\n")
                else:
                    # Not last line
                    f.write("            " + ", ".join(line_items) + ",\n")

            f.write("        ],\n")

        f.write("    }\n")
        f.write("}\n")

        # Add statistics
        f.write("\n# Statistics:\n")
        for key, items in vocabulary.items():
            f.write(f"# {key}: {len(items)} items\n")
        f.write(f"# Total reports processed: {len(reports)}\n")

    print("\n💾 Vocabulary saved to:")
    print(f"  📄 JSON format: {json_file}")
    print(f"  🐍 Python format: {py_file}")
    print("🎉 Dermatological image vocabulary collection completed!")
    print(f"\n📁 Results saved in {output_dir} folder")
    print("📋 After collecting all 12 categories, you can use these JSON files to generate a complete vocabulary")


# Run the collector only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()