              
import json
import os
import argparse
import sys
from pathlib import Path
from typing import List, Dict, Any, Optional
import time
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from openai import OpenAI


class RoleCharacteristicExtractor:
                 

    def __init__(self, api_key: str = None, base_url: str = None, batch_size: int = 5, concurrent_limit: int = 3):       
        self.batch_size = batch_size
        self.concurrent_limit = concurrent_limit
        self.client = None
        self.executor = ThreadPoolExecutor(max_workers=concurrent_limit)

        self.setup_openai_client(api_key, base_url)

    def setup_openai_client(self, api_key: str = None, base_url: str = None):
                         
        try:
            self.client = OpenAI(
                api_key=api_key or "{your api_key}",
                base_url=base_url or "{your base_url}"
            )
        except Exception as e:
            self.client = None

    def load_question_data(self, file_path: str) -> List[Dict[str, Any]]:
        if not os.path.exists(file_path):
            return []

        question_data = []
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    try:
                        if line.strip():
                            data = json.loads(line.strip())
                            question_data.append(data)
                    except json.JSONDecodeError as e:
                        continue

            return question_data

        except Exception as e:
            return []

    def extract_role_characteristics(self, content: str) -> Optional[Dict[str, Any]]:
        if not self.client:
            return None

                  
        prompt = self._build_english_prompt(content)
        system_message = "You are a professional user profiling analyst, skilled at analyzing gaming background, skill level, and personal characteristics from user questions. Please return results strictly in JSON format."

        try:
            response = self.client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1000,
                temperature=0.3
            )

            result = response.choices[0].message.content.strip()
            return self._parse_extraction_result(result, content)

        except Exception as e:
            return None

    def _build_english_prompt(self, content: str) -> str:
        """Return the English user prompt that asks the model to profile a player.

        The question text is embedded verbatim under ``User Question:``. The
        prompt asks for a JSON object with ``player_description`` and
        ``confidence_score``, and for a null description when nothing can be
        inferred.
        """
        prompt_lines = [
            "",
            "Please analyze the following gaming player's question and generate a concise player background description.",
            "",
            "User Question:",
            f"{content}",
            "",
            "Based on the question content, write a 50-100 word player background description that describes the player's gaming experience, skill level, interests and preferences.",
            "",
            "Requirements:",
            "1. The description should be natural and fluent, like a brief introduction of a person",
            "2. Only make reasonable inferences based on the question content, don't over-interpret",
            "3. If the question is too simple or contains no personal information, return null",
            "4. Use second-person description",
            "",
            "Please return in JSON format:",
            "{",
            '  "player_description": "You are a player who...",',
            '  "confidence_score": 0.8',
            "}",
            "",
            "If no meaningful player information can be inferred from the question, please return:",
            "{",
            '  "player_description": null,',
            '  "confidence_score": 0.0',
            "}",
            "",
        ]
        # The empty first and last entries give the leading and trailing newline.
        return "\n".join(prompt_lines)

    def _parse_extraction_result(self, result: str, original_content: str) -> Optional[Dict[str, Any]]:
        """Turn the model's reply into a role record.

        The JSON object is taken as the span from the first ``{`` to the last
        ``}`` of ``result``, so Markdown code fences or explanatory text around
        the object do not prevent parsing.

        Args:
            result: Raw text returned by the model.
            original_content: The question the reply refers to; its first 100
                characters (followed by ``...`` when truncated) are stored as
                ``source_content``.

        Returns:
            A dict with ``player_description``, ``confidence_score`` and
            ``source_content``; or ``None`` when the reply holds no JSON
            object, is invalid JSON, or has no non-empty string description.
        """
        if not isinstance(result, str):
            return None

        start = result.find('{')
        end = result.rfind('}')
        if start == -1 or end < start:
            return None

        try:
            parsed_result = json.loads(result[start:end + 1])
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; malformed replies yield None.
            return None

        if not isinstance(parsed_result, dict):
            return None

        player_description = parsed_result.get('player_description')
        if not isinstance(player_description, str) or not player_description.strip():
            return None

        # Keep numeric scores untouched; coerce anything else (e.g. "0.8") so
        # later numeric comparisons on the stored value cannot fail.
        confidence_score = parsed_result.get('confidence_score', 0.0)
        if isinstance(confidence_score, bool) or not isinstance(confidence_score, (int, float)):
            try:
                confidence_score = float(confidence_score)
            except (TypeError, ValueError):
                confidence_score = 0.0

        return {
            'player_description': player_description,
            'confidence_score': confidence_score,
            'source_content': original_content[:100] + ('...' if len(original_content) > 100 else ''),
        }

    def _extract_single_role(self, data: Dict[str, Any], index: int, verbose: bool = False) -> Optional[Dict[str, Any]]:
                              
        content = data.get('content_original', '')

        if not content or len(content.strip()) < 10:
            if verbose:
                print(f"question {index} content is too short, skip")
            return None
    
        role_info = self.extract_role_characteristics(content)

        if role_info:
                         
            role_info['source_question_id'] = data.get('id', f'question_{index}')
            role_info['source_topic'] = data.get('topic', '')
            role_info['source_date'] = data.get('date', '')

            if verbose:
                confidence = role_info.get('confidence_score', 0)
                description = role_info.get('player_description', '')[:50]
            return role_info
        else:
            if verbose:
                print(f"extracting role for question {index} failed")
            return None

    def batch_extract_roles(self, question_data: List[Dict[str, Any]],
                            output_file: str, verbose: bool = False) -> int:   

        extracted_roles = []
        success_count = 0

                  
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

               
        pbar = tqdm(total=len(question_data), desc="Extracting role features")

                   
        for i in range(0, len(question_data), self.concurrent_limit):
            batch = question_data[i:i + self.concurrent_limit]
            batch_indices = list(range(i, min(i + self.concurrent_limit, len(question_data))))

                       
            futures = []
            for j, data in enumerate(batch):
                future = self.executor.submit(
                    self._extract_single_role,
                    data,
                    batch_indices[j],
                    verbose
                )
                futures.append(future)

                  
            batch_results = []
            for future in futures:
                try:
                    result = future.result(timeout=60)         
                    if result:
                        batch_results.append(result)
                        success_count += 1
                except Exception as e:
                    if verbose:
                        print(f"extracting role for question failed: {e}")
                pbar.update(1)

                    
            if batch_results:
                extracted_roles.extend(batch_results)
                self._append_to_file(batch_results, output_file)
                if verbose:
                    print(f"Extracting role for question failed")

        pbar.close()


        return success_count

    def _append_to_file(self, roles: List[Dict[str, Any]], output_file: str):
                         
        with open(output_file, 'a', encoding='utf-8') as f:
            for role in roles:
                f.write(json.dumps(role, ensure_ascii=False) + '\n')

    def analyze_extracted_roles(self, output_file: str):
                       
        if not os.path.exists(output_file):
            return

        roles = []
        with open(output_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    try:
                        roles.append(json.loads(line.strip()))
                    except BaseException:
                        continue

        if not roles:
            return


               
        confidences = [role.get('confidence_score', 0) for role in roles if role.get('confidence_score')]
        if confidences:
            avg_confidence = sum(confidences) / len(confidences)
            high_confidence_count = len([c for c in confidences if c > 0.7])

                
        descriptions = [role.get('player_description', '') for role in roles if role.get('player_description')]
        if descriptions:
            desc_lengths = [len(desc) for desc in descriptions]
            avg_length = sum(desc_lengths) / len(desc_lengths)

                  
        for i, role in enumerate(roles[:5]):
            description = role.get('player_description', '')
            confidence = role.get('confidence_score', 0)

                       
        if len(roles) > 5:
            high_quality_roles = sorted(roles, key=lambda x: x.get('confidence_score', 0), reverse=True)[:3]
            for i, role in enumerate(high_quality_roles):
                description = role.get('player_description', '')
                confidence = role.get('confidence_score', 0)


def parse_args():
    """Build the command-line parser and parse the process arguments.

    Returns:
        The ``argparse.Namespace`` with ``input``, ``output``, ``batch_size``,
        ``concurrent_limit``, ``api_key``, ``base_url``, ``verbose`` and
        ``analyze``.
    """
    usage_examples = """
Usage examples:
  python role_chart_extraction.py --input question_data_ch.jsonl --output roles_ch.jsonl
  python role_chart_extraction.py --input question_data_en.jsonl --output roles_en.jsonl --verbose
  python role_chart_extraction.py --input /path/to/data.jsonl --batch_size 3 --analyze
        """

    cli = argparse.ArgumentParser(
        description="Extract role characteristics from gaming player questions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # (flag, add_argument keyword arguments), in the order they appear in --help.
    option_specs = (
        ('--input', dict(type=str, required=True,
                         help="Input question data file path (JSONL format)")),
        ('--output', dict(type=str, default=None,
                          help="Output role file path, defaults to input_filename_roles.jsonl")),
        ('--batch_size', dict(type=int, default=5,
                              help="Batch save size, default 5")),
        ('--concurrent_limit', dict(type=int, default=3,
                                    help="Concurrent request limit, default 3")),
        ('--api_key', dict(type=str, default=None,
                           help="OpenAI API key")),
        ('--base_url', dict(type=str, default=None,
                            help="OpenAI API base URL")),
        ('--verbose', dict(action='store_true',
                           help="Show detailed processing information")),
        ('--analyze', dict(action='store_true',
                           help="Analyze role characteristic distribution after processing")),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    return cli.parse_args()


def main():
    """Command-line entry point: load questions, extract roles, report.

    Exits with status 1 (and a message on stderr) when the input file is
    missing, the API client could not be created, no question could be loaded,
    or an overwrite confirmation is needed but stdin is not interactive.
    Exits with status 0 when the user declines to overwrite an existing
    output file.
    """
    args = parse_args()

    if not os.path.exists(args.input):
        print(f"Input file not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    # Default output sits next to the input: <input stem>_roles.jsonl
    if args.output is None:
        input_path = Path(args.input)
        args.output = str(input_path.parent / f"{input_path.stem}_roles.jsonl")

    extractor = RoleCharacteristicExtractor(
        api_key=args.api_key,
        base_url=args.base_url,
        batch_size=args.batch_size,
        concurrent_limit=args.concurrent_limit
    )

    if not extractor.client:
        print("OpenAI client could not be initialised", file=sys.stderr)
        sys.exit(1)

    question_data = extractor.load_question_data(args.input)
    if not question_data:
        print(f"No question records could be loaded from {args.input}", file=sys.stderr)
        sys.exit(1)

    # Results are appended, so an existing file must be removed or kept as is.
    if os.path.exists(args.output):
        try:
            overwrite = input(f"\nOutput file already exists: {args.output}\nOverwrite? (y/n): ").strip().lower()
        except EOFError:
            print("Cannot confirm overwrite: stdin is not interactive", file=sys.stderr)
            sys.exit(1)
        if overwrite in ('y', 'yes'):
            os.remove(args.output)
        else:
            print("Keeping the existing output file; nothing was processed.")
            sys.exit(0)

    start_time = time.time()

    success_count = extractor.batch_extract_roles(
        question_data,
        args.output,
        verbose=args.verbose
    )

    processing_time = time.time() - start_time

    print(f"Extracted {success_count} roles from {len(question_data)} questions in {processing_time:.1f}s")
    print(f"Output file: {args.output}")

    if args.analyze and success_count > 0:
        extractor.analyze_extracted_roles(args.output)


# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
