import pandas as pd
import csv
import json
import re
from tqdm import tqdm
import time
from langdetect import detect, DetectorFactory, LangDetectException
from tools import initialize_absa_model, infer_review_with_local_model
from model_loader import model_manager
import os

# Fix langdetect's seed so that language detection gives consistent results
# from one run to the next (keeps the English filter reproducible).
DetectorFactory.seed = 0

class ReviewDatasetLabeler:
    def __init__(self, input_csv_path, output_csv_path="absa_labeled_dataset.csv", cleaned_csv_path=None):
        """Remember the file locations used by the labeling pipeline.

        Args:
            input_csv_path: Path of the raw review CSV.
            output_csv_path: Path the labeled CSV is written to.
            cleaned_csv_path: Path of the cleaned-data cache. When omitted it
                defaults to ``<input file name without extension>_cleaned.csv``
                (a bare file name, without the input's directory).
        """
        self.model_loaded = False
        self.input_csv_path = input_csv_path
        self.output_csv_path = output_csv_path

        if cleaned_csv_path is not None:
            self.cleaned_csv_path = cleaned_csv_path
        else:
            # Derive the cache name from the input file: drop directory and extension.
            file_name = os.path.basename(input_csv_path)
            stem, _extension = os.path.splitext(file_name)
            self.cleaned_csv_path = f"{stem}_cleaned.csv"
        
    def is_english(self, text):
        """Return True when ``text`` looks like English (deliberately lenient).

        Checks, in order:
          1. Non-string input (None, NaN, numbers) is never English.
          2. Text shorter than 3 characters after stripping is accepted only
             if it contains one of a few English indicator words.
          3. Text whose ``detect()`` result is English, Welsh or Irish is
             accepted.
          4. Otherwise the text is accepted when more than 30% of its
             whitespace-separated tokens are common English words.
          5. If any step above raises, the text is accepted only when it
             matches one of a few regexes of common English words.

        Args:
            text: Review text to classify.

        Returns:
            bool: True if the text is treated as English, False otherwise.
        """
        # Guard first: the fallback handler below also calls str methods, so a
        # non-string value would otherwise raise from inside the except block.
        if not isinstance(text, str):
            return False

        text_lower = text.lower()

        try:
            if len(text.strip()) < 3:  # Too short to detect reliably
                # Assume short text is English if it contains a common English word
                english_indicators = (
                    'good', 'bad', 'great', 'nice', 'love', 'hate',
                    'app', 'game', 'ok', 'fine', 'cool', 'wow',
                )
                return any(word in text_lower for word in english_indicators)

            # Be lenient - accept English and languages it is often confused with
            accepted_langs = {'en', 'cy', 'ga'}  # English, Welsh, Irish
            if detect(text) in accepted_langs:
                return True

            # Mixed content: keep the text if enough tokens are common English words
            english_words = {
                'the', 'and', 'is', 'app', 'good', 'bad', 'great', 'nice',
                'game', 'this', 'that', 'very', 'really', 'like', 'love', 'hate',
            }
            # Strip surrounding punctuation so tokens such as "good!" or
            # "(nice)" still count as English words.
            tokens = [token.strip('.,!?;:"\'()[]') for token in text_lower.split()]
            english_word_count = sum(1 for token in tokens if token in english_words)

            # If >30% of words are common English words, consider it English
            return bool(tokens) and (english_word_count / len(tokens)) > 0.3

        except Exception:
            # Detection failed (this also covers LangDetectException). Fall
            # back to basic English patterns; with no match the text is
            # treated as non-English.
            english_patterns = (
                r'\b(the|and|is|app|good|bad|great|nice|game|this|that)\b',
                r'\b(very|really|like|love|hate|awesome|terrible)\b',
                r'\b(work|working|works|download|update|bug|crash)\b',
            )
            return any(re.search(pattern, text_lower) for pattern in english_patterns)
        
    def clean_review_text(self, text):
        """Normalize a raw review and reject values with no usable text.

        Whitespace runs (including newlines) are collapsed to single spaces
        and the ends are trimmed. The review is rejected when it is not a
        string, is shorter than 2 characters after normalization, contains no
        ASCII letter, or has fewer than 10% ASCII letters/digits.

        Args:
            text: Raw cell value from the review column.

        Returns:
            str | None: The normalized review, or None if it was rejected.
        """
        # Checking the type first covers None/NaN and avoids calling pd.isna()
        # on list-like values, whose array result cannot be used as a boolean.
        if not isinstance(text, str):
            return None

        # Remove extra whitespace and newlines only
        text = re.sub(r'\s+', ' ', text.strip())

        # Drop empty and single-character reviews
        if len(text) < 2:
            return None

        # Drop reviews without any ASCII letter (pure numbers, punctuation, symbols)
        if not re.search(r'[a-zA-Z]', text):
            return None

        # Drop reviews where fewer than 10% of the characters are letters/digits
        alphanumeric_count = len(re.findall(r'[a-zA-Z0-9]', text))
        if alphanumeric_count / len(text) < 0.1:
            return None

        return text
    
    def clean_and_save_dataset(self, force_recreate=False):
        """Clean the dataset and save to local file - ONLY RUNS ONCE"""
        # Check if cleaned file already exists
        if os.path.exists(self.cleaned_csv_path) and not force_recreate:
            print(f"✅ Cleaned dataset already exists: {self.cleaned_csv_path}")
            cleaned_df = pd.read_csv(self.cleaned_csv_path)
            print(f"📊 Loaded {len(cleaned_df)} cleaned reviews from cache")
            return cleaned_df
        
        print("📊 Starting dataset cleaning and language detection...")
        print("⚠️  This will only be done once and saved for future use")
        
        try:
            # Load the CSV file
            df = pd.read_csv(self.input_csv_path)
            print(f"Loaded {len(df)} reviews from {self.input_csv_path}")
            
            # Identify the review text column
            review_columns = ['review', 'review_description', 'content', 'text', 'review_text']
            review_col = None
            
            for col in review_columns:
                if col in df.columns:
                    review_col = col
                    break
            
            if review_col is None:
                print("Available columns:", df.columns.tolist())
                review_col = input("Enter the column name containing review text: ")
            
            print(f"Using column: '{review_col}' for reviews")
            
            # Step 1: Basic cleaning (VERY MINIMAL)
            print("🧹 Step 1: Minimal cleaning...")
            df['cleaned_review'] = df[review_col].apply(self.clean_review_text)
            df_step1 = df[df['cleaned_review'].notna()].copy()
            print(f"   After minimal cleaning: {len(df_step1)} reviews (removed {len(df) - len(df_step1)})")
            
            # Step 2: English language filtering with progress bar (MORE LENIENT)
            print("🌍 Step 2: Lenient English filtering...")
            tqdm.pandas(desc="Language detection")
            df_step1['is_english'] = df_step1['cleaned_review'].progress_apply(self.is_english)
            df_english = df_step1[df_step1['is_english']].copy()
            print(f"   After English filtering: {len(df_english)} reviews (removed {len(df_step1) - len(df_english)})")
            
            # Step 3: Remove only exact duplicates
            print("🔄 Step 3: Removing exact duplicates...")
            initial_count = len(df_english)
            # Only remove exact duplicates, case-insensitive
            df_english['review_lower'] = df_english['cleaned_review'].str.lower()
            df_cleaned = df_english.drop_duplicates(subset=['review_lower'])
            df_cleaned = df_cleaned.drop('review_lower', axis=1)
            print(f"   After duplicate removal: {len(df_cleaned)} reviews (removed {initial_count - len(df_cleaned)})")
            
            # Step 4: Sort by rating if available
            if 'rating' in df_cleaned.columns:
                df_cleaned = df_cleaned.sort_values('rating', ascending=True)
            elif 'score' in df_cleaned.columns:
                df_cleaned = df_cleaned.sort_values('score', ascending=True)
            
            # Calculate statistics
            total_removed = len(df) - len(df_cleaned)
            removal_percentage = (total_removed / len(df)) * 100
            basic_removed = len(df) - len(df_step1)
            english_removed = len(df_step1) - len(df_english)
            duplicate_removed = initial_count - len(df_cleaned)
            
            print(f"\n✅ Final cleaned dataset: {len(df_cleaned)} reviews")
            print(f"📉 Total removed: {total_removed} reviews ({removal_percentage:.1f}%)")
            print(f"📈 Retention rate: {100 - removal_percentage:.1f}%")
            
            print(f"\n📊 Removal breakdown:")
            print(f"   Basic cleaning: {basic_removed} reviews ({basic_removed/len(df)*100:.1f}%)")
            print(f"   Non-English: {english_removed} reviews ({english_removed/len(df)*100:.1f}%)")
            print(f"   Duplicates: {duplicate_removed} reviews ({duplicate_removed/len(df)*100:.1f}%)")
            
            # Save the cleaned dataset
            print(f"\n💾 Saving cleaned dataset to: {self.cleaned_csv_path}")
            df_cleaned.to_csv(self.cleaned_csv_path, index=False)
            print("✅ Cleaned dataset saved successfully!")
            print("📝 Future runs will load from this cleaned file")
            
            return df_cleaned
            
        except Exception as e:
            print(f"❌ Error during cleaning: {e}")
            import traceback
            traceback.print_exc()
            return None
    
    def load_cleaned_dataset(self):
        """Load the already cleaned dataset"""
        try:
            if not os.path.exists(self.cleaned_csv_path):
                print(f"❌ Cleaned dataset not found: {self.cleaned_csv_path}")
                print("🔄 Running cleaning process first...")
                return self.clean_and_save_dataset()
            
            print(f"📂 Loading cleaned dataset from: {self.cleaned_csv_path}")
            df_cleaned = pd.read_csv(self.cleaned_csv_path)
            print(f"✅ Loaded {len(df_cleaned)} cleaned reviews")
            return df_cleaned
            
        except Exception as e:
            print(f"❌ Error loading cleaned dataset: {e}")
            print("🔄 Attempting to recreate cleaned dataset...")
            return self.clean_and_save_dataset()
    
    def process_review_batch(self, reviews_batch, batch_num, total_batches):
        """Run ABSA inference on each review of a batch and build CSV-ready rows.

        Two result shapes from ``infer_review_with_local_model`` are handled:
        a dict mapping aspect name to ``{'sentiment': ..., 'opinion': ...}``,
        and a list of ``{'aspect': ..., 'sentiment': ..., 'opinion': ...}``
        dicts. Any other shape yields a row with no aspects.

        Args:
            reviews_batch: List of review texts to label.
            batch_num: 1-based batch number, used only in the progress label.
            total_batches: Total number of batches, used only in the progress label.

        Returns:
            list[dict]: One row per review, in input order, with the keys
            ``review``, ``aspects``, ``aspects_sentiment`` and
            ``aspects_opinion`` (all values are strings). A review whose
            processing raises gets a placeholder row carrying the first 50
            characters of the error message.
        """
        results = []
        batch_len = len(reviews_batch)

        # The context manager closes the bar even if something escapes the loop.
        with tqdm(reviews_batch,
                  total=batch_len,
                  desc=f"Batch {batch_num}/{total_batches}",
                  leave=False) as pbar:
            for position, review in enumerate(pbar, start=1):
                try:
                    # Run ABSA inference
                    absa_result = infer_review_with_local_model(review)

                    # Parse the result
                    aspects_list = []
                    sentiment_dict = {}
                    opinion_dict = {}

                    if isinstance(absa_result, dict):
                        # Format: {'aspect_name': {'sentiment': '...', 'opinion': '...'}}
                        for aspect_name, aspect_data in absa_result.items():
                            if isinstance(aspect_data, dict):
                                aspects_list.append(aspect_name)
                                sentiment_dict[aspect_name] = aspect_data.get('sentiment', 'Unknown')
                                opinion_dict[aspect_name] = aspect_data.get('opinion', 'N/A')

                    elif isinstance(absa_result, list):
                        # Format: [{'aspect': '...', 'sentiment': '...', 'opinion': '...'}]
                        for aspect_item in absa_result:
                            if isinstance(aspect_item, dict):
                                aspect_name = aspect_item.get('aspect', f'aspect_{len(aspects_list)}')
                                aspects_list.append(aspect_name)
                                sentiment_dict[aspect_name] = aspect_item.get('sentiment', 'Unknown')
                                opinion_dict[aspect_name] = aspect_item.get('opinion', 'N/A')

                    # Render the unique aspects as a set literal in first-seen
                    # order. str(set(...)) would order string elements
                    # differently from run to run, making the output file
                    # non-reproducible.
                    unique_aspects = list(dict.fromkeys(aspects_list))
                    if unique_aspects:
                        aspects_repr = "{" + ", ".join(repr(a) for a in unique_aspects) + "}"
                    else:
                        aspects_repr = "set()"

                    results.append({
                        "review": review,
                        "aspects": aspects_repr,
                        "aspects_sentiment": str(sentiment_dict),
                        "aspects_opinion": str(opinion_dict),
                    })

                    # Update progress bar description with current progress
                    pbar.set_postfix({
                        'processed': f"{position}/{batch_len}",
                        'aspects': len(aspects_list),
                    })

                except Exception as e:
                    # Keep one output row per input review so that resuming by
                    # row count stays aligned; record the failure instead of
                    # printing. Building the value through str(dict) keeps it
                    # well-formed even if the message contains quotes.
                    results.append({
                        "review": review,
                        "aspects": "set()",
                        "aspects_sentiment": "{'error': 'processing_failed'}",
                        "aspects_opinion": str({'error': str(e)[:50]}),
                    })

        return results
    
    def create_labeled_dataset(self, max_reviews=None, batch_size=50, start_from=0):
        """Label cleaned reviews with the ABSA model and write them to the output CSV.

        Results are written batch by batch and flushed after each batch, so
        an interrupted run leaves a usable partial file.

        Args:
            max_reviews: Upper bound on the number of reviews to label. None
                means every review from ``start_from`` onward.
            batch_size: Number of reviews processed between flushes. Must be
                positive.
            start_from: Row offset into the cleaned dataset. A value above 0
                appends to the output file instead of overwriting it; the
                header is still written if that file is missing or empty.

        Returns:
            bool: True when labeling finished, False if the model or the
            cleaned dataset could not be loaded.

        Raises:
            ValueError: If ``batch_size`` is not positive or ``start_from`` is
                negative.
        """
        # Validate cheap arguments before the (slow) model load.
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        if start_from < 0:
            raise ValueError(f"start_from must not be negative, got {start_from}")

        # Initialize ABSA model
        print("🚀 Initializing ABSA model...")
        if not initialize_absa_model():
            print("❌ Failed to initialize ABSA model")
            return False

        self.model_loaded = True
        print("✅ ABSA model loaded successfully")

        # Load cleaned data (no cleaning/language detection here)
        df_cleaned = self.load_cleaned_dataset()
        if df_cleaned is None:
            return False
        if 'cleaned_review' not in df_cleaned.columns:
            print("❌ Cleaned dataset has no 'cleaned_review' column")
            return False

        # Determine how many reviews to process; never negative, even when
        # start_from is past the end of the dataset.
        remaining = max(0, len(df_cleaned) - start_from)
        if max_reviews is None:
            total_reviews = remaining
        else:
            total_reviews = max(0, min(max_reviews, remaining))

        print(f"\n🎯 Processing {total_reviews} reviews starting from index {start_from}")

        # Get the reviews to process
        reviews_to_process = (
            df_cleaned['cleaned_review'].iloc[start_from:start_from + total_reviews].tolist()
        )

        # Prepare CSV file
        fieldnames = ["review", "aspects", "aspects_sentiment", "aspects_opinion"]

        # Append when resuming; otherwise start a fresh file. The header is
        # also needed when appending to a file that does not exist yet or is empty.
        appending = start_from > 0
        file_mode = 'a' if appending else 'w'
        write_header = (
            not appending
            or not os.path.exists(self.output_csv_path)
            or os.path.getsize(self.output_csv_path) == 0
        )

        print(f"📝 Writing results to {self.output_csv_path}")

        with open(self.output_csv_path, file_mode, newline='', encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            if write_header:
                writer.writeheader()

            # Process in batches
            review_count = len(reviews_to_process)
            total_batches = (review_count + batch_size - 1) // batch_size

            # Main progress bar for overall progress; closed automatically.
            with tqdm(total=total_batches, desc="Overall Progress", unit="batch") as overall_pbar:
                batch_starts = range(0, review_count, batch_size)
                for batch_num, start_idx in enumerate(batch_starts, start=1):
                    batch_reviews = reviews_to_process[start_idx:start_idx + batch_size]
                    end_idx = start_idx + len(batch_reviews)

                    # Process batch
                    batch_results = self.process_review_batch(batch_reviews, batch_num, total_batches)

                    # Write batch results and flush to disk so progress survives a crash
                    writer.writerows(batch_results)
                    csvfile.flush()

                    # Update overall progress
                    overall_pbar.set_postfix({
                        'batch': f"{batch_num}/{total_batches}",
                        'reviews': f"{end_idx}/{review_count}",
                    })
                    overall_pbar.update(1)

                    # Brief pause to prevent overheating
                    time.sleep(0.5)

        print("\n🎉 Dataset labeling completed!")
        print(f"📊 Processed {total_reviews} reviews")
        print(f"💾 Results saved to: {self.output_csv_path}")

        return True
    
    def resume_labeling(self, batch_size=50):
        """Resume labeling from where it left off"""
        try:
            # Check existing output file
            existing_df = pd.read_csv(self.output_csv_path)
            completed_reviews = len(existing_df)
            print(f"📊 Found {completed_reviews} already processed reviews")
            
            # Continue from where we left off
            return self.create_labeled_dataset(
                max_reviews=None, 
                batch_size=batch_size, 
                start_from=completed_reviews
            )
            
        except FileNotFoundError:
            print("📝 No existing output file found, starting from beginning")
            return self.create_labeled_dataset(batch_size=batch_size)
    
    def create_sample_dataset(self, sample_size=1000, batch_size=50):
        """Label at most ``sample_size`` reviews - a quick trial run of the pipeline.

        Args:
            sample_size: Passed to ``create_labeled_dataset`` as ``max_reviews``.
            batch_size: Batch size forwarded unchanged.

        Returns:
            Whatever ``create_labeled_dataset`` returns.
        """
        print(f"🎯 Creating sample dataset with {sample_size} reviews")
        labeling_options = {"max_reviews": sample_size, "batch_size": batch_size}
        return self.create_labeled_dataset(**labeling_options)
    
    def clean_dataset_only(self, force_recreate=False):
        """Run just the cleaning / language-filtering stage and return its result.

        Args:
            force_recreate: Forwarded to ``clean_and_save_dataset``.

        Returns:
            Whatever ``clean_and_save_dataset`` returns.
        """
        cleaned = self.clean_and_save_dataset(force_recreate=force_recreate)
        return cleaned

# Usage functions
def clean_dataset_only(input_csv_path, cleaned_csv_path=None, force_recreate=False):
    """Clean ``input_csv_path`` and save the result, without any ABSA labeling.

    Returns whatever ``ReviewDatasetLabeler.clean_dataset_only`` returns.
    """
    return ReviewDatasetLabeler(
        input_csv_path,
        cleaned_csv_path=cleaned_csv_path,
    ).clean_dataset_only(force_recreate=force_recreate)

def create_full_dataset(input_csv_path, output_csv_path="absa_labeled_dataset.csv", cleaned_csv_path=None, batch_size=50):
    """Label the dataset derived from ``input_csv_path`` (cleaned cache is reused if available).

    Returns whatever ``ReviewDatasetLabeler.create_labeled_dataset`` returns.
    """
    return ReviewDatasetLabeler(
        input_csv_path,
        output_csv_path=output_csv_path,
        cleaned_csv_path=cleaned_csv_path,
    ).create_labeled_dataset(batch_size=batch_size)

def create_sample_dataset(input_csv_path, sample_size=1000, output_csv_path="absa_sample_dataset.csv", cleaned_csv_path=None, batch_size=50):
    """Label a small sample of the dataset, intended for testing.

    Returns whatever ``ReviewDatasetLabeler.create_sample_dataset`` returns.
    """
    return ReviewDatasetLabeler(
        input_csv_path,
        output_csv_path=output_csv_path,
        cleaned_csv_path=cleaned_csv_path,
    ).create_sample_dataset(sample_size=sample_size, batch_size=batch_size)

def resume_dataset_creation(input_csv_path, output_csv_path="absa_labeled_dataset.csv", cleaned_csv_path=None, batch_size=50):
    """Pick labeling back up after the rows already written to ``output_csv_path``.

    Returns whatever ``ReviewDatasetLabeler.resume_labeling`` returns.
    """
    return ReviewDatasetLabeler(
        input_csv_path,
        output_csv_path=output_csv_path,
        cleaned_csv_path=cleaned_csv_path,
    ).resume_labeling(batch_size=batch_size)

def main():
    """Run the interactive menu for cleaning and labeling the review dataset.

    Prompts for one of five actions, runs it, and prints exactly one final
    status line. Cleaning failures are reported, and an invalid choice or a
    cancelled run is not reported as a failed dataset creation.
    """
    # Configuration
    input_csv = "Google Play Store Review Dataset - Copy.csv"
    output_csv = "absa_labeled_dataset.csv"
    sample_output_csv = "absa_sample_dataset.csv"
    cleaned_csv = "Google Play Store Review Dataset - Copy_cleaned.csv"  # Auto-generated name

    print("🔍 App Review ABSA Dataset Creator")
    print("=" * 50)

    # Ask user what they want to do
    print("\nOptions:")
    print("1. Clean dataset only (prepare for future ABSA processing)")
    print("2. Create sample dataset (1000 reviews)")
    print("3. Create full dataset (all reviews)")
    print("4. Resume dataset creation")
    print("5. Force recreate cleaned dataset")

    choice = input("\nEnter your choice (1-5): ").strip()

    # --- Cleaning-only actions -------------------------------------------
    if choice in ("1", "5"):
        force_recreate = choice == "5"
        if force_recreate:
            print("\n🔄 Force recreating cleaned dataset...")
        else:
            print("\n🧹 Cleaning dataset only...")
            print("📝 This will create a cleaned dataset for faster future processing")

        cleaned = clean_dataset_only(input_csv, cleaned_csv, force_recreate=force_recreate)

        # The result is a DataFrame on success, so it must be compared with
        # None rather than tested for truthiness.
        if cleaned is None:
            print("\n❌ Dataset cleaning failed")
            return

        if force_recreate:
            print(f"\n✅ Cleaned dataset recreated: {cleaned_csv}")
        else:
            print(f"\n✅ Cleaned dataset saved to: {cleaned_csv}")
            print("📝 You can now run ABSA processing (options 2-4) which will be much faster!")
        print("\n🎉 Dataset cleaning completed successfully!")
        return

    # --- Labeling actions ------------------------------------------------
    if choice == "2":
        print("\n🧪 Creating sample dataset...")
        output_file = sample_output_csv
        success = create_sample_dataset(
            input_csv,
            sample_size=1000,
            output_csv_path=sample_output_csv,
            cleaned_csv_path=cleaned_csv,
            batch_size=25,
        )

    elif choice == "3":
        print("\n🚀 Creating full dataset...")
        confirm = input("This will process all reviews and may take several hours. Continue? (y/n): ")
        if confirm.strip().lower() != 'y':
            print("❌ Cancelled")
            return
        output_file = output_csv
        success = create_full_dataset(
            input_csv,
            output_csv,
            cleaned_csv_path=cleaned_csv,
            batch_size=50,
        )

    elif choice == "4":
        print("\n🔄 Resuming dataset creation...")
        output_file = output_csv
        success = resume_dataset_creation(
            input_csv,
            output_csv,
            cleaned_csv_path=cleaned_csv,
            batch_size=50,
        )

    else:
        print("❌ Invalid choice")
        return

    if success:
        print("\n🎉 Dataset creation completed successfully!")
        print(f"📂 Check your output file: {output_file}")
    else:
        print("\n❌ Dataset creation failed")


if __name__ == "__main__":
    main()