from google_play_scraper import reviews_all, Sort
from langdetect import detect, DetectorFactory
import pandas as pd
from langchain_ollama import OllamaLLM
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain.schema import Document
import tempfile
DetectorFactory.seed = 0 
import json
import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from requests.adapters import HTTPAdapter, Retry
import urllib3
from google_play_scraper import app
import html

from unsloth import FastLanguageModel
from transformers import AutoTokenizer
import torch
import ast

# Import the model manager
from model_loader import model_manager

def check_review_relevance_llm_json(reviews, app_info, recent_changes, target_version=None):
    """
    Ask the llama3.1 Ollama model which reviews are relevant to the app.

    Reviews are sent three at a time together with ``app_info``; each reply is
    handed to ``parse_json_response`` to pick the reviews to keep.

    Args:
        reviews: Sequence of review texts.
        app_info: App description text placed at the top of every prompt.
        recent_changes: Accepted but not used by this function.
        target_version: Accepted but not used by this function.

    Returns:
        The reviews kept across all batches. ``reviews`` is returned unchanged
        when ``reviews`` or ``app_info`` is empty, or when the check fails as a
        whole; a batch whose LLM call or parsing raises is kept in full.
    """
    # Nothing to judge, or nothing to judge against.
    if not (reviews and app_info):
        return reviews

    prompt_header = (
        "APP DESCRIPTION: {description}\n"
        "\n"
        "Analyze if each review is relevant to this app. Respond with JSON only:\n"
        "\n"
        "REVIEWS:\n"
    )
    prompt_footer = (
        "\nRespond with JSON format:\n"
        "{\n"
        '  "results": [\n'
        '    {"review_number": 1, "relevant": true, "reason": "discusses app feature"},\n'
        '    {"review_number": 2, "relevant": false, "reason": "about different service"}\n'
        "  ]\n"
        "}"
    )
    # Small batches keep each prompt short.
    reviews_per_prompt = 3

    try:
        llm = OllamaLLM(model="llama3.1")
        kept = []

        for start in range(0, len(reviews), reviews_per_prompt):
            chunk = reviews[start:start + reviews_per_prompt]
            numbered = "".join(
                f"{number}. {text}\n" for number, text in enumerate(chunk, start=1)
            )
            prompt = prompt_header.format(description=app_info) + numbered + prompt_footer

            try:
                answer = llm.invoke(prompt)
                kept.extend(parse_json_response(answer, chunk))
            except Exception as e:
                # Best effort: a failing batch is never dropped.
                print(f"Batch error: {e}, keeping all reviews in batch")
                kept.extend(chunk)

        print(f"LLM JSON filter: {len(reviews)} → {len(kept)} reviews")
        return kept

    except Exception as e:
        print(f"LLM JSON relevance check failed: {e}")
        return reviews

def parse_json_response(response, batch_reviews):
    """Filter one batch of reviews using the LLM's JSON relevance verdicts.

    The span from the first ``{`` to the last ``}`` of ``response`` is parsed
    as JSON and its ``"results"`` list is read. A review is dropped only when
    an entry explicitly marks it as not relevant. Reviews the model did not
    mention, entries with an unusable ``review_number`` and repeated verdicts
    for the same review never remove or duplicate a review, matching the
    keep-when-unsure policy of the caller.

    Args:
        response: Raw text returned by the LLM.
        batch_reviews: Reviews of the batch, in the order they were numbered
            (1-based) in the prompt.

    Returns:
        list: Reviews kept, in batch order. ``batch_reviews`` itself is
        returned when no JSON object can be found or parsed.
    """
    try:
        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        if json_match is None:
            return batch_reviews
        data = json.loads(json_match.group())
    except Exception as e:
        print(f"JSON parsing error: {e}")
        return batch_reviews

    results = data.get("results") if isinstance(data, dict) else None
    if not isinstance(results, list):
        results = []

    judged = set()    # indexes that already received a verdict (first one wins)
    rejected = set()  # indexes explicitly marked as not relevant

    for entry in results:
        if not isinstance(entry, dict):
            continue
        try:
            # Models sometimes quote the number ("1"); accept that as well.
            index = int(entry.get("review_number")) - 1
        except (TypeError, ValueError):
            continue
        if not 0 <= index < len(batch_reviews) or index in judged:
            continue
        judged.add(index)

        verdict = entry.get("relevant", True)
        if isinstance(verdict, str):
            # A quoted "false" is truthy in Python; interpret it as intended.
            verdict = verdict.strip().lower() not in ("false", "no", "0")
        reason = entry.get("reason", "")

        if verdict:
            print(f"✅ KEPT: {batch_reviews[index]}... ({reason})")
        else:
            rejected.add(index)
            print(f"❌ FILTERED: {batch_reviews[index]}... ({reason})")

    return [
        review
        for position, review in enumerate(batch_reviews)
        if position not in rejected
    ]


def initialize_absa_model():
    """Ask ``model_manager`` to load the ABSA model and report whether it worked.

    Returns:
        bool: True only when both the model and the tokenizer returned by
        ``model_manager.load_model()`` are not None.
    """
    print("🚀 Initializing ABSA model...")
    loaded_model, loaded_tokenizer = model_manager.load_model()
    if loaded_model is None:
        return False
    return loaded_tokenizer is not None

def infer_review_with_local_model(review):
    """Extract aspect/sentiment/opinion records from one review with the local model.

    The model and tokenizer come from ``model_manager.get_model()``; when
    either is missing a load is attempted through ``initialize_absa_model()``.
    The review is wrapped in an Alpaca-style prompt, up to 128 new tokens are
    generated greedily, and the generated text is parsed as a JSON array
    (falling back to a Python literal).

    Args:
        review: Review text to analyse.

    Returns:
        list[dict]: Parsed records. On any failure (model unavailable,
        unparsable output, exception during inference) a single placeholder
        record with aspect "general" and sentiment "neutral" is returned, whose
        "opinion" carries the raw output or the error text.
    """

    def neutral_fallback(opinion):
        # Placeholder record shape shared by every failure path.
        return [{
            "aspect": "general",
            "sentiment": "neutral",
            "opinion": opinion
        }]

    def parse_model_answer(answer):
        # The prompt asks for JSON, so try a real JSON parser first (it accepts
        # true/false/null); literal_eval covers single-quoted, Python-style
        # output.
        try:
            parsed = json.loads(answer)
        except ValueError:
            try:
                parsed = ast.literal_eval(answer)
            except Exception as e:
                print(f"JSON parsing error: {e}")
                # Return the raw response if parsing fails
                return neutral_fallback(answer[:200])
        if isinstance(parsed, dict):
            # A single object instead of an array: normalise to a list.
            return [parsed]
        if isinstance(parsed, list):
            return parsed
        return neutral_fallback(answer[:200])

    try:
        # Get the pre-loaded model
        model, tokenizer = model_manager.get_model()

        if model is None or tokenizer is None:
            print("❌ Model not loaded, attempting to load...")
            if not initialize_absa_model():
                return neutral_fallback("Model not loaded")
            model, tokenizer = model_manager.get_model()

        # Ensure model is in inference mode
        FastLanguageModel.for_inference(model)

        alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Analyze the following review and extract aspects with their sentiments and opinions. Return the result as a JSON array where each object contains "aspect", "sentiment", and "opinion" fields.

### Input:
{review}

### Response:
"""

        input_text = alpaca_prompt.format(review=review)
        inputs = tokenizer([input_text], return_tensors="pt")

        # Move to GPU if available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = {k: v.to(device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():  # Disable gradients for inference
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
                do_sample=False,
                temperature=1.0
            )

        # Decode only the tokens generated after the prompt. Searching the
        # full decoded text for "### Response:" breaks when the review itself
        # contains that marker, and used to yield None when it was not found.
        answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=False)
        answer = answer.split("<|end_of_text|>")[0].strip()

        result = parse_model_answer(answer)

        # Clean up tensors but keep model loaded
        del inputs, outputs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print(f"📝 Review: {review[:50]}...")
        print(f"🔍 ABSA Result: {result}")
        return result

    except Exception as e:
        print(f"❌ ABSA inference error: {e}")
        return neutral_fallback(f"Error: {str(e)[:100]}")



def detect_language(text):
    """Return what langdetect's ``detect`` reports for ``text``, or None.

    Args:
        text: Text to classify.

    Returns:
        The value returned by ``detect`` (this file compares it against
        ``'en'``), or None when ``text`` is not a non-blank string or detection
        raises.
    """
    # Non-strings and blank text cannot be classified; skip the detector call.
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        return detect(text)
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit.
        return None

# In tools.py, update get_app_changelog_versions function
def get_app_changelog_versions(app_id, verify_ssl=False):
    """
    Fetch the changelog versions from AppBrain changelog page for given app_id.

    Args:
        app_id (str): The app identifier used in the URL.
        verify_ssl (bool): Whether to verify SSL certificates. Default False,
                           i.e. verification is disabled (insecure) unless the
                           caller passes True.
                           NOTE(review): consider making True the default; the
                           SSL-error retry below already falls back to False.

    Returns:
        List[str]: Version strings ordered by latest date first, without
        duplicates or empty entries. An empty list is returned when the request
        fails or the changelog list is not present in the page.

    Raises:
        requests.exceptions.SSLError: If an SSL error occurs while verification
            is already disabled.
    """
    url = f"https://www.appbrain.com/app_details/{app_id}?changelog=true"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36"
    }

    # Retry transient failures of GET requests
    retries = Retry(
        total=5,
        backoff_factor=0.3,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"]
    )

    if not verify_ssl:
        # Disable warnings if SSL verification disabled
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # The session is used as a context manager so its connections are released
    # on every exit path (previously the session was never closed).
    with requests.Session() as session:
        adapter = HTTPAdapter(max_retries=retries)
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        try:
            response = session.get(url, headers=headers, timeout=10, verify=verify_ssl)
            response.raise_for_status()
        except requests.exceptions.SSLError:
            if verify_ssl:
                # If SSL error, try again with SSL verify disabled as fallback
                print("[Warning] SSL error occurred, retrying with SSL verification disabled...")
                return get_app_changelog_versions(app_id, verify_ssl=False)
            # Already tried disabling SSL verify, re-raise
            raise
        except requests.RequestException as e:
            print(f"Request failed: {e}")
            return []

        page_html = response.text

    soup = BeautifulSoup(page_html, "html.parser")
    changelog_ul = soup.find("ul", class_="app-changelog")
    if not changelog_ul:
        print("Changelog section not found.")
        return []

    entries = []
    for li in changelog_ul.find_all("li"):
        date_span = li.find("span", class_="date")
        desc_span = li.find("span", class_="description")
        if not (date_span and desc_span):
            continue
        try:
            date_obj = datetime.strptime(date_span.text.strip(), "%b %d, %Y")
        except ValueError:
            # Entries whose date does not match e.g. "Jan 02, 2024" are skipped.
            continue
        version_text = desc_span.text.strip().replace("Version", "").strip()
        entries.append((date_obj, version_text))

    # Sort latest first; the sort is stable, so same-day entries keep page order.
    entries.sort(key=lambda entry: entry[0], reverse=True)

    # Remove duplicates and empty versions while preserving order
    return list(dict.fromkeys(version for _, version in entries if version))

def save_app_info_raw_text(app_id, lang='en', country='us', output_file="app_info.txt"):
    """Fetch an app's title and description and save them as plain text.

    Args:
        app_id: App identifier passed to google_play_scraper's ``app``.
        lang: Language passed to ``app``.
        country: Country passed to ``app``.
        output_file: Path of the UTF-8 text file to (over)write.

    Returns:
        str: The text that was written, in the form
        ``"Title: ...\\nDescription: ...\\n"``; missing fields become empty.
    """
    details = app(app_id, lang=lang, country=country)
    title = details.get('title', '')
    description = details.get('description', '')
    text = "Title: {}\nDescription: {}\n".format(title, description)

    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write(text)

    return text


def get_recent_changes(app_id):
    """
    Fetch the "Recent changes:" part of an app's description from AppBrain.

    The description is taken from the ``data-contents`` attribute of the
    ``descLink`` anchor when present, otherwise from the ``descContents``
    element.

    Args:
        app_id (str): The app identifier used in the URL.

    Returns:
        str or None: The text following "Recent changes:", stripped. The
        literal messages "Description content not found" or "No recent changes
        found in description" are returned when the description or the marker
        is missing, and None when the HTTP request fails.
    """
    page_url = f"https://www.appbrain.com/app/{app_id}"
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36"
    }
    marker = "Recent changes:"

    try:
        page = requests.get(page_url, headers=request_headers, timeout=10)
        page.raise_for_status()

        document = BeautifulSoup(page.text, "html.parser")

        # Preferred source: the full description stored on the anchor.
        anchor = document.find("a", {"id": "descLink"})
        embedded = anchor.get("data-contents") if anchor else None
        if embedded:
            # Decode HTML entities, then drop the markup.
            description = BeautifulSoup(html.unescape(embedded), "html.parser").get_text()
        else:
            # Fallback: the visible description container.
            container = document.find(id="descContents")
            if not container:
                return "Description content not found"
            description = container.get_text()

        _, found, tail = description.partition(marker)
        if not found:
            return "No recent changes found in description"
        return tail.strip()

    except requests.RequestException as error:
        print(f"Request failed: {error}")
        return None

def filter_reviews_by_aspect(reviews_df, query, similarity_threshold=0.1):
    """
    Collect the review texts that match the aspects named in a developer query.

    Args:
        reviews_df: DataFrame with a 'review_description' column
        query: Developer query (e.g., "login issues" or "login and payment issues")
        similarity_threshold: Minimum similarity score forwarded to the
            per-aspect search (default 0.1)

    Returns:
        list: Matching review texts without duplicates, in first-seen order;
        empty when the frame is empty, lacks the column, or no aspect is found
    """
    if reviews_df.empty:
        return []
    if 'review_description' not in reviews_df.columns:
        return []

    aspects = parse_aspects_from_query(query)
    if not aspects:
        return []

    # Gather the hits of every aspect, one aspect after the other.
    matches = []
    for aspect in aspects:
        matches += get_reviews_for_aspect(reviews_df, aspect, similarity_threshold)

    # dict keys keep first-insertion order, which drops repeats in place.
    return list(dict.fromkeys(matches))

def parse_aspects_from_query(query):
    """Extract app aspects from a developer query using the llama3.1 Ollama model.

    The model is asked for a comma-separated list. Its reply is lower-cased,
    anything before the last "→" is discarded (the model may echo the example
    format), and the rest is split on commas, semicolons, ampersands and the
    standalone word "and".

    Args:
        query: Developer query text.

    Returns:
        list[str]: Aspects without quotes, blanks, single-character fragments
        or duplicates, in reply order. ``fallback_aspect_detection(query)`` is
        returned when the reply yields nothing or the LLM call raises.
    """
    try:
        llm = OllamaLLM(model="llama3.1")

        prompt = f"""Analyze the following developer query and extract mobile app aspects mentioned.

QUERY: "{query}"

Common mobile app aspects include: login, payment, UI, performance, crash, network, notification, battery, data, security, content, navigation, search, settings, feedback, social, offline, sync, media, accessibility, etc.

Instructions:
- Extract specific aspects mentioned in the query
- Return only relevant aspects, not all possible ones
- If no specific aspects found, return "general"
- Be specific (e.g., "login" not "authentication system")
- Return as comma-separated list

Examples:
Query: "login issues in version 2.1" → login
Query: "payment and UI problems" → payment, ui
Query: "app crashes when loading" → crash, performance
Query: "what are the main problems" → general

Extract aspects from the query above:"""

        response = llm.invoke(prompt)
        response_text = response.strip().lower()

        # Handle the example format: keep what follows the last arrow.
        if "→" in response_text:
            aspects_part = response_text.split("→")[-1].strip()
        else:
            aspects_part = response_text

        # Split on every delimiter at once. "and" must be a whole word:
        # a plain substring split cut words such as "android" or "brand" in
        # half, and only the first delimiter found used to be applied.
        pieces = re.split(r'\s*(?:[,;&]|\band\b)\s*', aspects_part)

        cleaned_aspects = []
        for piece in pieces:
            aspect = piece.replace('"', '').replace("'", '').strip()
            if len(aspect) > 1 and aspect not in cleaned_aspects:
                cleaned_aspects.append(aspect)

        if not cleaned_aspects:
            # Nothing usable in the reply: use simple keyword detection.
            return fallback_aspect_detection(query)

        return cleaned_aspects

    except Exception as e:
        print(f"❌ LLM aspect extraction error: {e}")
        # Fallback to simple detection
        return fallback_aspect_detection(query)

def fallback_aspect_detection(query):
    """Guess aspects from a query by plain substring matching.

    Used as the last resort when the LLM-based extraction is unavailable.

    Args:
        query: Developer query text; matching is case-insensitive.

    Returns:
        list[str]: Every aspect with at least one trigger contained in the
        query, in table order, or ``['general']`` when nothing matches.
    """
    trigger_table = (
        ('login', ('login', 'log in', 'sign in', 'auth')),
        ('payment', ('payment', 'pay', 'purchase')),
        ('crash', ('crash', 'freeze', 'bug', 'error')),
        ('ui', ('ui', 'interface', 'design')),
        ('performance', ('slow', 'speed', 'lag', 'performance')),
    )

    lowered = query.lower()
    matched = [
        aspect
        for aspect, triggers in trigger_table
        if any(trigger in lowered for trigger in triggers)
    ]
    return matched or ['general']

def get_reviews_for_aspect(reviews_df, aspect, similarity_threshold):
    """Return the review texts relevant to one aspect.

    A keyword pre-filter chooses the candidates handed to ``semantic_search``:
    all reviews when nothing matches, a fixed-seed sample of 100 when more
    than 100 match, and the matches themselves otherwise.

    Args:
        reviews_df: DataFrame with a 'review_description' column.
        aspect: Aspect name to look for.
        similarity_threshold: Minimum similarity forwarded to the search.

    Returns:
        list: Whatever ``semantic_search`` returns for the chosen candidates.
    """
    max_candidates = 100

    candidates = keyword_filter(reviews_df, aspect)
    match_count = len(candidates)

    if match_count == 0:
        # No keyword hit at all: search the whole frame instead.
        candidates = reviews_df
    elif match_count > max_candidates:
        # Too many hits: search a reproducible sample.
        candidates = candidates.sample(n=max_candidates, random_state=42)

    return semantic_search(candidates, aspect, similarity_threshold)

def keyword_filter(reviews_df, aspect):
    """Keep the rows whose review text contains a keyword of the aspect.

    Known aspects use a fixed keyword list; any other aspect is matched by its
    own lower-cased name. Matching is a case-insensitive substring test on
    ``str(review_description)``.

    Args:
        reviews_df: DataFrame with a 'review_description' column.
        aspect: Aspect name (case-insensitive).

    Returns:
        DataFrame: The matching rows of ``reviews_df``.
    """
    aspect_key = aspect.lower()
    synonyms = {
        'login': ['login', 'log in', 'sign in', 'signin', 'auth', 'password', 'username'],
        'payment': ['payment', 'pay', 'purchase', 'billing', 'charge', 'transaction', 'credit'],
        'crash': ['crash', 'freeze', 'stuck', 'hang', 'error', 'bug', 'broken'],
        'ui': ['ui', 'interface', 'design', 'layout', 'button', 'menu', 'screen'],
        'performance': ['slow', 'speed', 'lag', 'loading', 'performance', 'fast'],
        'network': ['network', 'internet', 'connection', 'offline', 'sync'],
        'notification': ['notification', 'alert', 'push', 'reminder'],
        'battery': ['battery', 'drain', 'power', 'usage'],
        'data': ['data', 'storage', 'memory', 'backup', 'restore'],
    }
    needles = synonyms.get(aspect_key, [aspect_key])

    def mentions_aspect(text):
        haystack = str(text).lower()
        for needle in needles:
            if needle in haystack:
                return True
        return False

    mask = reviews_df['review_description'].apply(mentions_aspect)
    return reviews_df[mask]

def semantic_search(reviews_df, aspect, similarity_threshold):
    """Return the review texts whose embedding is close enough to the aspect.

    Every review is embedded with the "mxbai-embed-large" Ollama model into a
    Chroma store living in a temporary directory, and all documents are ranked
    against ``aspect``. The score returned by the store is treated as a
    distance and turned into ``max(0, 2 - score)`` before being compared with
    the threshold.

    Args:
        reviews_df: DataFrame with a 'review_description' column.
        aspect: Query text to search for.
        similarity_threshold: Minimum converted similarity for a review to be kept.

    Returns:
        list: Matching review texts in the order the store returned them;
        empty when the frame is empty or any step raises.
    """
    if reviews_df.empty:
        return []

    try:
        renumbered = reviews_df.reset_index(drop=True)
        embedder = OllamaEmbeddings(model="mxbai-embed-large")

        # One document per review; the metadata records its row position.
        docs = [
            Document(page_content=text, metadata={'index': position})
            for position, text in enumerate(renumbered['review_description'])
        ]

        with tempfile.TemporaryDirectory() as scratch_dir:
            store = Chroma.from_documents(
                documents=docs,
                embedding=embedder,
                persist_directory=scratch_dir
            )

            # Rank every document so the threshold alone decides what is kept.
            scored = store.similarity_search_with_score(aspect, k=len(docs))

            return [
                doc.page_content
                for doc, distance in scored
                if max(0, 2 - distance) >= similarity_threshold
            ]

    except Exception as e:
        print(f"Semantic search error: {e}")
        return []

def scrape_english_reviews_with_aspects(
    app_ids,
    num_reviews=100,
    lang='en',
    country='us',
    output_file="english_app_reviews_with_aspects.csv",
    versions=None,  # list of versions to keep, or None
    max_latest_versions=3,
):
    """Fetch Google Play reviews, keep usable English ones and save them as CSV.

    For each app the reviews are fetched newest first, reduced to rating, id,
    text, date and version, stripped of blank or very short texts, limited to
    the requested versions and finally to reviews detected as English. Each
    app with remaining reviews is written to ``{app_id}_reviews.csv``; all
    apps together are written to ``output_file``.

    Args:
        app_ids: Iterable of Google Play app identifiers.
        num_reviews: Passed to ``reviews_all`` as ``count``.
            NOTE(review): ``reviews_all`` appears to ignore ``count`` and fetch
            every review — confirm against the google_play_scraper docs.
        lang: Language passed to ``reviews_all``.
        country: Country passed to ``reviews_all``.
        output_file: Path of the combined CSV file.
        versions: Non-empty list (tuple or set also accepted) of version
            strings to keep. When empty or None, the newest changelog versions
            from AppBrain are used instead.
        max_latest_versions: How many of the newest changelog versions to keep
            when ``versions`` is not given (default 3).

    Returns:
        None. Errors for a single app are printed and that app is skipped.
    """
    column_names = {
        'score': 'rating',
        'reviewId': 'review_id',
        'content': 'review_description',
        'at': 'review_date',
        'reviewCreatedVersion': 'review_created_version',
    }
    combined_reviews = []

    for app_id in app_ids:
        try:
            print(f"Fetching reviews for app: {app_id}")
            reviews = reviews_all(
                app_id,
                sleep_milliseconds=0,
                lang=lang,
                country=country,
                sort=Sort.NEWEST,
                count=num_reviews
            )
            if not reviews:
                # An empty frame has no 'content' column; say so explicitly
                # instead of surfacing a KeyError as a fetch error.
                print(f"No reviews returned for {app_id}")
                continue

            df = pd.DataFrame(reviews)

            # Keep rows with non-blank text, only the required columns, and
            # rename them. ``.loc`` + ``rename`` build a new frame, so nothing
            # below mutates a slice of another frame. ``astype(bool)`` keeps
            # the mask boolean even when no row is left.
            has_text = df['content'].apply(
                lambda x: isinstance(x, str) and x.strip() != ''
            ).astype(bool)
            df = df.loc[has_text, list(column_names)].rename(columns=column_names)

            # Remove reviews with short text (<= 10 chars or <= 2 words)
            long_enough = df['review_description'].apply(
                lambda x: len(x.strip()) > 10 and len(x.strip().split()) > 2
            ).astype(bool)
            df = df[long_enough]

            # Filter by version
            if isinstance(versions, (list, tuple, set)) and versions:
                wanted_versions = list(versions)
            else:
                # No versions given: use the newest official changelog
                # versions from AppBrain (duplicates removed, order kept).
                changelog_versions = get_app_changelog_versions(app_id)
                wanted_versions = list(dict.fromkeys(changelog_versions))[:max_latest_versions]
                print(f"Latest versions for {app_id}: {wanted_versions}")
            df = df[df['review_created_version'].isin(wanted_versions)]

            # Keep only reviews detected as English
            print("Detecting languages...")
            languages = df['review_description'].apply(detect_language)
            df = df[languages == 'en']

            if not df.empty:
                combined_reviews.append(df)
                df.to_csv(f"{app_id}_reviews.csv", index=False)
        except Exception as e:
            print(f"Error fetching reviews for {app_id}: {e}")

    # Combine all reviews into a single DataFrame and save
    if combined_reviews:
        combined_df = pd.concat(combined_reviews, ignore_index=True)
        combined_df.to_csv(output_file, index=False)
        print(f"English reviews with aspects saved to {output_file}")
    else:
        print("No English reviews were fetched.")

def filter_negative_aspects(absa_json):
    """
    Remove positive sentiment aspects, keep only negative/neutral.

    Expects absa_json to have an "aspects" key with a list of dicts. The
    sentiment of an entry is read from the first nested dict that carries a
    string "sentiment" (e.g. ``{"ads": {"sentiment": "negative"}}``) or, failing
    that, from a flat ``"sentiment"`` field (e.g. ``{"aspect": "ads",
    "sentiment": "negative", "opinion": "..."}``). Comparison ignores case and
    surrounding whitespace.

    Args:
        absa_json: ABSA result; modified in place when it has a list under
            "aspects".

    Returns:
        The same ``absa_json`` object. Entries whose sentiment is positive,
        missing or not a string are removed from "aspects".
    """
    if 'aspects' not in absa_json:
        return absa_json

    aspects = absa_json['aspects']
    if not isinstance(aspects, list):
        # Nothing that can be filtered entry by entry.
        return absa_json

    def get_sentiment(aspect_dict):
        if not isinstance(aspect_dict, dict):
            return None
        # Nested form first, so results for that shape are unchanged.
        for value in aspect_dict.values():
            if isinstance(value, dict) and isinstance(value.get('sentiment'), str):
                return value['sentiment'].strip().lower()
        # Flat form, as produced by the ABSA prompt used in this file.
        flat = aspect_dict.get('sentiment')
        if isinstance(flat, str):
            return flat.strip().lower()
        return None

    absa_json['aspects'] = [
        aspect for aspect in aspects
        if get_sentiment(aspect) in ('negative', 'neutral')
    ]
    return absa_json


if __name__ == "__main__":
    # Manual smoke test: load the ABSA model once and run it several times.
    app_ids = ["com.a10minuteschool.tenminuteschool"]

    # Other entry points that can be tried by hand:
    #
    #   scrape_english_reviews_with_aspects(app_ids, versions=["3.9.8.4"])
    #   scrape_english_reviews_with_aspects(app_ids)
    #
    #   versions = get_app_changelog_versions("com.a10minuteschool.tenminuteschool")
    #   print("Changelog versions (latest first):")
    #   for v in versions:
    #       print(v)
    #
    #   reviews_df = pd.read_csv("english_app_reviews_with_aspects.csv")
    #   reviews_df = reviews_df[reviews_df['review_description'].notna()]
    #   print(len(reviews_df))
    #   print("=== LOGIN AND PAYMENT ISSUES ===")
    #   multi_reviews = filter_reviews_by_aspect(reviews_df, "login and payment problems")
    #   print(f"Found {len(multi_reviews)} relevant reviews:")
    #   for i, review in enumerate(multi_reviews, 1):
    #       print(f"{i}. {review}")
    #
    #   filtered_reviews = check_review_relevance_llm_json(reviews, app_info, recent_changes)
    #   print(f"\n🎯 Final Relevant Reviews ({len(filtered_reviews)}):")
    #   for i, review in enumerate(filtered_reviews, 1):
    #       print(f"{i}. {review}")

    print("🚀 Starting ABSA system...")
    if not initialize_absa_model():
        print("❌ Failed to initialize ABSA model")
    else:
        print("🎉 ABSA model ready for inference!")

        # Sample review used for every run below
        sample_review = "love uno truly one favorite card game think good decision made limit game 3 minute piece however two complaint ad economy game rife ad legitimately every match there ad unless pay play ad free second uno need economy game requires coin play lose lose even coin run cant play unless buy coin watch ad"

        print("\n🧪 Testing inference...")
        outcome = infer_review_with_local_model(sample_review)
        print(f"Result: {outcome}")

        # Repeated calls reuse the model that is already loaded
        print("\n🧪 Testing multiple inferences...")
        for run_number in range(1, 4):
            print(f"\nTest {run_number}:")
            outcome = infer_review_with_local_model(sample_review)
            print(f"Result: {outcome}")

        # Optional: cleanup when done
        model_manager.unload_model()

