import base64
import json
import os

from openai import OpenAI

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string.

    The file is read in binary mode and the base64 bytes are decoded as
    UTF-8 so the result can be embedded directly in a data URL.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Shared chat-completions client used by every helper in this file.
# NOTE(review): both values are empty strings here -- presumably placeholders
# that must be filled in before the script can reach an endpoint.
client = OpenAI(api_key="", base_url="")

def generate_multi_scale_caption(img_path):
    """Describe a remote sensing image at three levels of detail.

    One chat-completion request is issued per scale defined in ``config``
    (``"global"``, ``"large_scale"``, ``"small_scale"``), each with its own
    system and user prompt and the same base64-encoded image.

    Args:
        img_path: Path of the image file to describe.

    Returns:
        A dict mapping each scale key to the description text returned by
        the model for that scale.
    """
    img_base64 = encode_image(img_path)
    # Per-scale prompt settings. The "examples" entries are kept for
    # reference; they are not included in the prompts sent below.
    config = {
        "global": {
            "name": "Global Scene Description",
            "focus": "overall scene and primary land use types",
            "prompt": (
                "Generate an overall caption and scene description for this remote sensing image. "
                "Focus on the primary land use types, general landscape characteristics, and basic scene attributes. "
                "Consider various possibilities including urban areas, agricultural land, forests, water bodies, "
                "industrial zones, residential areas, transportation infrastructure, natural landscapes, etc."
            ),
            "examples": [
                "What type of landscape is this?",
                "What are the main land use types in the image?",
                "What are the overall characteristics of this scene?"
            ]
        },
        "large_scale": {
            "name": "Large-scale Object Recognition",
            "focus": "major infrastructure and large geographical features",
            "prompt": (
                "Identify and describe large-scale objects and features in this remote sensing image. "
                "Focus on major infrastructure, large buildings, main roads, water bodies, terrain features, "
                "and other prominent large-scale elements. Consider various types of infrastructure such as "
                "airports, ports, industrial facilities, major transportation networks, large residential complexes, "
                "commercial centers, stadiums, bridges, dams, etc."
            ),
            "examples": [
                "Are there any major roads or highways?",
                "What large buildings or structures are present?",
                "Are there any significant water bodies?",
                "Are there obvious terrain features?"
            ]
        },
        "small_scale": {
            "name": "Small-scale Object Details",
            "focus": "detailed features and small objects",
            "prompt": (
                "Identify and describe small-scale objects and detailed features in this remote sensing image. "
                "Focus on specific small objects, texture patterns, vegetation details, vehicles, small buildings, "
                "and fine-grained features. Consider various small objects such as individual cars, trucks, boats, "
                "aircraft, small houses, trees, agricultural equipment, street furniture, parking lots, "
                "swimming pools, tennis courts, etc."
            ),
            "examples": [
                "What types of vehicles can be identified?",
                "What is the ground texture like?",
                "Are there small buildings or houses?",
                "What are the specific types of vegetation?"
            ]
        }
    }

    # The image part is identical for every scale, so build it once.
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{img_base64}"},
    }

    captions = {}
    # The prompt fields live on the per-scale dicts, not on ``config``
    # itself, so each scale has to be queried separately.
    for scale_key, scale in config.items():
        system_prompt = (
            f"You are a professional remote sensing image analysis expert specializing in {scale['name']}.\n"
            f"Current task: {scale['prompt']}\n\n"
            "Please generate a concise and accurate description (no more than 50 words) based on the image content. "
            "Respond in English."
        )
        user_prompt = (
            f"Please perform {scale['name']} on this remote sensing image, "
            f"focusing on {scale['focus']}."
        )

        completion = client.chat.completions.create(
            model="",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": [
                    {"type": "text", "text": user_prompt},
                    image_part,
                ]},
            ],
        )
        captions[scale_key] = completion.choices[0].message.content

    return captions

def generate_questions_referring(dialogue_history, candidates):
    """Ask the model for one question that separates the target from candidates.

    Args:
        dialogue_history: Sequence whose first item is the textual
            description of the target image. Each later item is one
            question/answer turn, given either as a ``"question? answer"``
            string or as a ``(question, answer)`` pair. May be empty.
        candidates: Paths of retrieved candidate images. Only the first
            three are considered, and paths that do not exist on disk are
            skipped.

    Returns:
        The text of the model's reply (the newly generated question).
    """
    max_candidates = 3
    candidate_images = [
        encode_image(path)
        for path in (candidates or [])[:max_candidates]
        if os.path.exists(path)
    ]
    prompt_sys = (
        "You are a proficient question generator specialized in remote sensing imagery analysis. "
        "The user has a target image in mind but only provides textual descriptions. "
        "You do NOT see the target image. Instead, you see the top retrieved candidate images "
        "and the current textual query + dialogue history.\n\n"
        "Your task: Generate ONE concise, discriminative question that helps "
        "clarify what distinguishes the user's intended target from these candidates.\n\n"
        "Guidelines:\n"
        "1. Base your question ONLY on visible differences among the candidate images.\n"
        "2. Focus on remote sensing features: land use, infrastructure, object density, layout, texture.\n"
        "3. Avoid repeating prior questions.\n"
        "4. Do NOT assume content not visible in the candidates.\n"
        "5. Phrase questions that guide the user to specify missing discriminative attributes."
    )

    description = dialogue_history[0] if dialogue_history else ""
    text_parts = [f"[Description]\n{description}\n[Dialogue]\n"]

    for turn in dialogue_history[1:]:
        if isinstance(turn, str):
            # partition() splits on the first "? " only, so an answer that
            # itself contains "? " no longer breaks the unpacking.
            question, _, answer = turn.partition('? ')
        else:
            question, answer = turn
        question = str(question).strip()
        # The template below re-adds the question mark.
        if question.endswith('?'):
            question = question[:-1]
        text_parts.append(f"Question: {question}?\nAnswer: {answer}\n")

    # A user message that carries images must be a list of typed parts;
    # text and image parts are interleaved in the order the model should
    # read them.
    user_content = [{"type": "text", "text": "".join(text_parts)}]
    for index, encoded in enumerate(candidate_images, start=1):
        user_content.append({"type": "text", "text": f"[Candidate {index}]"})
        user_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{encoded}"},
        })
    user_content.append({"type": "text", "text": "\nNew Question: "})

    completion = client.chat.completions.create(
        model="",
        messages=[
            {"role": "system", "content": prompt_sys},
            {"role": "user", "content": user_content}
        ]
    )
    return completion.choices[0].message.content

def generate_answer(img_path, query):
    """Answer ``query`` about the image stored at ``img_path``.

    The image is base64-encoded and sent together with a short instruction
    in a single user message; the text of the model's reply is returned.
    """
    instruction = (
        "Provide a direct and concise answer to the question based on the image, "
        "without any analysis or irrelevant details.\n"
        f"Question: {query} Answer:"
    )
    encoded_image = encode_image(img_path)
    data_url = f"data:image/png;base64,{encoded_image}"

    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }

    response = client.chat.completions.create(model="", messages=[user_message])
    first_choice = response.choices[0]
    return first_choice.message.content

def reconstruct_dialog(dial):
    """Merge a caption and its follow-up dialogue into one richer caption.

    Args:
        dial: Sequence whose first item is the original caption. Each later
            item is one dialogue turn, given either as a
            ``"question? answer"`` string or as a ``(question, answer)``
            pair. May be empty.

    Returns:
        The text of the model's reply (the reconstructed caption).
    """
    caption = dial[0] if dial else ""

    # str.join only accepts strings, so pair-style turns are flattened to
    # "question answer" text first.
    turns = []
    for turn in dial[1:]:
        if isinstance(turn, (tuple, list)):
            turns.append(' '.join(str(part) for part in turn))
        else:
            turns.append(str(turn))
    dialog = ', '.join(turns)

    # One worked example (few-shot) shown to the model before the real input.
    dialog_exemplar = ', '.join([
        "is there a runway? yes, a single long runway with parallel taxiways",
        "is it near water? no, surrounded by flat dry land",
        "are there buildings nearby? yes, several low-rise terminal buildings to the east",
        "is the area urban or rural? rural, with sparse infrastructure",
        "are there vehicles on the runway? no, but a few parked near terminals",
        "what is the dominant land cover? bare soil and sparse grass",
        "is there a control tower? yes, a small tower near the main terminal",
        "are there hangars? yes, two large metal hangars to the west",
        "is the airport large? no, it appears to be a small regional airport",
        "is there vegetation around? minimal, mostly dry and arid"
    ])

    completion = client.chat.completions.create(
        model="",
        messages=[
            {"role": "system", "content": (
                "Your role is to reconstruct the [Caption] using the additional information given in the [Dialogue]. "
                "The reconstructed [New Caption] should be concise and descriptive enough to retrieve the target image from candidates."
            )},
            {"role": "user", "content": f"[Caption]: an airport with a runway [Dialogue]: {dialog_exemplar}  [New Caption]: "},
            {"role": "assistant", "content": (
                "a small regional airport in a rural arid area, featuring a single long runway with parallel taxiways, several low-rise terminal buildings and a small control tower to the east, two large metal hangars to the west, few vehicles parked near terminals, surrounded by bare soil and sparse dry grass with minimal vegetation"
            )},
            {"role": "user", "content": f"[Caption]: {caption} [Dialogue]: {dialog}  [New Caption]: "}
        ]
    )
    return completion.choices[0].message.content

def generate_atomic_instructions(dialogue_history):
    """Ask the model to split a dialogue into atomic instructions and statements.

    ``dialogue_history`` is substituted into the prompt template with
    ``str.format`` and the request asks for a JSON-object response. The raw
    reply text is returned without being parsed.
    """
    prompt_template = """
    Analyze the following dialogue and perform two steps:
    
    Step 1: Instruction Classification and Impact Analysis.
    Classify the given instruction into the following types and identify how it affects the reference image. For each type, determine the specific elements or attributes of the reference image that are impacted. The instruction types are:
    (1) Addition: Introduces new elements or features to the reference image. Identify which existing element the addition relates to or where it should be placed.
    (2) Removal: Eliminates certain elements from the reference image. Identify which existing element is removed.
    (3) Modification: Alters attributes of existing elements in the reference image. Determine which specific element is being modified and how.
    (4) Retention: Explicitly preserve components key to the query. Identify elements and attributes being compared.
    
    Step 2: Statement Sentence Conversion.
    Convert each atomic instruction into a statement sentence attributes.
    
    ------
    Example:
    User: Add an area with farmland that has a regular grid pattern.
    Assistant: The remote sensing image shows an urban area and its surroundings.
    
    Step 1: Based on the instruction:
    - Addition: Add farmland with a regular grid pattern.
    
    Step 2: Based on the atomic instructions, the statements are:
    - The image contains a farmland area
    - The farmland has a regular grid pattern
    - The farmland is located within the visible area of the image
    
    Dialogue content:
    {dialogue}
    
    Please return the results in JSON format, including the following fields:
    1. atomic_instructions: List of atomic instructions, each containing type and content
    2. attribution_sentences: List of converted statement sentences
    """

    system_message = {
        "role": "system",
        "content": "You are a professional dialogue analysis assistant, skilled at extracting instructions from dialogues and classifying them.",
    }
    user_message = {
        "role": "user",
        "content": prompt_template.format(dialogue=dialogue_history),
    }

    response = client.chat.completions.create(
        model="",
        messages=[system_message, user_message],
        response_format={"type": "json_object"},
    )
    reply = response.choices[0].message
    return reply.content

def _parse_keyword_list(raw_text):
    """Best-effort conversion of a model reply into a Python list.

    The whole reply is tried as JSON first. If that fails, or does not
    yield a list, the span from the first ``[`` to the last ``]`` is tried,
    which copes with replies wrapped in markdown fences or extra prose.
    Returns ``[]`` when no JSON list can be recovered.
    """
    text = (raw_text or "").strip()
    attempts = [text]
    start, end = text.find("["), text.rfind("]")
    if 0 <= start < end:
        attempts.append(text[start:end + 1])

    for attempt in attempts:
        try:
            parsed = json.loads(attempt)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, list):
            return parsed
    return []


def extract_keywords_from_instructions(instructions_text):
    """Extract visually observable keywords from instruction text.

    Args:
        instructions_text: Text of the instructions to mine for keywords;
            it is appended verbatim to the prompt.

    Returns:
        The list decoded from the model's reply, or ``[]`` when the reply
        is empty, is not valid JSON, or does not contain a JSON list.
    """
    prompt = (
        "You are an expert in remote sensing imagery analysis. "
        "Extract ONLY concrete, visually observable keywords from the instructions below that are relevant for cross-modal image-text alignment in remote sensing.\n\n"

        "Focus on these categories:\n"
        "- Land cover / land use: e.g., forest, farmland, wetland, urban, desert, water body\n"
        "- Infrastructure: e.g., runway, highway, bridge, parking lot, solar panels, wind turbines, stadium\n"
        "- Objects & structures: e.g., vehicle, ship, airplane, building, greenhouse, silo, swimming pool\n"
        "- Spatial patterns: e.g., grid layout, circular pattern, dense clustering, scattered distribution\n"
        "- Materials & textures: e.g., concrete, asphalt, metal roof, bare soil, vegetation\n"
        "- Colors & spectral cues: e.g., green vegetation, blue water, gray roof, red soil (only if explicitly mentioned)\n\n"

        "DO NOT include:\n"
        "- Abstract terms: 'area', 'region', 'scene', 'presence', 'visible', 'clear'\n"
        "- Verbs or actions: 'add', 'remove', 'change'\n"
        "- Redundant modifiers: 'large', 'small', 'many' (unless part of a fixed term like 'small building')\n"
        "- General words with low discriminability\n\n"

        "Output as a JSON list of unique keywords, lowercase, no duplicates: [\"keyword1\", \"keyword2\", ...]\n\n"

        f"Instructions:\n{instructions_text}"
    )

    completion = client.chat.completions.create(
        model="",
        messages=[
            {"role": "system", "content": "Extract visual keywords for remote sensing cross-modal alignment."},
            {"role": "user", "content": prompt}
        ]
    )

    # No JSON response format is requested above, so the reply is free-form
    # text and has to be parsed defensively rather than with a bare
    # json.loads that would abort the caller on malformed output.
    return _parse_keyword_list(completion.choices[0].message.content)

# ---------------------------------------------------------------------------
# Script body: for every training image, run five rounds of
# question -> answer -> caption reconstruction per caption and append one
# record per round to each output file.
# ---------------------------------------------------------------------------
with open('RS/dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Optional processing window. With an empty ``start_from_image`` processing
# begins at the first training image; with an empty ``end_at_image`` it
# runs to the end of the dataset.
start_from_image = ""
end_at_image = ""
started = not start_from_image

for image_data in data['images']:
    if image_data['split'] != "train":
        continue

    img_path = image_data['filename']

    # Skip forward until the configured starting image is reached.
    if not started:
        if img_path != start_from_image:
            continue
        started = True
        print(f"begin with: {img_path} ...")

    # The end image itself is not processed.
    if end_at_image and img_path == end_at_image:
        print(f"end with {img_path} ...")
        break

    multi_scale_desc = generate_multi_scale_caption(img_path)

    for sentence in image_data['sentences']:
        original_caption = sentence['raw']
        # The helper functions read index 0 of the history as the caption
        # and every later item as one "question? answer" turn.
        dialogue_history = [original_caption]
        # The first retrieval round has no reconstructed caption yet, so it
        # starts from the original one.
        current_caption = original_caption

        for round_idx in range(5):
            # NOTE(review): ``top_k`` is not defined or imported anywhere in
            # this file; it must be provided before the script can run.
            candidates = top_k(current_caption, k=3)

            q = generate_questions_referring(dialogue_history, candidates)

            a = generate_answer(img_path, q)

            dialogue_history.append(f"{q} {a}")

            new_caption = reconstruct_dialog(dialogue_history)
            current_caption = new_caption

            instructions = generate_atomic_instructions(dialogue_history)
            keywords = extract_keywords_from_instructions(instructions)

            record = {
                "img": img_path,
                "original_caption": original_caption,
                "multi_scale_desc": multi_scale_desc,
                "round": round_idx + 1,
                # Copy so later rounds do not mutate the stored history.
                "dialogue_history": dialogue_history.copy(),
                "reconstructed_caption": new_caption,
                "atomic_instructions": instructions,
                "keywords": keywords
            }

            # NOTE(review): the three ``*_file`` names below are not defined
            # anywhere in this file; they must be set to output paths
            # before the script can run. ``key`` is currently unused -- the
            # complete record is appended to every file.
            for file_path, key in [
                (updated_description_file, "dialogue_history"),
                (reconstructed_caption_file, "reconstructed_caption"),
                (far_instructions_file, "atomic_instructions")
            ]:
                try:
                    if os.path.exists(file_path):
                        with open(file_path, 'r', encoding='utf-8') as f:
                            records = json.load(f)
                    else:
                        records = []
                    records.append(record)
                    with open(file_path, 'w', encoding='utf-8') as f:
                        json.dump(records, f, ensure_ascii=False, indent=2)
                except (OSError, ValueError, TypeError, AttributeError) as e:
                    # Saving is best-effort: report the failure and keep
                    # processing so one bad file does not end the run.
                    print(f"Save error: {e}")