# Copyright 2024. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example usage:
accelerate launch \
    --config_file=deepspeed_zero2.yaml \
    train_video_llm.py \
    --dataset_name mfarre/simplevideoshorts \
    --model_name_or_path Qwen/Qwen2-VL-7B-Instruct \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --output_dir video-llm-output \
    --bf16 \
    --torch_dtype bfloat16 \
    --gradient_checkpointing
"""
import numpy as np
import os
import json
import random
import requests
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForVision2Seq,
    AutoProcessor,
    BitsAndBytesConfig,
    Qwen2VLProcessor,
    Qwen2VLForConditionalGeneration,
    Qwen2_5_VLForConditionalGeneration
)
from trl import (
    ModelConfig,
    ScriptArguments,
    SFTConfig,
    SFTTrainer,
    TrlParser,
    get_kbit_device_map,
    get_peft_config,
)
from accelerate import Accelerator
from qwen_vl_utils import process_vision_info

from datasets import Dataset, DatasetDict
from trainer import SFTVLATrainer
from action_head import RegressionActionHead#, init_module
import wandb
import re
from typing import List, Dict, Any

def get_current_device():
    """Get the current device. For GPU we return the local process index to enable multiple GPU training."""
    return Accelerator().local_process_index if torch.cuda.is_available() else "cpu"

def download_video(url: str, folder: str = '/tmp/videos/') -> str:
    """Download video if not already present locally."""
    filename = url.split("/")[-1]
    local_path = os.path.join(folder, filename)

    if os.path.exists(local_path):
        return local_path

    try:
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(local_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        return local_path
    except requests.RequestException as e:
        raise Exception(f"Failed to download video: {e}")

def prepare_dataset(example: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
    """Prepare dataset example for training."""

    # import pdb; pdb.set_trace()

    system_message = "You are a helpful assistant"
    
    
    QUESTION_TEMPLATE = (
        "{Question}\n"
        "Please think deeply. "
        "Engage in an internal dialogue other natural language thought expressions "
        "It's a reasoning process. "
        "Provide your reasoning between the <think> </think> tags, and then give your answer between the <answer> </answer> tags."
    )

    TYPE_TEMPLATE = {
        "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
        "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
        "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
        "free-form": " Please provide your text answer within the <answer> </answer> tags.",
        "action-detection": " Please provide your text answer within the <answer> </answer> tags.",
        "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
        "driving":" Predicted future movement details for the next 5 seconds (sampled at 0.5-second intervals), including BEV location in x and y directions (in meters). Positive x means forward direction while positive y means leftwards. The output is formatted as [x, y]."
    }



    
    # if example["problem_type"] == 'multiple choice':
    #     question = example['problem'] + "Options:\n"
    #     for op in example["options"]:
    #         question += op + "\n"
    # else:
    #     question = example['problem']
    question = example["messages"][0]["content"]

    # process_driving should diff in each frame
    process_driving = str("<think>\nLet me think. To rephrase the question in a way that requires Chain-of-Thought reasoning with numerical or mathematical expressions, we should break down the prediction of future waypoints into smaller steps, starting from understanding the provided data and applying relevant physics equations. \n\nThe original question asks for predicting the future waypoints directly from the given vehicle status, but let's derive the waypoints through intermediate calculations. \n \nOh, I see. The question now needs to be framed in such a way that the responder understands they need.\n</think>")
    # pattern = r'\[(-?\d+\.\d+),\s*(-?\d+\.\d+)\]'
    # matches = re.findall(pattern, example["messages"][1]["content"])
    # solution = [(float(x), float(y)) for x, y in matches]
    # solution_driving = matches
    # solution_slice = example["messages"][1]["content"][-199:] 
    solution_slice = example["messages"][1]["content"].split("The output is formatted as [x, y]: ")[1]
    solution_slice_points = solution_slice.split("</PLANNING>")[0]
    
    # for check
    # import pdb; pdb.set_trace()
    if solution_slice_points.startswith("[") == False:
        print("************  the solution_slice_points not start with [ .*********************")
        return
    if solution_slice_points.endswith("]") == False:
        print("************  the solution_slice_points not end with ] .*********************")
        return
    pattern = r'\[(-?\d+\.\d+),\s*(-?\d+\.\d+)\]'
    matches = re.findall(pattern, solution_slice_points)
    x_y_num =np.array( [(float(x), float(y)) for x, y in matches])
    # print("str = ",solution_slice_points)
    # print("num = ",x_y_num)
    # import pdb; pdb.set_trace()
    if x_y_num.shape != (10,2) :
        print("************  the x_y_num  shape is incorrect.*********************")
        return
    # 替换x, y 为 <|repo_name|>  151663
    solution_slice_points = '<|repo_name|>'*20
    
    ## all
    max_ =np.array([[39.29, 13.83],
       [45.04, 18.37],
       [56.07, 17.71],
       [65.96, 23.66],
       [76.27, 31.19],
       [84.92, 39.04],
       [88.06, 47.63],
       [85.55, 54.28],
       [79.69, 60.4 ],
       [88.26, 71.45]])
    min_ = np.array([[-7.0000e-02, -8.3300e+00],
       [-1.2000e-01, -9.0100e+00],
       [-1.1000e-01, -1.2490e+01],
       [-7.6125e+02, -1.7390e+01],
       [-7.4843e+02, -2.3930e+01],
       [-7.3502e+02, -3.1000e+01],
       [-7.2195e+02, -3.2210e+01],
       [-7.0962e+02, -4.0350e+01],
       [-1.3500e+00, -4.8860e+01],
       [-1.3100e+00, -6.0780e+01]])
    ### 归一化
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image", #example['data_type'],
                
                    "image": "./data/all_vla/" + example['images'][0]
           
                },
                {
                    "type": "text",
                    "text": QUESTION_TEMPLATE.format(Question=question) +TYPE_TEMPLATE["driving"]  #TYPE_TEMPLATE[example['problem_type']]
                }
            ]
        },
        
        {
            "role": "assistant",
            # "content": [{"type": "text", "text": example['process'] + "\n" + example['solution']}]
            "content": [{"type": "text", "text": process_driving + "\n" +"<answer>" + solution_slice_points + "</answer>"}]
        },
        # {"actions":x_y_num/35.0}
        # {"actions":(x_y_num - mean_)/std_}
        {"actions":(x_y_num - min_)/(max_ - min_ + 1e-6)}
    ]
    # import pdb; pdb.set_trace()
    
    return {"messages": messages}

def collate_fn(examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
    """Collate batch of examples for training."""
    texts = []
    # video_inputs = []
    # image_inputs = []

    for i, example in enumerate(examples):
        try:
            # import pdb; pdb.set_trace()
            action_dict = example["messages"].pop()
            # print("============",action_dict)
            action_values = action_dict['actions']
            if action_values.shape != (10,2) :
                print("************  the action_values  shape is incorrect.*********************")
                return

            texts.append(processor.apply_chat_template(example["messages"], tokenize=False))
            image_inputs, video_inputs, video_kwargs = process_vision_info(example["messages"], return_video_kwargs=True)

            # example["messages"]["actions"]= action_values
            example["messages"].append({"actions":action_values})
            # import pdb; pdb.set_trace()
            
            
        except Exception as e:
            raise ValueError(f"Failed to process example {i}: {e}")

    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        return_tensors="pt",
        padding=True
    )

    labels = inputs["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    # import pdb; pdb.set_trace()
    label_action_mask = labels == 151663

    # Handle visual tokens based on processor type
    visual_tokens = [151652, 151653, 151656] if isinstance(processor, Qwen2VLProcessor) else [
        processor.tokenizer.convert_tokens_to_ids(processor.image_token)
    ]

    for visual_token_id in visual_tokens:
        labels[labels == visual_token_id] = -100

    inputs["labels"] = labels
    # inputs["return_dict"]=True
    inputs["output_hidden_states"]=True
    inputs["label_action_mask"]=label_action_mask
    inputs["actions"]=action_values
    return inputs

if __name__ == "__main__":
    # Parse arguments
    # import pdb; pdb.set_trace()
    parser = TrlParser((ScriptArguments, SFTConfig, ModelConfig))
    script_args, training_args, model_config = parser.parse_args_and_config()
    
    # Configure training args
    training_args.gradient_checkpointing_kwargs = dict(use_reentrant=False)
    training_args.remove_unused_columns = False
    training_args.dataset_kwargs = {"skip_prepare_dataset": True}

    # Load dataset
    # import pdb; pdb.set_trace()
    if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
        dataset =  DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
    else:
        # Load the dataset
        dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)

    # Setup model
    torch_dtype = (
        model_config.torch_dtype
        if model_config.torch_dtype in ["auto", None]
        else getattr(torch, model_config.torch_dtype)
    )

    # # Quantization configuration for 4-bit training
    # bnb_config = BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     bnb_4bit_use_double_quant=True,
    #     bnb_4bit_quant_type="nf4",
    #     bnb_4bit_compute_dtype=torch.bfloat16
    # )

    # Model initialization
    model_kwargs = dict(
        revision=model_config.model_revision,
        trust_remote_code=model_config.trust_remote_code,
        torch_dtype=torch_dtype,
        device_map=get_kbit_device_map(),
        # quantization_config=bnb_config,
    )
    
    # import pdb; pdb.set_trace()
    if "Qwen2-VL" in model_config.model_name_or_path:
        model = Qwen2VLForConditionalGeneration.from_pretrained(model_config.model_name_or_path, **model_kwargs)
    elif "Qwen2.5-VL" in model_config.model_name_or_path:
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_config.model_name_or_path, **model_kwargs)
    else:
        model = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)

    processor = AutoProcessor.from_pretrained(
        model_config.model_name_or_path,
        trust_remote_code=model_config.trust_remote_code
    )

    # add action head
    # if cfg.use_l1_regression:
    # action_head = init_module(
    #     L1RegressionActionHead,
    #     "action_head",
    #     cfg,
    #     device_id,
    #     {"input_dim": vla.module.llm_dim, "hidden_dim": vla.module.llm_dim, "action_dim": ACTION_DIM},
    #     to_bf16=True,
    # )
    action_head = RegressionActionHead(input_dim=3584,
        hidden_dim=3584,
        action_dim=2)
    model.action_head = action_head

    # Prepare dataset
    prepared_dataset = [prepare_dataset(example) for example in dataset['train']]

    # Initialize wandb if specified
    if training_args.report_to == "wandb":
        wandb.init(project="video-llm-training")
    # import pdb; pdb.set_trace()
    # Initialize trainer
    trainer = SFTVLATrainer(
        model=model,
        args=training_args,
        train_dataset=prepared_dataset,
        data_collator=collate_fn,
        peft_config=get_peft_config(model_config),
        # tokenizer=processor.tokenizer
    )

    # Train model
    trainer.train()

    # Save final model

    trainer.save_model(training_args.output_dir)
    processor.save_pretrained(training_args.output_dir)

    if trainer.accelerator.is_main_process:
        # Restore k,v cache for fast inference
        trainer.model.config.use_cache = True
        trainer.model.config.save_pretrained(training_args.output_dir)

    # Cleanup
    del model
    del trainer
    torch.cuda.empty_cache()
    wandb.finish()
