import os
import sys
import argparse
import json
import subprocess
import multiprocessing
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from tqdm import tqdm
import shutil
import re
from pathlib import Path

# Add the parent directory to sys.path to enable imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import (
    start_docker_containers, 
    stop_docker_containers,
    create_docker_compose_file,
    convert_windows_path_to_linux,
    llm_generation,
)

# Prompt template sent to the LLM by generate_db_tests(). It is filled in with
# str.format(instruction=...), so the literal braces of the JSON example are
# doubled ("{{" / "}}") and "{instruction}" is the only substituted field.
# NOTE(review): the requirements mention "the test cases", but the template has
# no placeholder for test cases -- only the instruction is ever provided.
db_test_prompt = """You are an expert in understanding website generation instructions. You will be given a website generation instruction. Your task is to generate a list of data structures that should be present in the database to satisfy the requirements of the website generation instruction.

Requirements:
- A data structure should be presented as a natural language description of the table. The table description should describe what purpose the table serves.
- You should NEVER specify any names for the data structures. Instead, only describe their purpose and what data they should hold.
- You should ONLY consider the data structures that are absolutely necessary to fulfill the requirements of the website generation instruction and the test cases. If a data structure is not necessary, do not include it in your output.
- You should first think step by step about what data structures are needed to fulfill the requirements of the website generation instruction and the test cases. Output your thinking process. Then, you should output the list of data structures in JSON format as shown in the example below.

Example:

Instruction: Create a blogging platform where users can register, create profiles, write blog posts, and comment on posts.

Thinking Process:
[several lines of reasoning about what data structures are needed]

Data Structures:
```json
{{
    "data_structures": [
        "a table that stores user information",
        "a table that stores blog posts",
        "a table that stores comments on blog posts"
    ]
}}
```

Now, based on the above requirements and example, please generate the list of data structures for the following instruction:

Instruction: {instruction}"""

# ----------------------------- #
# Helpers for JSONL I/O
# ----------------------------- #
def load_jsonl(path: str):
    """Return a list with one parsed JSON value per non-blank line of the file.

    Args:
        path: Location of a UTF-8 encoded JSONL file.

    Returns:
        The decoded records, in file order. Blank or whitespace-only lines
        (for example a trailing empty line) are skipped instead of aborting
        the whole load.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON; the
            message is prefixed with ``path:line_number`` to locate the row.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue  # tolerate empty lines between/after records
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type as before, but pointing at the bad row.
                raise json.JSONDecodeError(
                    f"{path}:{line_no}: {err.msg}", err.doc, err.pos
                ) from err
    return records

def dump_jsonl(objs, path: str):
    """Serialize each item of *objs* as one JSON line in the file at *path*.

    The file is opened for writing as UTF-8 (replacing any existing content),
    and non-ASCII characters are written as-is rather than escaped.
    """
    with open(path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in objs
        )

# ----------------------------- #
# Parsing the model output
# ----------------------------- #
# Matches the payload of a fenced ```json ... ``` block (DOTALL, non-greedy).
_JSON_BLOCK_RE = re.compile(r"```json(.*?)```", re.S)


def extract_data_structures(raw_text: str):
    """Extract the ``data_structures`` list from the model's raw response.

    Candidates are tried in order until one decodes to a JSON object whose
    ``data_structures`` value is a list:

    1. every fenced ```json ... ``` block, in order of appearance;
    2. the whole response, in case it is bare JSON;
    3. the widest ``{ ... }`` span found anywhere in the response.

    Args:
        raw_text: The text returned by the model. Non-string or empty input
            is treated as "nothing to parse".

    Returns:
        The list stored under ``data_structures``, or ``[]`` if no candidate
        yields one. This function never raises on malformed model output.
    """
    if not isinstance(raw_text, str) or not raw_text.strip():
        return []

    candidates = [block.strip() for block in _JSON_BLOCK_RE.findall(raw_text)]
    candidates.append(raw_text)
    brace_span = re.search(r"\{.*\}", raw_text, re.S)
    if brace_span:
        candidates.append(brace_span.group())

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue  # not JSON; try the next candidate
        # Valid JSON is not necessarily an object (could be a list, string,
        # number...), so check before using dict methods on it.
        if not isinstance(parsed, dict):
            continue
        structures = parsed.get("data_structures")
        if isinstance(structures, list):
            return structures

    return []  # give up

# ----------------------------- #
# Core generation wrapper
# ----------------------------- #
def generate_db_tests(instruction: str, num_tests: int = 5):
    """Ask the LLM which data structures one instruction requires.

    The instruction is substituted into ``db_test_prompt`` and sent as a
    single user message; the ``content`` field of the response is then parsed
    with ``extract_data_structures``.

    Args:
        instruction: The website generation instruction to analyse.
        num_tests: Accepted for interface compatibility; not used by the body.

    Returns:
        Whatever ``extract_data_structures`` returns for the response content.
    """
    chat = [
        {
            "role": "user",
            "content": db_test_prompt.format(instruction=instruction),
        }
    ]
    reply = llm_generation(chat, model="Qwen3-Coder-480B-A35B-Instruct-FP8")
    return extract_data_structures(reply.get("content", ""))

# ----------------------------- #
# Per-sample worker
# ----------------------------- #
def process_sample(sample):
    """Attach generated data structures to one sample and return it.

    The sample dict is modified in place: its ``data_structures`` key is set
    to the result of ``generate_db_tests`` for the sample's ``instruction``
    (an empty string is used when that key is absent). All other keys are
    left as they were.
    """
    sample["data_structures"] = generate_db_tests(sample.get("instruction", ""))
    return sample

# ----------------------------- #
# Main routine
# ----------------------------- #
def main(
    input_path: str = "datasets/WebGen-Bench.jsonl",
    output_path: str = "src/generate_fullstack_tests/test_db_model-gen.json",
):
    """Generate data-structure descriptions for every sample and save them.

    Samples are loaded from *input_path*, processed concurrently with
    ``process_sample`` in a thread pool, and written to *output_path* in the
    same order as the input file.

    Args:
        input_path: JSONL file holding the benchmark samples.
        output_path: Destination file. Its content is written by
            ``dump_jsonl`` (one JSON object per line) even though the default
            name ends in ``.json``; missing parent directories are created.
    """
    test_datas = load_jsonl(input_path)

    # One slot per input row. as_completed() yields futures in completion
    # order, so each result is stored at its original index. This preserves
    # the input ordering without relying on every sample having a sortable,
    # uniformly-typed "id".
    results = [None] * len(test_datas)
    max_workers = min(32, multiprocessing.cpu_count() * 2)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_sample, item): i for i, item in enumerate(test_datas)}
        for fut in tqdm(as_completed(futures), total=len(test_datas), desc="Generating DB models"):
            idx = futures[fut]
            try:
                results[idx] = fut.result()
            except Exception as e:
                # If a task crashes, still save its sample with empty data_structures
                faulty = test_datas[idx]
                faulty["data_structures"] = []
                results[idx] = faulty
                print(f"[WARN] Generation failed for id={faulty.get('id')}: {e}")

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    dump_jsonl(results, output_path)
    print(f"Saved {len(results)} items to {output_path}")

# ----------------------------- #
# Run the generation pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()