#%%
import os
import json
import cv2
import numpy as np
from PIL import Image

# Config
json_path = "downloads/cocotext.v2.json"
img_dir = "downloads/train2014"
# (width, height). Used only to rescale annotation areas for the size filter;
# the saved images themselves are not resized.
resize_size = (384, 384)
# Minimum annotation area, measured after rescaling to `resize_size`.
area_threshold = 1500
output_dir = f"cocotext_bboxes_area_2_{area_threshold}"
os.makedirs(output_dir, exist_ok=True)
image_output_dir = os.path.join(output_dir, "image")
os.makedirs(image_output_dir, exist_ok=True)

# Load annotations. The encoding is given explicitly so that non-ASCII text in
# the annotation file does not depend on the platform's default encoding.
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Keep only annotations that are usable as (image, word) pairs.
filtered_anns = [
    ann for ann in data["anns"].values()
    if (
        ann["legibility"] == "legible"                      # legible text
        and ann["language"] == "english"                    # English only
        and ann.get("area", 0) > 0                          # positive area
        and ann.get("utf8_string", "").strip() != ""        # non-empty utf8_string
    )
]

# One entry per exported image: output path, transcribed word, rescaled area.
records = []

for ann in filtered_anns:
    image_id = ann["image_id"]
    image_file = f"COCO_train2014_{image_id:012d}.jpg"
    image_path = os.path.join(img_dir, image_file)
    if not os.path.exists(image_path):
        continue

    # Only the image dimensions are needed here, to express the annotation
    # area at the `resize_size` resolution.
    try:
        with Image.open(image_path) as img:
            w_orig, h_orig = img.size
            scale_w = resize_size[0] / w_orig
            scale_h = resize_size[1] / h_orig
    except (OSError, ValueError, ZeroDivisionError):
        # Unreadable or degenerate image: skip this annotation (best effort).
        # Deliberately not a bare `except`, so Ctrl-C still stops the run.
        continue

    scaled_area = ann["area"] * scale_w * scale_h
    if scaled_area < area_threshold:
        continue

    # The bbox is treated as [x, y, width, height] and drawn on the
    # original-resolution image, so it is used unscaled.
    x, y, w, h = map(int, ann["bbox"])
    word = ann.get("utf8_string", "").strip()

    img_cv = cv2.imread(image_path)
    if img_cv is None:
        continue

    # Expand bbox a bit to avoid blocking the text, clamped to the image.
    pad = 2
    x1, y1 = max(x - pad, 0), max(y - pad, 0)
    x2, y2 = min(x + w + pad, img_cv.shape[1] - 1), min(y + h + pad, img_cv.shape[0] - 1)

    # The image is read fresh for every annotation, so drawing in place is
    # safe and no copy is needed.
    annotated_img = img_cv
    cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 0, 255), thickness=2)

    output_img_path = os.path.join(image_output_dir, f"{image_id}_{ann['id']}.jpg")
    # imwrite reports failure through its return value; do not record an
    # image that was never written.
    if not cv2.imwrite(output_img_path, annotated_img):
        continue

    records.append({
        "image_path": output_img_path,
        "word": word,
        "area": round(scaled_area, 2)
    })

# Write all records as a single JSON array.
with open(os.path.join(output_dir, "metadata_no_none.json"), "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2)

print(f"Total qualifying annotations (area >= {area_threshold}): {len(records)}")

#%%
import json
import os
import uuid
import shutil
import random

# Where the annotated crops and their metadata come from, and where the
# train/val dataset is written.
input_json_path = "cocotext_bboxes_area_1500/metadata_no_none.json"
source_image_root = "cocotext_bboxes_area_1500/image"
output_root = "coco_text"
random.seed(42)  # fixed seed so the shuffle below is repeatable

train_img_dir = os.path.join(output_root, "train/images")
val_img_dir = os.path.join(output_root, "val/images")
for split_dir in (train_img_dir, val_img_dir):
    os.makedirs(split_dir, exist_ok=True)

with open(input_json_path, "r") as fh:
    all_data = json.load(fh)

# Shuffle in place, then take the first 80% for training and the rest for
# validation.
random.shuffle(all_data)
split_index = int(0.8 * len(all_data))
train_data, val_data = all_data[:split_index], all_data[split_index:]

train_json, val_json, val_ans_json = [], [], []
missing_images = []

# Training split: one conversation-style entry per copied image.
for record in train_data:
    src_path = os.path.abspath(record["image_path"])
    sample_id = str(uuid.uuid4())
    file_name = sample_id + ".jpg"

    if not os.path.exists(src_path):
        missing_images.append(src_path)
        continue

    shutil.copy(src_path, os.path.join(train_img_dir, file_name))
    question_turn = {
        "from": "human",
        "value": "<image>\nwhat is written in the red bounding box in the image?",
    }
    answer_turn = {"from": "gpt", "value": record["word"]}
    train_json.append({
        "id": sample_id,
        "image": f"coco_text/train/images/{file_name}",
        "conversations": [question_turn, answer_turn],
    })

# Validation split: the question and its reference answer go to separate
# lists that share the same question_id.
for record in val_data:
    src_path = os.path.abspath(record["image_path"])
    sample_id = str(uuid.uuid4())
    file_name = sample_id + ".jpg"

    if not os.path.exists(src_path):
        missing_images.append(src_path)
        continue

    shutil.copy(src_path, os.path.join(val_img_dir, file_name))
    val_json.append({
        "question_id": sample_id,
        "image": file_name,
        "category": "default",
        "text": "<image>\nwhat is written in the red bounding box in the image?",
        "id": sample_id,
    })
    val_ans_json.append({
        "question_id": sample_id,
        "prompt": "<image>\nwhat is written in the red bounding box in the image?",
        "text": record["word"],
        "answer_id": None,
        "model_id": None,
        "metadata": {},
    })

# Dump each list to its file under the dataset root.
for rel_path, payload in (
    ("train/train.json", train_json),
    ("val/val.json", val_json),
    ("val/val_ans.json", val_ans_json),
):
    with open(os.path.join(output_root, rel_path), "w") as fh:
        json.dump(payload, fh, indent=2)

# Record source images that could not be found, one path per line.
if missing_images:
    with open("missing_images.txt", "w") as fh:
        fh.writelines(path + "\n" for path in missing_images)


#%%
import json

# `os.path.basename` is used below; import `os` here so this cell also runs
# on its own in a fresh session instead of relying on an earlier cell.
import os

# Compare two metadata files by image file name and keep the records that
# appear only in file1.
file2 = "cocotext_bboxes_area_1500/metadata.json"
file1 = "cocotext_bboxes_area_2_1500/metadata_no_none.json"

with open(file1, "r", encoding="utf-8") as f:
    data1 = json.load(f)     # list[dict]

with open(file2, "r", encoding="utf-8") as f:
    data2 = json.load(f)

# Records are matched on the base name of "image_path", so the two files may
# live in different directories.
filenames_2 = {os.path.basename(item["image_path"]) for item in data2}

unique_to_file1 = [
    item for item in data1
    if os.path.basename(item["image_path"]) not in filenames_2
]

# The output holds the full records (not just the file names) unique to file1.
output_file = "unique_image_filenames_in_file1.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(unique_to_file1, f, ensure_ascii=False, indent=2)
