"""Regenerate train/val/test loadertxt files by filtering only existing images"""
__author__ = 'XYZ'

import os
from pathlib import Path
from tqdm import tqdm
from datetime import datetime

from .core._log_ import logger
## module-level logger shared by every function below; built from this file's path
log = logger(__file__)

from .core.fwo import write_json
from .dataset import get_splits


def _validate_and_reindex(split_file, images_root, out_file):
  """
  Rewrite a split loadertxt file so it lists only images that exist on disk.

  Input lines are whitespace separated: "<id> <relative_path> [<label>]".
  Blank lines, comment lines (first non-blank character is "#") and lines
  with fewer than two tokens are skipped. The original id is discarded;
  surviving entries are renumbered from 0 and written to `out_file` as
  tab-separated "<idx> <relative_path> <label>" rows. The label defaults
  to "0" when the input line carries none.

  Args:
    split_file: path of the loadertxt file to read.
    images_root: directory each relative image path is joined onto.
    out_file: path of the regenerated loadertxt file (overwritten).

  Returns:
    dict with "split_file", "out_file" and the counts "total" (parsed
    entries), "valid" (entries whose path exists) and "dropped".
  """
  with open(split_file, 'r') as f:
    ## strip *before* testing: raw lines still carry their newline, so blank
    ## lines and indented "# ..." comments would otherwise be parsed as entries
    stripped = (raw.strip() for raw in f)
    lines = [line for line in stripped if line and not line.startswith("#")]

  total = 0
  valid_lines = []
  for line in tqdm(lines, desc=f"checking {Path(split_file).stem}"):
    toks = line.split()
    if len(toks) < 2:
      continue

    ## format: "<id> <relative_path> <label>"; the old id is not reused
    rel = toks[1]
    label = toks[2] if len(toks) > 2 else "0"

    ## only a running count is needed for the summary, not the entries themselves
    total += 1
    if os.path.exists(os.path.join(images_root, rel)):
      valid_lines.append((rel, label))

  ## Write with new indices
  with open(out_file, "w") as f:
    f.writelines(
      f"{idx}\t{rel}\t{label}\n" for idx, (rel, label) in enumerate(valid_lines)
    )

  log.info(f"[regen_loadertxt] {split_file} → {out_file}, "
           f"kept {len(valid_lines)}/{total} lines.")

  return {
    "split_file": split_file,
    "out_file": out_file,
    "total": total,
    "valid": len(valid_lines),
    "dropped": total - len(valid_lines),
  }


def process_splits(dataset, datasetcfg, splits, from_path, to_path, __dataset_root__):
  """
  Regenerate the loadertxt file of every requested split and save a summary.

  Args:
    dataset: dataset key, forwarded to `get_splits` and used in the summary.
    datasetcfg: dataset config path, forwarded to `get_splits`.
    splits: iterable of split names; each is looked up as "<split>loadertxt".
    from_path: images root override; when None the "loadertxt" entry of the
      splits config is used instead.
    to_path: output directory (created if missing) for the regenerated
      files and the "<dataset>.regen.summary.json" summary.
    __dataset_root__: dataset root, forwarded to `get_splits`.

  Returns:
    The summary dict that was written to disk: dataset, timestamp and the
    per-split results keyed by split name.
  """
  os.makedirs(to_path, exist_ok=True)
  cfg = get_splits(dataset, datasetcfg, __dataset_root__)

  stamp = datetime.now().strftime("%d%m%y_%H%M%S")
  per_split = {}
  for split in splits:
    cfg_key = f"{split}loadertxt"
    if cfg_key in cfg:
      source_txt = cfg[cfg_key]
      ## an explicit override wins; the config is only consulted without one
      root = from_path if from_path is not None else cfg["loadertxt"]
      ## regenerated file keeps the source file's name, inside to_path
      target_txt = os.path.join(to_path, Path(source_txt).name)
      per_split[split] = _validate_and_reindex(source_txt, root, target_txt)
    else:
      log.warning(f"Split {split} not found in dataset config.")

  summary = {"dataset": dataset, "timestamp": stamp, "splits": per_split}
  write_json(os.path.join(to_path, f"{dataset}.regen.summary.json"), summary)
  return summary


def main(args):
  """
  Run the regeneration for the parsed command-line arguments.

  The dataset root is read from the `__DATASET_ROOT__` environment variable
  (None when unset) and passed through to `process_splits` together with
  the dataset, datasetcfg, splits, from_path and to_path attributes of `args`.

  Returns:
    The summary dict produced by `process_splits`.
  """
  dataset_root = os.environ.get("__DATASET_ROOT__")

  ## forward the CLI fields under the same names process_splits expects
  forwarded = ("dataset", "datasetcfg", "splits", "from_path", "to_path")
  call_kwargs = {field: getattr(args, field) for field in forwarded}
  call_kwargs["__dataset_root__"] = dataset_root

  result = process_splits(**call_kwargs)
  log.info(f"Summary saved to {args.to_path}")
  return result


def parse_args(**kwargs):
  """
  Build the command-line parser and parse the process arguments.

  `kwargs` is accepted but not used. Required options are --dataset and
  --datasetcfg; --splits defaults to train/val/test; --from (stored as
  `from_path`) has no default; --to (stored as `to_path`) defaults to
  "logs/regen_loadertxt-<timestamp>", stamped with the current local time.

  Returns:
    The argparse namespace holding the parsed options.
  """
  from argparse import ArgumentParser

  ## default destination is computed up front, once per invocation
  timestamp = datetime.now().strftime("%d%m%y_%H%M%S")
  fallback_dest = os.path.join("logs", f"regen_loadertxt-{timestamp}")

  cli = ArgumentParser(description="Regenerate valid loadertxt files")

  ## dataset selection
  required_opts = (
    ("--dataset", "Dataset key in config"),
    ("--datasetcfg", "Dataset config .yml path"),
  )
  for flag, blurb in required_opts:
    cli.add_argument(flag, required=True, type=str, help=blurb)
  cli.add_argument("--splits", nargs="+", default=["train", "val", "test"])

  ## filesystem locations
  cli.add_argument(
    "--from",
    dest="from_path",
    type=str,
    help="Override images root (default: from dataset config)",
  )
  cli.add_argument(
    "--to",
    dest="to_path",
    type=str,
    default=fallback_dest,
    help="Destination directory for regenerated loadertxt files",
  )

  return cli.parse_args()


def print_args(args):
  """Log an "Arguments:" header followed by one "name: value" line per attribute of `args`."""
  log.info("Arguments:")
  for name in vars(args).keys():
    value = getattr(args, name)
    log.info(f"{name}: {value}")


## Script entry point: parse the CLI, log the parsed values, then run.
## NOTE: this module uses relative imports, so it has to be launched as a
## package module (python -m <package>.<module>); running the file directly
## fails at import time.
if __name__ == "__main__":
  args = parse_args()
  print_args(args)
  main(args)
