"""Convert the original TSPLIB files into our numpy dataset format.

Only the instances with cities's explicit coordinates and with euclidean distances are used. I made
sure the final instances are the same as the ones specified in the paper from
[Fang et al., 2024](https://proceedings.mlr.press/v235/fang24c.html).
"""

from pathlib import Path

import numpy as np


def extract_name(content: str) -> str:
    start = content.find("NAME : ") + len("NAME : ")
    end = content.find("\n", start)
    name = content[start:end]
    name = name.split(".tsp")[0]
    return name


def extract_coords(content: str) -> np.ndarray:
    start = content.find("NODE_COORD_SECTION\n") + len("NODE_COORD_SECTION\n")
    end = content.find("EOF", start)
    coords = content[start:end]
    coords = coords.strip().split("\n")
    coords = [[c_ for c_ in c.split(" ") if c_ != ""] for c in coords]
    coords = [(float(c1.strip()), float(c2.strip())) for _, c1, c2 in coords]
    coords = np.array(coords)
    return coords


def read_instance(filepath: Path) -> tuple[str, np.ndarray]:
    content = filepath.read_text()
    name = extract_name(content)
    coords = extract_coords(content)
    return name, coords


def read_solutions(filepath: Path) -> dict[str, np.ndarray]:
    content = filepath.read_text()
    solutions = content.strip().split("\n")
    solutions = [sol.strip().split(" : ") for sol in solutions]
    solutions = {name: np.array(float(value)) for name, value in solutions}
    return solutions


def save_instances(
    instances: dict[str, tuple[np.ndarray, np.ndarray]],
    directory: Path,
):
    directory.mkdir(parents=False, exist_ok=True)

    for name, (coords, value) in instances.items():
        np.savez_compressed(
            f"{directory / name}.npz",
            coords=coords[None],
            tour_lens=value[None],
            allow_pickle=False,
        )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--instances", type=Path, help="Directory containing the TSPLIB instances", required=True)
    parser.add_argument("--solutions", type=Path, help="File containing all solutions values", required=True)
    parser.add_argument("--output-dir", type=Path, help="Where to save the numpy instances", required=True)
    args = parser.parse_args()

    instances = dict()
    solutions = read_solutions(args.solutions)
    for filepath in args.instances.glob("*.tsp"):
        if "EDGE_WEIGHT_SECTION" in filepath.read_text():
            print(f"Ignoring instance {filepath}: edge weights")
            continue

        if "EUC_2D" not in filepath.read_text():
            print(f"Ignoring instance {filepath}: not EUC_2D distances")
            continue

        name, coords = read_instance(filepath)
        instances[name] = (coords, solutions[name])

    save_instances(instances, args.output_dir)
