import argparse
import json
import logging
from pathlib import Path
from typing import Optional
"""
remove_useless.py
Read sequence_numbers.jsonl (one JSON object per line with a "sequence" field)
and remove the files named <sequence>.h5 from the data/water directory.
Runs as a dry run (nothing is deleted) unless --no-dry-run is passed.
Defaults (can be overridden with CLI args):
    jsonl: /workspaces/Jeff/Isaac-GR00T/sequence_numbers.jsonl
    data_dir: /workspaces/Jeff/Isaac-GR00T/data/water
    ext: .h5
"""
# Configure the root logger at import time: INFO and above, printed as "LEVEL: message".
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
def load_sequences(jsonl_path: Path) -> list[str]:
        """Read the "sequence" value from every line of a JSONL file.

        Blank lines are ignored. A line is skipped with a warning (naming its
        1-based line number) when it is not valid JSON, when it is valid JSON
        but not an object, or when the object has no "sequence" key or a null
        value for it.

        Args:
            jsonl_path: Path of the JSONL file to read.

        Returns:
            The sequence values in file order, each converted with ``str()``.

        Raises:
            FileNotFoundError: If ``jsonl_path`` is not an existing file.
        """
        if not jsonl_path.is_file():
                raise FileNotFoundError(f"JSONL file not found: {jsonl_path}")
        sequences = []
        # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading BOM,
        # which would otherwise make the first line fail to parse.
        with jsonl_path.open("r", encoding="utf-8-sig") as f:
                for i, line in enumerate(f, start=1):
                        line = line.strip()
                        if not line:
                                continue
                        try:
                                obj = json.loads(line)
                        except json.JSONDecodeError:
                                logging.warning("Skipping malformed JSON on line %d", i)
                                continue
                        # Valid JSON is not necessarily an object: a list, number or
                        # string has no .get() and must not abort the whole run.
                        if not isinstance(obj, dict):
                                logging.warning("Line %d is not a JSON object; skipping", i)
                                continue
                        seq = obj.get("sequence")
                        if seq is None:
                                logging.warning("No 'sequence' field on line %d; skipping", i)
                                continue
                        sequences.append(str(seq))
        return sequences
def remove_files(sequences: list[str], data_dir: Path, ext: str = ".h5", dry_run: bool = True) -> tuple[int, int, int]:
        """Delete ``<sequence><ext>`` from ``data_dir`` for each given sequence.

        Each distinct sequence is handled once, in first-seen order, so a
        repeated entry is neither double-counted in a dry run nor reported as
        missing right after its file was deleted.

        Args:
            sequences: Sequence identifiers; each is combined with ``ext`` to
                form a file name relative to ``data_dir``.
            data_dir: Directory the files are expected to live in.
            ext: File extension. A leading dot is added when a non-empty value
                lacks one; an empty string means "no extension".
            dry_run: When true, only log what would be removed.

        Returns:
            A ``(removed, missing, errors)`` tuple of counts. In a dry run,
            ``removed`` counts the files that would be removed. ``errors``
            counts failed deletions and names refused for pointing outside
            ``data_dir``.
        """
        removed = 0
        missing = 0
        errors = 0
        # Only a non-empty extension gets a dot prepended; "" must not turn into ".".
        if ext and not ext.startswith("."):
                ext = f".{ext}"
        # dict.fromkeys drops duplicates while keeping the original order.
        for seq in dict.fromkeys(sequences):
                filename = f"{seq}{ext}"
                relative = Path(filename)
                # An anchored (absolute) name would replace data_dir when joined and
                # ".." would climb out of it; never delete outside the data directory.
                if relative.anchor or ".." in relative.parts:
                        logging.error("Refusing to remove path outside data directory: %s", filename)
                        errors += 1
                        continue
                target = data_dir / filename
                if not target.exists():
                        logging.warning("Missing file: %s", target)
                        missing += 1
                        continue
                if dry_run:
                        logging.info("[dry-run] would remove: %s", target)
                        removed += 1
                        continue
                try:
                        target.unlink()
                except FileNotFoundError:
                        # The file vanished between the existence check and the unlink.
                        logging.warning("Missing file: %s", target)
                        missing += 1
                except OSError as e:
                        logging.error("Failed to remove %s: %s", target, e)
                        errors += 1
                else:
                        logging.info("Removed: %s", target)
                        removed += 1
        return removed, missing, errors
def main(jsonl: Optional[str], data_dir: Optional[str], ext: str, dry_run: bool):
        """Load the sequence list and remove (or dry-run) the matching files.

        A falsy ``jsonl`` or ``data_dir`` falls back to the built-in default
        path. Progress and a final summary are written to the log; nothing is
        returned.
        """
        source = Path(jsonl or "/workspaces/Jeff/Isaac-GR00T/sequence_numbers.jsonl")
        folder = Path(data_dir or "/workspaces/Jeff/Isaac-GR00T/data/water")

        wanted = load_sequences(source)
        if wanted:
                logging.info("Loaded %d sequences; data directory: %s", len(wanted), folder)
                counts = remove_files(wanted, folder, ext=ext, dry_run=dry_run)
                n_removed, n_missing, n_errors = counts
                logging.info(
                        "Done. removed=%d (dry-run=%s), missing=%d, errors=%d",
                        n_removed,
                        dry_run,
                        n_missing,
                        n_errors,
                )
        else:
                # Nothing usable in the JSONL file, so there is nothing to remove.
                logging.info("No sequences found in %s", source)
if __name__ == "__main__":
        # Command-line entry point: each option maps onto one parameter of main().
        cli = argparse.ArgumentParser(description="Remove .h5 files listed by sequence_numbers.jsonl")
        cli.add_argument(
                "--jsonl",
                default="/workspaces/Jeff/Isaac-GR00T/sequence_numbers.jsonl",
                help="Path to JSONL file with sequence entries",
        )
        cli.add_argument(
                "--data-dir",
                default="/workspaces/Jeff/Isaac-GR00T/data/water",
                help="Directory containing .h5 files",
        )
        cli.add_argument("--ext", default=".h5", help="File extension to remove (default: .h5)")
        # store_false makes dry_run default to True; passing the flag sets it to False.
        cli.add_argument(
                "--no-dry-run",
                action="store_false",
                dest="dry_run",
                help="Actually delete files instead of doing a dry-run",
        )
        options = cli.parse_args()
        main(
                jsonl=options.jsonl,
                data_dir=options.data_dir,
                ext=options.ext,
                dry_run=options.dry_run,
        )
