import argparse
import os

from pathlib import Path

import cellxgene_census
import pandas as pd

from tqdm import tqdm


def to_str_list(values):
    """Render values as a bracketed list of double-quoted string literals.

    Args:
        values: A single string (treated as a one-element list) or an
            iterable of values; each element is converted with ``str``.

    Returns:
        A string such as ``["a", "b"]`` in which every embedded double
        quote is escaped as ``\\"``.
    """
    if isinstance(values, str):
        values = [values]
    quoted = []
    for value in values:
        # Escape outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        escaped = str(value).replace('"', '\\"')
        quoted.append(f'"{escaped}"')
    return "[" + ", ".join(quoted) + "]"


def to_num_list(xs):
    """Render an iterable of values as a bracketed list of integers.

    Each element is passed through ``int`` and then ``str``, and the
    results are joined with ``", "`` inside square brackets.
    """
    as_text = map(str, map(int, xs))
    body = ", ".join(as_text)
    return f"[{body}]"


def main():
    """Write one AnnData slice per tissue from the CELLxGENE Census.

    Command-line arguments:
        --in_dir: directory containing ``hvg_<tissue>.csv`` files; the
            ``gene`` column of each file is used as the list of
            ``soma_joinid`` values for the var filter.
        --out_dir: directory that receives ``slice_<tissue>.h5ad`` files
            (created if missing).

    If the ``SLURM_ARRAY_TASK_ID`` environment variable is set, it is used
    as an index into the tissue list and only that tissue is processed;
    otherwise every tissue is processed. A failure on one tissue is
    printed and the loop moves on to the next tissue.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--in_dir", required=True)
    p.add_argument("--out_dir", required=True)
    args = p.parse_args()

    TISSUES_ALL = ["lung", "blood", "eye", "breast", "heart", "brain"]
    # The array task id indexes TISSUES_ALL directly, so ids are zero-based.
    task_id = os.environ.get("SLURM_ARRAY_TASK_ID")
    tissues = [TISSUES_ALL[int(task_id)]] if task_id else TISSUES_ALL

    in_dir = Path(args.in_dir)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # The context manager closes the census handle on every exit path.
    with cellxgene_census.open_soma(census_version="2025-01-30") as census:
        for tissue in tqdm(tissues):
            out_path = out_dir / f"slice_{tissue}.h5ad"
            # Written first and renamed on success, so an interrupted write
            # never leaves a truncated file that later runs would skip.
            tmp_path = out_dir / f"slice_{tissue}.tmp.h5ad"
            try:
                if out_path.exists():
                    print(f"Skipping tissue={tissue}, slice already exists.")
                    continue

                print(f"Slicing SOMA for tissue={tissue}...")
                hvg_path = in_dir / f"hvg_{tissue}.csv"
                # Cast to int64 after dropna(): a column with missing values
                # is read as float, and stringifying it would give "123.0",
                # which int() rejects.
                hvg_idxs = (
                    pd.read_csv(hvg_path)["gene"].dropna().astype("int64").tolist()
                )
                obs_val_filt = (
                    "is_primary_data == True and "
                    'assay == "10x 3\' v3" and '
                    f"tissue_general in {to_str_list(tissue)} and "
                    "nnz >= 300 and "
                    "disease == 'normal'"
                )
                var_val_filt = f"soma_joinid in {to_num_list(hvg_idxs)}"
                adata = cellxgene_census.get_anndata(
                    census,
                    organism="Homo sapiens",
                    obs_value_filter=obs_val_filt,
                    var_value_filter=var_val_filt,
                )

                print("Writing anndata to disk...")
                adata.write_h5ad(str(tmp_path))
                tmp_path.replace(out_path)

                print(f"Saved {out_path}")
            except Exception as e:
                # Best-effort batch: report the failure and keep going.
                print(f"Failed {tissue}: {e}")
                # Drop any partial temp file; it may not exist at all.
                try:
                    tmp_path.unlink()
                except OSError:
                    pass


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
