import os
import subprocess
import gzip
import pandas as pd

# ─── USER PARAMETERS ───────────────────────────────────────────────────────────
repo_url    = "https://github.com/icsdataset/hai.git"
version     = "hai-21.03"               # e.g. "hai-20.07", "hai-21.03", etc.
clone_dir   = "hai"                     # where to clone the repo
output_dir  = "/Users/home/Documents/naz/research_codes/uncert_prop/realworld_exp/hai1"  # change to your target folder

# Fill these in with the exact column names from the HAI CSVs.
# The tags below are PLACEHOLDERS: replace/extend them with names that really
# appear in the CSV header. Any tag that is not a column of the loaded data
# makes the script stop with a KeyError listing the available columns.
input_cols  = ["MV101", "MV102", "MV103"]  # your manipulated-variable tags
output_cols = ["PV201", "PV202", "PV203"]  # your process-variable tags

# Nominal (train) files to load from <clone_dir>/<version>/.
# NOTE(review): other dataset versions may ship a different set of train
# files (or a different CSV layout) - confirm before changing `version`.
train_files = ["train1.csv.gz", "train2.csv.gz", "train3.csv.gz"]
# ───────────────────────────────────────────────────────────────────────────────


def clone_repo_if_missing(url, target_dir):
    """Clone *url* into *target_dir* unless that directory already exists.

    The destination is passed to ``git clone`` explicitly so the checkout
    always lands where the rest of the script looks for it, even when
    ``target_dir`` differs from the repository's default folder name.

    Args:
        url: Git URL of the repository to clone.
        target_dir: Directory the repository should be cloned into.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a
            non-zero status.
    """
    if not os.path.isdir(target_dir):
        subprocess.run(["git", "clone", url, target_dir], check=True)


def load_train_data(data_dir, file_names):
    """Read gzip-compressed CSV files and stack them into one DataFrame.

    Args:
        data_dir: Directory that contains the files.
        file_names: File names (relative to *data_dir*), read in order.

    Returns:
        A single DataFrame with the rows of all files concatenated in the
        given order and a fresh 0..N-1 index.

    Raises:
        FileNotFoundError: If one of the files does not exist.
        ValueError: If *file_names* is empty (nothing to concatenate).
    """
    frames = []
    for fname in file_names:
        gz_path = os.path.join(data_dir, fname)
        # Explicit encoding so decoding does not depend on the host locale.
        with gzip.open(gz_path, "rt", encoding="utf-8") as f:
            frames.append(pd.read_csv(f))
    return pd.concat(frames, ignore_index=True)


def select_columns(frame, columns, label):
    """Return ``frame[columns]`` after checking that every column exists.

    Args:
        frame: DataFrame to select from.
        columns: List of column names to keep, in the desired order.
        label: Name of the user parameter being checked; only used to make
            the error message point at the setting that needs fixing.

    Returns:
        A DataFrame holding just the requested columns.

    Raises:
        KeyError: If any requested column is absent from *frame*.
    """
    missing = [col for col in columns if col not in frame.columns]
    if missing:
        raise KeyError(
            f"{label} contains column(s) not present in the data: {missing}. "
            f"Available columns: {list(frame.columns)}"
        )
    return frame[columns]


if __name__ == "__main__":
    # 1. Clone the repo (if needed)
    clone_repo_if_missing(repo_url, clone_dir)

    # 2. Prepare output folder
    os.makedirs(output_dir, exist_ok=True)

    # 3. Load & concatenate all nominal (train) files
    data = load_train_data(os.path.join(clone_dir, version), train_files)

    # 4. Split into U (inputs) and Y (outputs)
    U = select_columns(data, input_cols, "input_cols")
    Y = select_columns(data, output_cols, "output_cols")

    # 5. Save to CSV
    U.to_csv(os.path.join(output_dir, "U.csv"), index=False)
    Y.to_csv(os.path.join(output_dir, "Y.csv"), index=False)

    print(f"Saved U.csv and Y.csv in {output_dir}")
