import os
import subprocess
import gzip
import pandas as pd

# ─── USER PARAMETERS ───────────────────────────────────────────────────────────
# URL of the git repository that gets cloned.
repo_url = "https://github.com/icsdataset/hai.git"

# Sub-folder of the clone that the input files are read from,
# e.g. "hai-20.07", "hai-21.03", etc.
version = "hai-21.03"

# Local directory checked for an existing clone and used as the data root.
clone_dir = "hai"

# Folder that receives the generated CSV -- change to your target folder.
output_dir = "/Users/home/Documents/naz/research_codes/uncert_prop/realworld_exp/hai_down1"

# 1. Clone the repo (if needed)
# The destination is passed explicitly: without it git names the folder after
# the URL ("hai"), so any other clone_dir value would never exist after the
# clone, the reads under clone_dir would fail, and every rerun would attempt
# to clone again.
if not os.path.isdir(clone_dir):
    # check=True raises CalledProcessError if git exits with a non-zero status.
    subprocess.run(["git", "clone", repo_url, clone_dir], check=True)

# 2. Make sure the destination folder exists (no error if it already does)
os.makedirs(output_dir, exist_ok=True)


# 3. Read every nominal (train) archive and stack them into one frame
def _read_gz_csv(path):
    """Open a gzip-compressed CSV in text mode and parse it into a DataFrame."""
    with gzip.open(path, "rt") as handle:
        return pd.read_csv(handle)


train_files = ["train1.csv.gz", "train2.csv.gz", "train3.csv.gz"]
dfs = [
    _read_gz_csv(os.path.join(clone_dir, version, archive_name))
    for archive_name in train_files
]
# ignore_index=True renumbers rows 0..N-1 across the combined frame.
data = pd.concat(dfs, ignore_index=True)

# Write the combined frame to a single CSV, omitting the row index column
output_path = os.path.join(output_dir, "concatenated_train.csv")
data.to_csv(output_path, index=False)
print(f"Saved concatenated data to {output_path}")