import pandas as pd
import json

# === Paths ===
# Inputs are absolute paths; the output is written relative to the current
# working directory.
csv_path = "/fs/scratch/PAS2099/Lemeng/Spatial_combined/fourth_master_label_mapping_table_corrected.csv"
json_path = "/fs/scratch/PAS2099/Lemeng/Spatial_combined/human_variant_check.json"
output_path = "fifth_master_label_mapping_table_corrected.csv"

# === Load files ===
df = pd.read_csv(csv_path)
with open(json_path, "r", encoding="utf-8") as f:
    # Looked up below with .get(), so this is expected to be a JSON object
    # keyed by label variant.
    variant_map = json.load(f)

# === Step 1: Normalize unified_label using mapping
# Strip surrounding whitespace, then replace the label with its mapped form
# when the mapping has an entry for it. Values go through str() so that a
# non-string cell does not raise AttributeError on .strip(), and
# na_action="ignore" leaves missing labels as NaN instead of calling the
# function on them.
df["unified_label"] = df["unified_label"].map(
    lambda label: variant_map.get(str(label).strip(), str(label).strip()),
    na_action="ignore",
)

# === Step 2: Group by unified_label and merge unique values into comma-separated strings
def merge_group(group):
    """Collapse all rows of one unified-label group into a single row.

    Written to be passed to ``DataFrame.groupby(...).apply``.

    Args:
        group: Sub-frame holding the rows of one group. It must expose
            ``group.name`` (used as the merged row's ``unified_label``) and
            the columns ``object365_label``, ``nyu_label`` and
            ``lvis_label``.

    Returns:
        A ``pd.Series`` with, in order, ``unified_label``, the three source
        label columns and ``match_type``. Each source column is the sorted,
        ``", "``-joined set of its distinct non-missing, non-blank values
        (an empty string when there are none). ``match_type`` is ``"exact"``
        when at least two source columns are non-empty, else ``"unique"``.
    """
    source_columns = ("object365_label", "nyu_label", "lvis_label")

    def merge_column(col):
        # Go through str() before stripping: the original filtered on
        # str(val) but then called val.strip(), which raised AttributeError
        # for any non-string, non-missing cell (e.g. an integer id).
        distinct = {str(val).strip() for val in group[col] if pd.notna(val)}
        distinct.discard("")  # blank / whitespace-only cells carry no label
        return ", ".join(sorted(distinct))

    merged = {"unified_label": group.name}
    for col in source_columns:
        merged[col] = merge_column(col)

    # Count how many source columns still hold a label after merging.
    # The joined strings are built from stripped, non-empty pieces, so plain
    # truthiness is enough here.
    non_empty_count = sum(1 for col in source_columns if merged[col])
    merged["match_type"] = "exact" if non_empty_count >= 2 else "unique"

    return pd.Series(merged)

# Collapse each unified_label group into one row via merge_group, then
# discard the index produced by the groupby so rows are numbered 0..n-1.
df_merged = df.groupby("unified_label").apply(merge_group)
df_merged = df_merged.reset_index(drop=True)

# === Write the merged table to disk (without the index column)
df_merged.to_csv(output_path, index=False)
print(f"✅ Saved corrected and deduplicated file to: {output_path}")
