import numpy as np
import pandas as pd


def normalize_by_view(data, view_splits):
    """Min-max scale every column of ``data``, working through it view by view.

    Args:
        data: 2-D array indexed as ``data[:, a:b]``; each view slice is cast
            to float before scaling.
        view_splits: Iterable of column counts, one per view. Views are
            consecutive column ranges starting at column 0.

    Returns:
        A float array made of the scaled views joined along axis 1. Columns
        beyond ``sum(view_splits)`` are not part of the result.
    """
    scaled_views = []
    offset = 0
    for width in view_splits:
        block = data[:, offset:offset + width].astype(float)
        col_min = block.min(axis=0)
        span = block.max(axis=0) - col_min
        # A constant column has zero span; use a tiny divisor so the division
        # is defined (the column then scales to all zeros).
        span = np.where(span == 0, 1e-6, span)
        scaled_views.append((block - col_min) / span)
        offset += width
    return np.concatenate(scaled_views, axis=1)


def _replace_with_noise(data, rows, views, view_bounds):
    """Overwrite the given views of each row with fresh uniform [0, 1) noise."""
    for v in views:
        s, e = view_bounds[v], view_bounds[v + 1]
        # Rows are the inner loop so the random draws happen view by view.
        for r in rows:
            data[r, s:e] = np.random.uniform(0.0, 1.0, size=e - s)


def _swap_views(data, i, j, views, view_bounds):
    """Exchange the given views between rows ``i`` and ``j`` in place."""
    for v in views:
        s, e = view_bounds[v], view_bounds[v + 1]
        # Copies are required: the slices are views into the same array.
        data[i, s:e], data[j, s:e] = data[j, s:e].copy(), data[i, s:e].copy()


def _draw_pairs(is_anomalous, requested):
    """Shuffle the still-clean row indices and return them as an (n, 2) array of pairs."""
    candidates = np.flatnonzero(~is_anomalous)
    np.random.shuffle(candidates)
    # Use an even number of rows, capped by how many clean rows remain.
    take = min(requested, len(candidates)) // 2 * 2
    return candidates[:take].reshape(-1, 2)


def _pick_mix_views(num_views):
    """Choose disjoint (noise_views, swap_views) for one mixed-anomaly pair."""
    if num_views in (2, 3):
        # Fixed assignment for small view counts: noise on view 0, swap view 1.
        return [0], [1]
    all_views = np.arange(num_views)
    noise_count = max(1, num_views // 2)
    noise_views = np.random.choice(all_views, noise_count, replace=False)
    remaining = np.setdiff1d(all_views, noise_views)
    # With a single view nothing is left to swap; clamp instead of asking
    # np.random.choice for a sample from an empty population.
    swap_count = min(max(1, num_views // 4), len(remaining))
    if swap_count == 0:
        return noise_views, []
    swap_views = np.random.choice(remaining, swap_count, replace=False)
    return noise_views, swap_views


def inject_anomalies_with_flags(
    data,
    replace_percentage,
    swap_percentage,
    swap_replace_percentage,
    drift_scale,
    view_splits,
    random_seed=42
):
    """Inject three kinds of synthetic anomalies into ``data`` and label the rows.

    The stages run in order and each one only touches rows that no earlier
    stage has modified:

    1. Replace: every view of the chosen rows is overwritten with uniform
       [0, 1) noise (flag 1).
    2. Swap: rows are paired and ``max(1, num_views // 2)`` randomly chosen
       views are exchanged within each pair (flag 2).
    3. Mix: rows are paired; some views of both rows are overwritten with
       noise and other views are exchanged within the pair (flag 3). With a
       single view only the noise part is applied.

    ``data`` is modified in place and the global NumPy random state is
    reseeded with ``random_seed``.

    Args:
        data: 2-D array whose columns are the concatenated views.
        replace_percentage: Fraction of rows for the replace stage.
        swap_percentage: Fraction of rows for the swap stage (used in pairs).
        swap_replace_percentage: Fraction of rows for the mix stage (used in
            pairs).
        drift_scale: Not used by this function; kept so the signature stays
            compatible with existing callers.
        view_splits: Sequence (list, tuple or array) of column counts, one
            per view.
        random_seed: Seed passed to ``np.random.seed``.

    Returns:
        Tuple ``(data, flags)``: the same array object that was passed in and
        an int array of per-row flags (0 = untouched, 1/2/3 as above).
    """
    np.random.seed(random_seed)
    num_rows, _ = data.shape
    num_views = len(view_splits)
    # Concatenate instead of ``[0] + view_splits`` so tuples and ndarrays work:
    # list + ndarray is an element-wise add and would produce wrong bounds.
    view_bounds = np.cumsum(
        np.concatenate(([0], np.asarray(view_splits, dtype=int)))
    )
    flags = np.zeros(num_rows, dtype=int)
    is_anomalous = np.zeros(num_rows, dtype=bool)
    every_view = range(num_views)

    # Stage 1: full-row noise replacement.
    num_attr = int(num_rows * replace_percentage)
    if num_attr > 0:
        attr_candidates = np.flatnonzero(~is_anomalous)
        attr_indices = np.random.choice(attr_candidates, num_attr, replace=False)
        for idx in attr_indices:
            _replace_with_noise(data, (idx,), every_view, view_bounds)
        flags[attr_indices] = 1
        is_anomalous[attr_indices] = True

    # Stage 2: exchange a subset of views between paired rows.
    num_class = int(num_rows * swap_percentage)
    if num_class >= 2:
        swap_count = max(1, num_views // 2)
        for i, j in _draw_pairs(is_anomalous, num_class):
            is_anomalous[i] = is_anomalous[j] = True
            chosen_views = np.random.choice(num_views, swap_count, replace=False)
            _swap_views(data, i, j, chosen_views, view_bounds)
            flags[i] = flags[j] = 2

    # Stage 3: noise on some views plus an exchange of other views.
    num_mix = int(num_rows * swap_replace_percentage)
    if num_mix >= 2:
        for i, j in _draw_pairs(is_anomalous, num_mix):
            is_anomalous[i] = is_anomalous[j] = True
            noise_views, swap_views = _pick_mix_views(num_views)
            _replace_with_noise(data, (i, j), noise_views, view_bounds)
            _swap_views(data, i, j, swap_views, view_bounds)
            flags[i] = flags[j] = 3

    return data, flags


if __name__ == "__main__":
    # Path of the headerless input CSV. It is empty here and must be filled in
    # before the script can run.
    input_csv = r''
    drift_scale = 2.5
    random_seed = 42
    # Number of columns in each of the five views, in column order.
    view_splits = [21531, 24893, 34279, 15506, 11519]

    # Each entry is (replace, swap, mix) as fractions of the row count.
    ratios = [
        [0.02, 0.05, 0.08],
        [0.02, 0.08, 0.05],
        [0.05, 0.02, 0.08],
        [0.05, 0.08, 0.02],
        [0.08, 0.02, 0.05],
        [0.08, 0.05, 0.02]
    ]

    print("standing by...")
    df = pd.read_csv(input_csv, header=None)
    raw = df.values.astype(float)
    print("data shape:", raw.shape)
    # Normalize once; every ratio setting starts from a copy of this array.
    normalized_original = normalize_by_view(raw, view_splits)

    for idx, (replace_percentage, swap_percentage, swap_replace_percentage) in enumerate(ratios, start=1):
        # round() rather than int(): a float product such as 0.29 * 100 is
        # 28.999999999999996, which int() would truncate to 28 in the name.
        replace_pct = round(replace_percentage * 100)
        swap_pct = round(swap_percentage * 100)
        mix_pct = round(swap_replace_percentage * 100)
        output_stem = f'Reuters_{replace_pct}_{swap_pct}_{mix_pct}'
        output_csv = f'{output_stem}.csv'
        output_npz = f'{output_stem}.npz'

        print(f"\n=== on {idx} ratio : (replace={replace_percentage}, swap={swap_percentage}, mix={swap_replace_percentage}) ===")
        print(f"injection（seed={random_seed}）...")

        # Pass a copy because the injection modifies its input array in place.
        anomalous_normalized, flags = inject_anomalies_with_flags(
            normalized_original.copy(),
            replace_percentage,
            swap_percentage,
            swap_replace_percentage,
            drift_scale,
            view_splits,
            random_seed=random_seed
        )

        # The last column of the saved data is a 0/1 anomaly label; the
        # per-type flags are stored separately in the .npz file.
        binary_flags = (flags > 0).astype(int)
        final_data = np.column_stack((anomalous_normalized, binary_flags))

        pd.DataFrame(final_data).to_csv(output_csv, header=False, index=False)
        np.savez(output_npz, data=final_data, anomaly_type=flags)

        print("save complete：", output_csv, output_npz)
        print("view split:", view_splits)
        print("num:", len(raw))
        print("attribute outlier num:", np.sum(flags == 1))
        print("class outlier num:", np.sum(flags == 2))
        print("attribute-class outlier num:", np.sum(flags == 3))
        print("total outlier num:", np.sum(binary_flags), f"({100*np.mean(binary_flags):.2f}%)")

