import pandas as pd
import random
import json
from datetime import datetime
import argparse
from pathlib import Path


def _json_default(obj):
    """Convert a value the ``json`` module cannot encode natively.

    NumPy scalars (for example the ``int64`` pandas hands back for an
    integer ``id`` cell) are turned into the equivalent built-in Python
    value so that numbers stay numbers in the log instead of being written
    as strings.  Any other unsupported object falls back to ``str(obj)``.
    """
    to_native = getattr(obj, 'tolist', None)
    if callable(to_native):
        return to_native()
    return str(obj)


def _deletion_record(data, idx, reason):
    """Build one log entry describing the removal of row ``idx`` of ``data``."""
    return {
        'index': idx,
        'id': data.at[idx, 'id'],
        'filename': data.at[idx, 'filename'],
        'action': 'delete',
        'reason': reason,
        'target': data.at[idx, 'target'],
        'timestamp': datetime.now().isoformat(),
    }


def _collect_deletions(data, rng):
    """Scan ``data`` for adjacent rows with equal ``nodes_codes``.

    For each adjacent equal pair exactly one row is marked for deletion:
    if the first row of the pair was already marked by the previous step,
    the second row is marked as well; otherwise one of the two is picked
    with ``rng.randint(0, 1)``.  The net effect is that a run of equal
    rows is reduced to a single survivor.

    Args:
        data: DataFrame with a default 0..n-1 index and the columns
            ``nodes_codes``, ``id``, ``filename`` and ``target``.
        rng: Object exposing ``randint`` (the ``random`` module or a
            ``random.Random`` instance).

    Returns:
        A list of deletion-log dictionaries, one per removed row, in the
        order the rows were marked.
    """
    records = []
    if len(data) < 2:
        # Nothing to compare; also avoids touching columns of an empty frame.
        return records

    # Pull the compared column out once instead of calling .loc per row.
    codes = data['nodes_codes'].tolist()
    marked = set()

    for i, (current, following) in enumerate(zip(codes, codes[1:])):
        if not current == following:
            continue
        if i in marked:
            # Row i is already gone, so its twin i + 1 must go too;
            # otherwise the run would keep more than one survivor.
            victim = i + 1
            reason = f'Adjacent repetition (line {i} already deleted)'
        else:
            victim = i if rng.randint(0, 1) == 0 else i + 1
            reason = 'random delete repeat one'
        marked.add(victim)
        records.append(_deletion_record(data, victim, reason))

    return records


def main(argv=None):
    """Remove adjacent duplicate rows from a JSONL file and log the removals.

    Reads the input JSONL file, drops rows so that no two neighbouring rows
    share the same ``nodes_codes`` value, writes the remaining rows to the
    output JSONL file and writes one JSON object per removed row to the log
    file.  Summary statistics are printed to stdout.

    Args:
        argv: Optional list of command-line arguments.  When ``None``
            (the default) ``argparse`` reads them from ``sys.argv``.

    Command-line options:
        --input   Path to the input JSONL file (required).
        --output  Path to the cleaned output JSONL file (required).
        --log     Path to the deletion-record JSONL file (required).
        --seed    Optional integer seed making the random choice between
                  two duplicate rows reproducible.
    """
    parser = argparse.ArgumentParser(description="Remove adjacent duplicate node_code pairs from a JSONL file.")

    parser.add_argument('--input', type=str, required=True, help='Path to input JSONL file.')
    parser.add_argument('--output', type=str, required=True, help='Path to cleaned output JSONL file.')
    parser.add_argument('--log', type=str, required=True, help='Path to deletion record log JSONL file.')
    parser.add_argument('--seed', type=int, default=None,
                        help='Optional random seed for reproducible deletions.')

    args = parser.parse_args(argv)

    input_path = Path(args.input)
    output_path = Path(args.output)
    log_path = Path(args.log)

    # Read the data without dtype inference or date parsing: this tool only
    # filters rows, so values must round-trip unchanged (e.g. an id such as
    # "007" must not be turned into the number 7).
    data = pd.read_json(input_path, lines=True, dtype=False, convert_dates=False)

    # Without --seed keep using the module-level generator, as before.
    rng = random if args.seed is None else random.Random(args.seed)
    deleted_records = _collect_deletions(data, rng)

    doomed = [record['index'] for record in deleted_records]
    data_cleaned = data.drop(index=doomed).reset_index(drop=True)

    # Save the cleaned data
    data_cleaned.to_json(output_path, orient='records', lines=True, force_ascii=False)

    # Save the deletion log (one JSON object per line)
    with open(log_path, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(item, default=_json_default) + '\n'
            for item in deleted_records
        )

    # Output statistics
    print(f"✅ Original data rows: {len(data)}")
    print(f"✅ Cleaned data rows: {len(data_cleaned)}")
    print(f"✅ Deleted rows: {len(deleted_records)}")
    print(f"✅ Cleaned data saved to: {output_path}")
    print(f"✅ Deletion log saved to: {log_path}")


# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
