import os
import csv
import re

def deduplicate_records(records):
    """
    Drop records that repeat an earlier record in every field.

    Two records count as duplicates when all seven fields (instancename,
    pruned, k_exch, k_adj, k_rais, k_repl, min_errors) compare equal. The
    first occurrence is kept and the relative order of the survivors is
    preserved. A short summary is printed to stdout.

    Args:
        records: List of dictionaries containing the parsed records

    Returns:
        tuple: (deduplicated_records, num_duplicates_removed)
    """
    identity_fields = (
        "instancename",
        "pruned",
        "k_exch",
        "k_adj",
        "k_rais",
        "k_repl",
        "min_errors",
    )

    # Maps the tuple of field values to the first record that produced it;
    # dict insertion order keeps the survivors in their original order.
    first_seen = {}
    removed = 0

    for entry in records:
        signature = tuple(entry[field] for field in identity_fields)
        if signature in first_seen:
            removed += 1
            continue
        first_seen[signature] = entry

    survivors = list(first_seen.values())

    print(f"Deduplication complete: {removed} duplicate records removed")
    print(f"Original record count: {len(records)}")
    print(f"Deduplicated record count: {len(survivors)}")

    return survivors, removed

# Regular expression to parse result file names.
# Expected format: dataset_data-[unpru-]kadj#-kexch#-kreplinf[.timeout]
# Named groups:
#   instancename - everything before the first underscore
#   unpru        - the optional "unpru-" marker (None when absent)
#   params       - the rest of the name, without a trailing ".timeout" suffix
file_pattern = re.compile(r"^(?P<instancename>[^_]+)_data-(?P<unpru>unpru-)?(?P<params>.+?)(?:\.timeout)?$")

def process_files(directory="results/local-searcher",
                  output_file="results/consolidated_results.csv"):
    """Process all data files in the directory and consolidate results.

    Every entry of ``directory`` whose name does not end in ".timeout" and
    matches ``file_pattern`` is read as a comma-separated file with one
    header line followed by rows of
    ``k_exch,k_adj,k_rais,k_repl,min_errors``. Rows that do not have exactly
    five fields are skipped. The collected records are passed to
    ``check_consistency``, deduplicated with ``deduplicate_records`` and
    written to ``output_file`` as CSV.

    A file that cannot be opened, decoded or parsed is reported on stdout
    and the rest of that file is skipped; records read from it before the
    failure are kept.

    Args:
        directory: Directory containing the result files.
        output_file: Path of the consolidated CSV to write. Its parent
            directory is created if it does not exist.
    """
    # Collect all records
    records = []

    print("\nReading files...")

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the consolidated CSV reproducible between runs.
    files = sorted(f for f in os.listdir(directory) if not f.endswith(".timeout"))

    for filename in files:
        # Parse instancename and pruning status from filename
        match = file_pattern.match(filename)
        if not match:
            continue

        instancename = match.group("instancename")
        pruning_status = "unpruned" if match.group("unpru") else "pruned"

        filepath = os.path.join(directory, filename)

        try:
            with open(filepath, 'r') as f:
                # Skip header line; the default keeps an empty file from
                # raising StopIteration and being reported as an error.
                next(f, None)

                for line in f:
                    parts = line.strip().split(',')
                    if len(parts) != 5:
                        continue

                    # Extract parameter values
                    k_exch, k_adj, k_rais, k_repl, min_errors = parts

                    # Create record with appropriate type conversions;
                    # k_repl may be the literal string "inf".
                    records.append({
                        "instancename": instancename,
                        "pruned": pruning_status,
                        "k_exch": int(k_exch),
                        "k_adj": int(k_adj),
                        "k_rais": int(k_rais),
                        "k_repl": k_repl if k_repl == "inf" else int(k_repl),
                        "min_errors": int(min_errors),
                    })
        except (OSError, ValueError) as e:
            # OSError: unreadable entry (e.g. a directory or missing
            # permissions). ValueError: non-integer field or undecodable
            # bytes (UnicodeDecodeError is a ValueError subclass).
            print(f"Error processing file {filename}: {e}")

    print("\nPerforming consistency check...")
    check_consistency(records)

    # Before writing to CSV, deduplicate the records
    print("\nPerforming deduplication...")
    deduplicated_records, _ = deduplicate_records(records)

    # Make sure the destination directory exists so the run does not fail
    # only at the very end, after all files have been processed.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write consolidated records to CSV
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f)

        # Write header
        writer.writerow(["instancename", "pruned/unpruned tree", "k_exch", "k_adj", "k_rais", "k_repl", "min_errors"])

        # Write records
        for record in deduplicated_records:
            writer.writerow([
                record["instancename"],
                record["pruned"],
                record["k_exch"],
                record["k_adj"],
                record["k_rais"],
                record["k_repl"],
                record["min_errors"],
            ])

    print(f"Consolidated data written to {output_file}")
    

def check_consistency(records):
    """Check for records that are identical except for min_errors.

    Two records are reported as an inconsistent pair when instancename,
    pruned, k_exch, k_adj, k_rais and str(k_repl) are all equal but their
    min_errors values differ. The pairs are printed to stdout, ordered by
    the positions of the two records in ``records``; if there are none, a
    confirmation message is printed instead.

    Records are bucketed by the compared fields first, so only records that
    share a bucket are compared with each other rather than every possible
    pair in the dataset.

    Args:
        records: List of record dictionaries. The compared field values are
            used as dictionary keys and must therefore be hashable.
    """
    # Bucket record positions by every field except min_errors. k_repl goes
    # through str() so that e.g. 5 and "5" are treated as the same value.
    positions_by_key = {}
    for index, record in enumerate(records):
        key = (
            record["instancename"],
            record["pruned"],
            record["k_exch"],
            record["k_adj"],
            record["k_rais"],
            str(record["k_repl"]),
        )
        positions_by_key.setdefault(key, []).append(index)

    # Within each bucket, collect the position pairs whose min_errors differ.
    pair_positions = []
    for positions in positions_by_key.values():
        if len(positions) < 2:
            continue
        for offset, first in enumerate(positions):
            first_errors = records[first]["min_errors"]
            for second in positions[offset + 1:]:
                if first_errors != records[second]["min_errors"]:
                    pair_positions.append((first, second))

    # Report in (first position, second position) order, independent of the
    # order in which the buckets were visited.
    pair_positions.sort()

    if pair_positions:
        print("\nFound inconsistencies (records identical except for min_errors):")
        for first, second in pair_positions:
            print(f"\nRecord 1: {records[first]}")
            print(f"Record 2: {records[second]}")
    else:
        print("\nNo inconsistencies found - all records are consistent.")

# Script entry point: consolidate the result files using the default
# arguments of process_files.
if __name__ == "__main__":
    process_files()
