import random
import re

# Load name lists from files
def load_names(file_path):
    """Read a list of names from a UTF-8 text file, one entry per line.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of the file's lines in file order, each stripped of
        surrounding whitespace; lines that are empty after stripping
        are skipped.
    """
    names = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                names.append(cleaned)
    return names

# Extract names from text_file (three consecutive words found in the first-, middle-, and last-name lists, respectively)
def extract_names_from_text(text_file, first_names, middle_names, last_names):
    """Collect every "first middle last" name that occurs in a text file.

    The file is split into word tokens (a token may carry a single
    apostrophe suffix such as ``'s``). Every window of three consecutive
    tokens is kept when its tokens are found, in order, in ``first_names``,
    ``middle_names`` and ``last_names``.

    Args:
        text_file: Path of the UTF-8 text file to scan.
        first_names: Container of accepted first names.
        middle_names: Container of accepted middle names.
        last_names: Container of accepted last names.

    Returns:
        A set of matched names formatted as "first middle last".
    """
    with open(text_file, 'r', encoding='utf-8') as handle:
        content = handle.read()

    # Tokenize into words; an apostrophe suffix stays attached to its word.
    tokens = re.findall(r"\b\w+(?:'\w+)?\b", content)

    # Slide a three-token window over the text by zipping offset views.
    return {
        f"{given} {mid} {family}"
        for given, mid, family in zip(tokens, tokens[1:], tokens[2:])
        if given in first_names and mid in middle_names and family in last_names
    }

# Generate unique names
def generate_unique_names(first_names, middle_names, last_names, existing_names, count):
    """Randomly build up to ``count`` distinct names absent from ``existing_names``.

    Each candidate is "first middle last", with every part drawn
    independently via ``random.choice``. Sampling stops once ``count`` new
    names have been collected or after ``count * 10`` draws, whichever comes
    first, so fewer than ``count`` names may be returned when the pool of
    possible combinations is too small.

    Args:
        first_names: Sequence of first names to draw from.
        middle_names: Sequence of middle names to draw from.
        last_names: Sequence of last names to draw from.
        existing_names: Iterable of full names that must not be produced
            (any iterable is accepted, not only a set).
        count: Number of new names wanted.

    Returns:
        A list of at most ``count`` unique names, none of which appears in
        ``existing_names``. The list is empty when ``count`` is not
        positive or when any of the name sequences is empty.
    """
    # Nothing can be generated: avoid random.choice raising on an empty sequence.
    if count <= 0 or not first_names or not middle_names or not last_names:
        return []

    # Normalise once so lists/tuples work and membership tests are O(1).
    excluded = set(existing_names)
    new_names = set()
    max_attempts = count * 10  # Prevent an infinite loop on a small name pool
    attempts = 0

    while len(new_names) < count and attempts < max_attempts:
        first = random.choice(first_names)
        middle = random.choice(middle_names)
        last = random.choice(last_names)
        candidate = f"{first} {middle} {last}"
        if candidate not in excluded:
            new_names.add(candidate)
        attempts += 1

    return list(new_names)

# Save names to file
def save_names_to_file(names, output_file):
    """Write each name on its own line to a UTF-8 text file.

    Args:
        names: Iterable of names to write.
        output_file: Destination path; the file is opened in write mode,
            so any existing content is replaced.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(f"{entry}\n" for entry in names)

# Main program
def main():
    """Generate new person names that do not occur in the source text.

    Loads the first/middle/last name lists, extracts the full names already
    present in the text file, generates the target number of names that are
    not among them, reports the outcome on stdout, and writes the generated
    names to the output file.
    """
    # Name lists, in first / middle / last order
    name_list_paths = (
        "data/first_names.txt",
        "data/middle_names.txt",
        "data/last_names.txt",
    )
    text_file = "hallucinate_small/pretrain_perturbed4.txt"
    output_file = "hallucinate_small/new_person4.txt"
    target_count = 100000  # Number of unique new names to generate

    first_names, middle_names, last_names = [
        load_names(path) for path in name_list_paths
    ]

    # Sets are passed for the lookups; the lists are kept for random sampling below
    existing_names = extract_names_from_text(
        text_file,
        set(first_names),
        set(middle_names),
        set(last_names),
    )
    print(f"Extracted {len(existing_names)} names from text_file")

    new_names = generate_unique_names(
        first_names, middle_names, last_names, existing_names, target_count
    )

    # Report whether the full target was reached
    if len(new_names) >= target_count:
        print(f"Successfully generated {len(new_names)} new names")
    else:
        print(f"Warning: Only generated {len(new_names)} names, possibly due to insufficient name combinations")

    save_names_to_file(new_names, output_file)
    print(f"Results saved to {output_file}")

# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()