import pandas as pd
from collections import Counter
import re
import os 
# The data directory must be supplied through the environment; abort early
# when it is missing so no relative path is used by accident.
if "CORELOGIC_DATA_PATH" not in os.environ:
    raise ValueError("CORELOGIC_DATA_PATH is not set")
CORELOGIC_DATA_PATH = os.environ["CORELOGIC_DATA_PATH"]

# Number of most frequent zip codes to keep in the filtered output.
nr_zips = 4

# Input and output locations, both inside the data directory.
input_file = os.path.join(
    CORELOGIC_DATA_PATH,
    "origination_data1.csv",  # origination_data.csv
)
output_file = os.path.join(
    CORELOGIC_DATA_PATH,
    "filtered_origination_data_top_{}_zips.csv".format(nr_zips),
)

# Zip codes that are never selected, regardless of how often they occur.
zip_codes_to_exclude = ["80015", "80013"]

# Step 1: Count occurrences of 'property_zip'
#
# The zip column is read as text in both passes. Letting pandas infer the
# dtype can give ints in one chunk and floats or strings in another, which
# (a) never matches the string entries of `zip_codes_to_exclude`,
# (b) turns values into e.g. "80015.0" when converted to str, and
# (c) drops leading zeros. Reading as str keeps one consistent key type.
zip_dtype = {"property_zip": str}
numeric_zip_pattern = r"\d+"  # only all-digit values count as zip codes

zip_counts = Counter()
chunk_size = 10**6  # Read 1 million rows at a time
total_rows = 0

with pd.read_csv(
    input_file,
    usecols=["property_zip"],
    dtype=zip_dtype,
    chunksize=chunk_size,
    low_memory=False,
) as reader:
    for i, chunk in enumerate(reader):
        print("Chunk: ", i)
        zips = chunk["property_zip"]
        # Count only all-digit values (missing values are treated as
        # non-matching) so a non-numeric entry can never take a top-N slot.
        numeric_zips = zips[zips.str.fullmatch(numeric_zip_pattern, na=False)]
        zip_counts.update(numeric_zips.value_counts().to_dict())
        total_rows += len(chunk)

# Step 2: Get the most common zip codes, excluding specified ones
# Exclusions are compared as strings, the same type as the counted keys.
excluded_zips = {str(zip_code) for zip_code in zip_codes_to_exclude}
most_common_zips_with_counts = [
    (zip_code, count) for zip_code, count in zip_counts.most_common()
    if zip_code not in excluded_zips
][:nr_zips]
most_common_zips = [zip_code for zip_code, _ in most_common_zips_with_counts]

# Print the most common zip codes and their counts
print(f"{nr_zips} Most Common Zip Codes (Excluding Specified Ones) and Their Counts:")
for zip_code, count in most_common_zips_with_counts:
    print(f"Zip Code: {zip_code}, Count: {count}")

# Step 3: Filter data for these zip codes and save
filtered_rows = 0
# The first chunk truncates the output file and writes the header; later
# chunks append data rows only. This avoids a header row per chunk and
# avoids appending to a file left over from a previous run.
first_chunk = True

with pd.read_csv(
    input_file,
    dtype=zip_dtype,
    chunksize=chunk_size,
    low_memory=False,
) as reader:
    for chunk in reader:
        # The selected zips are all-digit strings already, so membership
        # alone also rejects missing and non-numeric values.
        filtered_chunk = chunk[chunk["property_zip"].isin(most_common_zips)]
        filtered_rows += len(filtered_chunk)
        filtered_chunk.to_csv(
            output_file,
            mode="w" if first_chunk else "a",
            index=False,
            header=first_chunk,
        )
        first_chunk = False

# Print the original and filtered row counts
print(f"\nOriginal number of rows: {total_rows}")
print(f"Filtered number of rows: {filtered_rows}")