import pandas as pd
import os 
# Base directory for every input/output file of this script, read from the
# environment so the data location is not hard-coded.
CORELOGIC_DATA_PATH = os.environ.get("CORELOGIC_DATA_PATH")
if CORELOGIC_DATA_PATH is None:
    # Fail fast: the file paths built from this value would be meaningless.
    raise ValueError("CORELOGIC_DATA_PATH is not set")

# File paths (all joined onto CORELOGIC_DATA_PATH)
nr_zips = 4  # number substituted into the "top_{}_zips" file names

# Input: the full performance data set.
performance_file = os.path.join(CORELOGIC_DATA_PATH, "performance_data1.csv")

# Input: CSV whose "loan_id" column lists the loans to keep.
_loan_ids_name = "top_{}_zips_loan_ids.csv".format(nr_zips)
loan_ids_file = os.path.join(CORELOGIC_DATA_PATH, _loan_ids_name)

# Output: the performance rows belonging to those loans.
_filtered_name = "filtered_performance_data_top_{}_zips.csv".format(nr_zips)
filtered_output_file = os.path.join(CORELOGIC_DATA_PATH, _filtered_name)

# Step 1: Load the loan_id values into a set of integers.
#
# The ID file may contain repeated header rows (the literal text "loan_id"
# inside the data), which makes pandas read the column as strings. Such rows
# are removed with an explicit boolean mask on the Series. The previous
# version indexed a NumPy array with `values != 'loan_id'` and then took
# `[0]`, which only returned the full array when NumPy collapsed the
# int-vs-str comparison to a scalar; with an elementwise comparison it kept
# just the first ID (or a bare string) and the set construction failed.
loan_id_column = pd.read_csv(loan_ids_file)["loan_id"]

# Compare as text so the mask is well-defined for both numeric and object
# columns; blank entries are dropped because they cannot be converted to int.
is_header_row = loan_id_column.astype(str) == "loan_id"
loan_id_column = loan_id_column[~is_header_row].dropna()

# int64 explicitly, so large IDs are not truncated on platforms where the
# default integer is 32-bit.
loan_ids = set(loan_id_column.astype("int64"))
print(f"Loaded {len(loan_ids)} unique loan IDs as integers.")

# Step 2: Stream the performance file in chunks and keep only the rows whose
# loan_id is in `loan_ids`, so the full file never has to be held in memory.
chunk_size = 10**7  # rows read per chunk (10 million)
filtered_rows = 0  # running total of rows written to the output file

with pd.read_csv(performance_file, chunksize=chunk_size, low_memory=False) as reader:
    for chunk_no, frame in enumerate(reader, start=1):
        print(f"Processing chunk {chunk_no}...")

        # The first chunk creates/truncates the output and writes the header;
        # every later chunk is appended without a header.
        is_first = chunk_no == 1

        wanted = frame.loc[frame["loan_id"].isin(loan_ids)]
        n_matches = len(wanted)
        filtered_rows += n_matches

        wanted.to_csv(
            filtered_output_file,
            mode='w' if is_first else 'a',
            index=False,
            header=is_first,
        )
        print(f"Chunk {chunk_no}: {n_matches} matches written.")

print(f"Filtered data saved to {filtered_output_file}.")
print(f"Total rows in filtered dataset: {filtered_rows}")
