Question No: 7
Context Size: 19035

Codebase:

File: sales_analysis.py:

```Python
import pandas as pd
from scipy.stats import kurtosis, skew
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
import numpy as np
from scipy.stats import zscore

def aggregate_features(df, id_col, feature_cols, agg_funcs):
    """Aggregate feature columns per identifier and flatten the column labels.

    Args:
        df: Input DataFrame.
        id_col: Name of the column to group by.
        feature_cols: List of column names to aggregate.
        agg_funcs: Aggregation spec forwarded to ``agg`` (for example a list
            of function names, or a single function name).

    Returns:
        DataFrame with ``id_col`` restored as a regular column. Two-level
        result columns are joined as ``"<feature>_<func>"``; single-level
        result columns are left untouched.

    Raises:
        ValueError: If ``id_col`` or any entry of ``feature_cols`` is missing.
    """
    if id_col not in df.columns:
        raise ValueError(f"The identifier column '{id_col}' is not in the dataframe.")

    missing_columns = [col for col in feature_cols if col not in df.columns]
    if missing_columns:
        raise ValueError(f"The following feature columns are not in the dataframe: {missing_columns}")

    aggregated_df = df.groupby(id_col)[feature_cols].agg(agg_funcs)

    # Only a MultiIndex yields (feature, func) tuples. Joining a flat string
    # label would insert an underscore between each of its characters.
    if isinstance(aggregated_df.columns, pd.MultiIndex):
        aggregated_df.columns = ['_'.join(map(str, col)) for col in aggregated_df.columns]

    return aggregated_df.reset_index()

def compute_difference_or_ratio(df, col1, col2, new_col_name):
    """Add a column holding ``col1 / col2`` (numeric) or ``col1 - col2`` in days (datetime).

    The new column is written onto ``df`` itself, which is also returned.

    Args:
        df: Input DataFrame.
        col1: First operand column.
        col2: Second operand column.
        new_col_name: Name of the column to create.

    Returns:
        The same DataFrame with ``new_col_name`` added; NaN results are
        replaced by 0.

    Raises:
        ValueError: If a column is missing, or the two columns are not both
            numeric or both datetime.
    """
    # Step 1: Validate columns
    if col1 not in df.columns or col2 not in df.columns:
        raise ValueError(f"Columns '{col1}' and/or '{col2}' are not present in the dataframe")

    left, right = df[col1], df[col2]

    # Step 2: Perform the operation that matches the column types
    if pd.api.types.is_numeric_dtype(left) and pd.api.types.is_numeric_dtype(right):
        result = left / right
    elif pd.api.types.is_datetime64_any_dtype(left) and pd.api.types.is_datetime64_any_dtype(right):
        result = (left - right).dt.days
    else:
        raise ValueError(f"Columns '{col1}' and '{col2}' must be both numeric or both datetime")

    # Step 3: Handle NaN values. Assign the filled Series back instead of
    # calling fillna(inplace=True) on a column selection, which does not
    # update the frame under Copy-on-Write.
    df[new_col_name] = result.fillna(0)

    return df

def extract_and_aggregate_date(df, date_column, new_column_prefix):
    """Derive year/month/day columns from a datetime column and average per year-month.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        date_column: Name of a datetime-typed column.
        new_column_prefix: Prefix for the derived ``_year``, ``_month`` and
            ``_day`` columns.

    Returns:
        DataFrame with one row per (year, month) pair holding the mean of the
        remaining columns.

    Raises:
        ValueError: If ``date_column`` is missing or is not datetime-typed.
    """
    if date_column not in df.columns:
        raise ValueError(f"Column '{date_column}' not found in DataFrame")
    if not pd.api.types.is_datetime64_any_dtype(df[date_column]):
        raise ValueError(f"Column '{date_column}' must be of datetime type")

    frame = df.copy()

    # Add the calendar parts in a fixed order: year, month, day.
    for part in ('year', 'month', 'day'):
        frame[f'{new_column_prefix}_{part}'] = getattr(frame[date_column].dt, part)

    group_keys = [f'{new_column_prefix}_year', f'{new_column_prefix}_month']
    monthly_means = frame.groupby(group_keys).mean()

    return monthly_means.reset_index()

def calculate_normalized_kurtosis_skewness(df, col, method='kurtosis'):
    """Return the kurtosis or skewness of a standardised numeric column.

    Args:
        df: Input DataFrame.
        col: Name of a numeric column.
        method: ``'kurtosis'`` (default) or ``'skewness'``.

    Returns:
        The value computed by ``scipy.stats.kurtosis`` or ``scipy.stats.skew``
        on the column after subtracting its mean and dividing by its
        standard deviation.

    Raises:
        ValueError: If the column is missing, not numeric, or ``method`` is
            not one of the two supported names.
    """
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in DataFrame")

    values = df[col]
    if not pd.api.types.is_numeric_dtype(values):
        raise ValueError(f"Column '{col}' must be numeric")

    # Standardise the column before measuring its shape.
    standardized = (values - values.mean()) / values.std()

    if method not in ('kurtosis', 'skewness'):
        raise ValueError("Invalid method. Choose 'kurtosis' or 'skewness'")

    return kurtosis(standardized) if method == 'kurtosis' else skew(standardized)

def aggregate_scaled_numerics(df, cat_col, num_col, scale_method='min_max'):
    """Scale a numeric column and summarise raw and scaled values per category.

    A ``'<num_col>_scaled'`` column is added to ``df`` itself as a side effect.

    Args:
        df: Input DataFrame.
        cat_col: Name of a categorical-dtype column to group by.
        num_col: Name of a numeric column to scale.
        scale_method: ``'min_max'`` (MinMaxScaler) or ``'z_score'``
            (StandardScaler).

    Returns:
        DataFrame with one row per category and the columns
        ``original_mean``, ``original_std``, ``scaled_mean``, ``scaled_std``.

    Raises:
        ValueError: If a column is missing, ``cat_col`` is not categorical,
            ``num_col`` is not numeric, or ``scale_method`` is unknown.
    """
    if cat_col not in df.columns or num_col not in df.columns:
        raise ValueError(f"Columns '{cat_col}' and/or '{num_col}' not found in DataFrame")
    # isinstance on the dtype replaces the deprecated is_categorical_dtype helper.
    if not isinstance(df[cat_col].dtype, pd.CategoricalDtype):
        raise ValueError(f"Column '{cat_col}' must be categorical")
    if not pd.api.types.is_numeric_dtype(df[num_col]):
        raise ValueError(f"Column '{num_col}' must be numeric")

    # Step 1: Apply scaling
    if scale_method == 'min_max':
        scaler = MinMaxScaler()
    elif scale_method == 'z_score':
        scaler = StandardScaler()
    else:
        raise ValueError("Invalid scale method. Choose 'min_max' or 'z_score'")

    scaled_col = f'{num_col}_scaled'
    df[scaled_col] = scaler.fit_transform(df[[num_col]])

    # Step 2: Mean and standard deviation of raw and scaled values per category.
    # observed=False is spelled out so unused categories are reported the same
    # way whatever the library default is.
    aggregated_stats = df.groupby(cat_col, observed=False).agg(
        original_mean=(num_col, 'mean'),
        original_std=(num_col, 'std'),
        scaled_mean=(scaled_col, 'mean'),
        scaled_std=(scaled_col, 'std')
    )

    return aggregated_stats.reset_index()

def sales_enrichment(sales_df, cust_df, prod_df):
    """Enrich sales rows with customer/product attributes and total sales per key.

    Args:
        sales_df: Sales rows; must contain ``customer_id`` and ``product_id``.
            The totals are taken from its ``sales_amount`` column.
        cust_df: Customer attributes, joined on ``customer_id``.
        prod_df: Product attributes, joined on ``product_id``.

    Returns:
        Tuple ``(enriched_sales, customer_sales_summary, product_sales_summary)``.
        The summaries hold the summed ``sales_amount`` per customer and per
        product.

    Raises:
        ValueError: If ``sales_df`` lacks ``customer_id`` or ``product_id``.
    """
    if 'customer_id' not in sales_df.columns or 'product_id' not in sales_df.columns:
        raise ValueError("sales_df must contain 'customer_id' and 'product_id' columns")

    # Totals come from the un-merged sales rows: a left join repeats a sale once
    # per matching lookup row, which would inflate the sums if a lookup table
    # contains duplicate keys.
    customer_sales_summary = sales_df.groupby('customer_id')['sales_amount'].sum().reset_index()
    product_sales_summary = sales_df.groupby('product_id')['sales_amount'].sum().reset_index()

    # merge() returns a new frame, so the caller's sales_df is not modified.
    enriched = sales_df.merge(cust_df, on='customer_id', how='left')
    enriched = enriched.merge(prod_df, on='product_id', how='left')

    return enriched, customer_sales_summary, product_sales_summary

def handle_large_values(df, col, threshold, method):
    """Cap or remove values of a numeric column that exceed a threshold.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        col: Name of a numeric column.
        threshold: Non-negative upper limit.
        method: ``'clip'`` to cap values at ``threshold``; ``'drop'`` to keep
            only rows whose value is ``<= threshold``.

    Returns:
        A new DataFrame with the chosen treatment applied.

    Raises:
        ValueError: If the column is missing or non-numeric, the threshold is
            negative, or ``method`` is not ``'clip'``/``'drop'``.
    """
    column_is_usable = col in df.columns and pd.api.types.is_numeric_dtype(df[col])
    if not column_is_usable:
        raise ValueError(f"Column '{col}' not found or is not numeric in DataFrame")
    if threshold < 0:
        raise ValueError("Threshold must be a non-negative number")
    if method not in ('clip', 'drop'):
        raise ValueError("Method must be either 'clip' or 'drop'")

    result = df.copy()

    if method == 'drop':
        within_limit = result[col] <= threshold
        return result[within_limit]

    result[col] = result[col].clip(upper=threshold)
    return result

def kmeans_clustering(df, numerical_cols, k, random_state=None):
    """Cluster rows with KMeans and return the labelled frame plus centroids.

    A ``'cluster'`` column is written onto ``df`` itself, which is also
    returned.

    Args:
        df: Input DataFrame.
        numerical_cols: List of column names used as clustering features.
        k: Number of clusters.
        random_state: Optional seed forwarded to ``KMeans`` so that results
            can be reproduced. ``None`` (the default) leaves the estimator
            unseeded.

    Returns:
        Tuple ``(df, centroids)`` where ``centroids`` has one row per cluster
        and one column per entry of ``numerical_cols``.

    Raises:
        ValueError: If any of ``numerical_cols`` is missing from ``df``.
    """
    for col in numerical_cols:
        if col not in df.columns:
            raise ValueError(f"Numerical column {col} is not in the DataFrame")

    kmeans = KMeans(n_clusters=k, random_state=random_state)
    df['cluster'] = kmeans.fit_predict(df[numerical_cols])

    centroids = pd.DataFrame(kmeans.cluster_centers_, columns=numerical_cols)

    return df, centroids

def rolling_mean_normalize(df, num_col, window=5):
    """Add a min-max normalised rolling mean of a numeric column.

    The ``'<num_col>_rolling_mean'`` column is written onto ``df`` itself,
    which is also returned.

    Args:
        df: Input DataFrame.
        num_col: Name of a numeric column.
        window: Rolling window length. Defaults to 5.

    Returns:
        The same DataFrame with the normalised rolling-mean column added.

    Raises:
        ValueError: If the column is missing or not numeric.
    """
    if num_col not in df.columns:
        raise ValueError(f"Column '{num_col}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[num_col]):
        raise ValueError(f"Column '{num_col}' must be numeric")

    rolling_mean_col = f"{num_col}_rolling_mean"

    # The leading rows have no full window; back-fill them from the first
    # complete window. .bfill() replaces the deprecated fillna(method='bfill').
    rolling_mean = df[num_col].rolling(window=window).mean().bfill()

    min_val = rolling_mean.min()
    max_val = rolling_mean.max()

    df[rolling_mean_col] = (rolling_mean - min_val) / (max_val - min_val)

    return df

def grouped_timeseries_analysis(df, group_col, time_col, value_col, ewm_span=10, z_threshold=3):
    """Add per-group cumulative sum and EWMA, then drop per-group z-score outliers.

    Args:
        df: Input DataFrame (left unmodified; sorting produces a new frame).
        group_col: Column identifying each series.
        time_col: Column used to order rows inside each group.
        value_col: Numeric column to analyse.
        ewm_span: Span of the exponentially weighted mean. Defaults to 10.
        z_threshold: Rows whose absolute per-group z-score exceeds this value
            are removed. Defaults to 3.

    Returns:
        Sorted DataFrame with ``cumulative_sum`` and ``ewm`` columns, without
        the outlier rows.
    """
    # Sort the dataframe by group_col and time_col
    df = df.sort_values(by=[group_col, time_col])

    # Compute the cumulative sum of value_col for each group
    df['cumulative_sum'] = df.groupby(group_col)[value_col].cumsum()

    # Calculate the exponentially weighted moving average for value_col within each group
    df['ewm'] = df.groupby(group_col)[value_col].transform(
        lambda x: x.ewm(span=ewm_span, adjust=False).mean()
    )

    # Compute Z-scores and remove outliers. A NaN z-score (e.g. a group with a
    # single row or no variation) is not evidence of an outlier, so those rows
    # are kept instead of being discarded by the comparison.
    z_scores = df.groupby(group_col)[value_col].transform(lambda x: zscore(x))
    keep = z_scores.isna() | (z_scores.abs() <= z_threshold)

    return df[keep]

def pivot_and_fill_na(df, pivot_index, pivot_columns, pivot_values, filler=0):
    """Pivot ``df`` taking the first value per cell and fill the empty cells.

    Args:
        df: Input DataFrame.
        pivot_index: Column name or list of column names for the row index.
        pivot_columns: Column name or list of column names to spread into
            columns.
        pivot_values: Name of the column providing the cell values.
        filler: Value used for cells that have no data. Defaults to 0.

    Returns:
        The pivoted DataFrame with the index restored as regular columns.

    Raises:
        ValueError: If any referenced column is missing from ``df``.
    """
    # Accept a single column name as well as a list; concatenating a string
    # with a list would otherwise raise a TypeError during validation.
    index_cols = [pivot_index] if isinstance(pivot_index, str) else list(pivot_index)
    column_cols = [pivot_columns] if isinstance(pivot_columns, str) else list(pivot_columns)

    for col in index_cols + column_cols + [pivot_values]:
        if col not in df.columns:
            raise ValueError(f"The column '{col}' specified is not present in the dataframe")

    # The caller's arguments are forwarded unchanged so list inputs keep
    # producing exactly the layout they did before.
    pivot_table = df.pivot_table(index=pivot_index, columns=pivot_columns, values=pivot_values, aggfunc='first')
    pivot_table = pivot_table.fillna(filler)

    return pivot_table.reset_index()
```

File: binner.py:

```Python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def rank_within_groups(df, group_by_cols, rank_col):
    """Rank the values of ``rank_col`` inside each group.

    The ``'rank'`` column is written onto ``df`` itself, which is also
    returned. Ties are broken by order of appearance (``method='first'``).

    Args:
        df: Input DataFrame.
        group_by_cols: List of column names defining the groups.
        rank_col: Name of the column to rank.

    Returns:
        The same DataFrame with a ``'rank'`` column added.

    Raises:
        ValueError: If any referenced column is missing.
    """
    # Every grouping column and the ranked column must exist.
    required = group_by_cols + [rank_col]
    absent = [name for name in required if name not in df.columns]
    if absent:
        raise ValueError(f"These columns are not in the dataframe: {absent}")

    within_group_rank = df.groupby(group_by_cols)[rank_col].rank(method='first')
    df['rank'] = within_group_rank

    return df

def scale_columns(df, columns_to_scale, scale_type):
    """Return a copy of ``df`` with the selected columns rescaled.

    Args:
        df: Input DataFrame (left unmodified).
        columns_to_scale: List of column names to rescale.
        scale_type: ``'minmax'`` (MinMaxScaler) or ``'standard'``
            (StandardScaler).

    Returns:
        A copy of ``df`` whose ``columns_to_scale`` hold the scaled values.

    Raises:
        ValueError: If ``scale_type`` is not one of the two supported names.
    """
    scaled = df.copy()

    if scale_type not in ('minmax', 'standard'):
        raise ValueError("Scale type must be either 'minmax' or 'standard'")

    scaler = MinMaxScaler() if scale_type == 'minmax' else StandardScaler()

    # The scaler is fitted on the untouched input, the result lands in the copy.
    scaled[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

    return scaled

def create_frequency_table(df, categorical_columns):
    """Build a stacked frequency/proportion table for categorical columns.

    Args:
        df: Input DataFrame.
        categorical_columns: List of categorical-dtype column names.

    Returns:
        DataFrame in which each input column contributes one block of rows:
        the category value (in a column named after the source column), its
        ``frequency`` and its ``proportion`` of all rows. An empty DataFrame
        is returned when ``categorical_columns`` is empty.

    Raises:
        ValueError: If a column is missing or is not of categorical dtype.
    """
    missing_cols = [col for col in categorical_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Columns {missing_cols} not found in DataFrame")
    # isinstance on the dtype replaces the deprecated is_categorical_dtype helper.
    non_categorical_cols = [
        col for col in categorical_columns
        if not isinstance(df[col].dtype, pd.CategoricalDtype)
    ]
    if non_categorical_cols:
        raise ValueError(f"Columns {non_categorical_cols} must be of categorical type")

    total_rows = len(df)
    tables = []
    for col in categorical_columns:
        freq = df[col].value_counts().reset_index()
        freq.columns = [col, 'frequency']
        freq['proportion'] = freq['frequency'] / total_rows
        tables.append(freq)

    if not tables:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(tables, axis=0).reset_index(drop=True)

def category_combination_stats(df, categories, agg_col):
    """Return sum, mean and standard deviation of ``agg_col`` per category combination.

    Args:
        df: Input DataFrame.
        categories: Column name(s) to group by.
        agg_col: Name of the column to summarise.

    Returns:
        DataFrame with the grouping columns followed by ``sum``, ``mean`` and
        ``std`` columns.
    """
    statistics = ['sum', 'mean', 'std']
    per_combination = df.groupby(categories)[agg_col]
    return per_combination.agg(statistics).reset_index()

def impute_and_calculate_range(df, group_col, cols_to_impute):
    """Fill gaps inside each group and add a per-group value range column.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        group_col: Column name(s) defining the groups.
        cols_to_impute: List of column names to fill and measure.

    Returns:
        A copy of ``df`` in which every column of ``cols_to_impute`` has been
        forward- then backward-filled within its group, plus a
        ``'<col>_range'`` column holding the group's max minus min.
    """
    df = df.copy()

    for col in cols_to_impute:
        # Both fills run group-wise. Chaining .bfill() directly onto the
        # forward-filled Series would fill across group boundaries and pull
        # values in from neighbouring groups.
        df[col] = df.groupby(group_col)[col].ffill()
        df[col] = df.groupby(group_col)[col].bfill()

        grouped = df.groupby(group_col)[col]
        df[f'{col}_range'] = grouped.transform('max') - grouped.transform('min')

    return df

def compute_ewm_difference(df, col1, col2, window_size):
    """Add the difference of two columns and its exponentially weighted mean.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        col1: Numeric column to subtract from.
        col2: Numeric column to subtract.
        window_size: Span of the exponentially weighted mean.

    Returns:
        A copy of ``df`` with ``'diff'`` (``col1 - col2``) and ``'ewm_diff'``
        columns; NaN entries of ``'ewm_diff'`` are replaced by its mean.

    Raises:
        ValueError: If a column is missing or not numeric.
    """
    # Step 1: Validate columns
    for col in (col1, col2):
        if col not in df.columns:
            raise ValueError(f"Column '{col}' must be present in the DataFrame.")
        if not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Column '{col}' must be numeric.")

    df = df.copy()

    # Step 2: Compute difference
    df['diff'] = df[col1] - df[col2]

    # Step 3: Compute exponentially weighted moving average
    ewm_diff = df['diff'].ewm(span=window_size, adjust=False).mean()

    # Step 4: Replace NaN values. The filled Series is assigned back instead
    # of calling fillna(inplace=True) on a column selection, which does not
    # update the frame under Copy-on-Write.
    df['ewm_diff'] = ewm_diff.fillna(ewm_diff.mean())

    return df

def validate_foreign_keys(df, primary_key, fk_dictionary):
    """Report and remove rows whose foreign keys have no match in the referenced table.

    Args:
        df: Input DataFrame.
        primary_key: Name of the key column; it must exist in ``df`` and is
            also the column looked up in every referenced table.
        fk_dictionary: Mapping of foreign-key column name in ``df`` to the
            referenced DataFrame.

    Returns:
        ``df`` restricted to rows whose every foreign key value appears in the
        referenced table's ``primary_key`` column.

    Raises:
        ValueError: If ``primary_key`` or a foreign-key column is missing
            from ``df``.
    """
    # Step 1: The primary key and every foreign key column must exist.
    if primary_key not in df.columns:
        raise ValueError(f"Primary key column {primary_key} does not exist in DataFrame")

    absent = next((name for name in fk_dictionary if name not in df.columns), None)
    if absent is not None:
        raise ValueError(f"Foreign key column {absent} does not exist in DataFrame")

    # Step 2: Report each violated constraint, judged on the unfiltered input.
    for fk_col, fk_table in fk_dictionary.items():
        all_matched = df[fk_col].isin(fk_table[primary_key]).all()
        if not all_matched:
            print(f"Foreign key constraint violated on column {fk_col}")

    # Step 3: Narrow the frame one constraint at a time.
    remaining = df
    for fk_col, fk_table in fk_dictionary.items():
        remaining = remaining[remaining[fk_col].isin(fk_table[primary_key])]

    return remaining

def add_normalized_interaction_term(df, col1, col2, new_col_name):
    """Add the min-max scaled product of two numeric columns.

    The new column is written onto ``df`` itself, which is also returned.
    The correlation between the two columns is printed for context.

    Args:
        df: Input DataFrame.
        col1: First numeric column.
        col2: Second numeric column.
        new_col_name: Name of the column to create.

    Returns:
        The same DataFrame with ``new_col_name`` added.

    Raises:
        ValueError: If a column is missing or not numeric.
    """
    # Step 1: Both inputs must exist and be numeric.
    for name in (col1, col2):
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Column '{name}' must be numeric")

    # Step 2: Informational only; the value does not feed into later steps.
    pairwise = df[[col1, col2]].corr()
    correlation = pairwise.iloc[0, 1]
    print(f"Correlation between {col1} and {col2}: {correlation}")

    # Step 3: Interaction term is the element-wise product.
    df[new_col_name] = df[col1] * df[col2]

    # Step 4: Rescale the product with a min-max scaler.
    df[new_col_name] = MinMaxScaler().fit_transform(df[[new_col_name]])

    return df

def bin_numeric_values(df, target_column, bucket_count):
    """Assign each value of a numeric column to one of several equal-width bins.

    The ``'<target_column>_bin'`` column is written onto ``df`` itself, which
    is also returned.

    Args:
        df: Input DataFrame.
        target_column: Name of a numeric column.
        bucket_count: Number of bins spanning the column's min to max.

    Returns:
        The same DataFrame with the bin column added; labels run from
        ``bin_1`` to ``bin_<bucket_count>``.

    Raises:
        ValueError: If the column is missing or not numeric.
    """
    if target_column not in df.columns:
        raise ValueError(f"Column {target_column} is not in the dataframe")

    values = df[target_column]
    if not pd.api.types.is_numeric_dtype(values):
        raise ValueError(f"Column {target_column} must be numeric")

    # Equal-width edges from the smallest to the largest observed value.
    edges = np.linspace(values.min(), values.max(), bucket_count + 1)
    labels = [f'bin_{position + 1}' for position in range(bucket_count)]

    binned = pd.cut(values, bins=edges, labels=labels, include_lowest=True)
    df[f'{target_column}_bin'] = binned

    return df

def sliding_window_summarization(df, window_size, step_size):
    """Summarise numeric columns over fixed-size row windows.

    Args:
        df: Input DataFrame.
        window_size: Number of rows per window; must be positive and no
            larger than the number of rows.
        step_size: Number of rows between window starts; must be positive.

    Returns:
        DataFrame with one row per window and, for every numeric column, a
        ``'<col>_mean'`` column followed (after all means) by a
        ``'<col>_std'`` column.

    Raises:
        ValueError: If a size is not positive or ``df`` is shorter than one
            window.
    """
    if window_size <= 0:
        raise ValueError("Window size must be greater than 0.")
    if step_size <= 0:
        raise ValueError("Step size must be greater than 0.")

    row_count = df.shape[0]
    if row_count < window_size:
        raise ValueError("DataFrame has fewer rows than the window size.")

    # Only complete windows are used.
    starts = range(0, row_count - window_size + 1, step_size)
    numeric_columns = df.select_dtypes(include='number').columns

    summary = {}
    for statistic in ('mean', 'std'):
        for col in numeric_columns:
            per_window = []
            for start in starts:
                chunk = df.iloc[start:start + window_size][col]
                per_window.append(getattr(chunk, statistic)())
            summary[f"{col}_{statistic}"] = per_window

    return pd.DataFrame(summary)

def mean_by_top_categories(df, cat_col, num_col, top_n):
    """Return the mean of ``num_col`` for the ``top_n`` most frequent categories.

    Args:
        df: Input DataFrame.
        cat_col: Name of the category column.
        num_col: Name of a numeric column.
        top_n: Number of most frequent categories to keep.

    Returns:
        DataFrame with ``cat_col`` and ``'<num_col>_mean'``, one row per
        retained category.

    Raises:
        ValueError: If ``cat_col`` is missing or ``num_col`` is not numeric.
    """
    # Step 1: Confirm that the categorical column exists
    if cat_col not in df.columns:
        raise ValueError(f"Column '{cat_col}' not found in DataFrame")

    # Step 2: Ensure the numeric column is numeric
    if not pd.api.types.is_numeric_dtype(df[num_col]):
        raise ValueError(f"Column '{num_col}' must be numeric")

    # Step 3: Identify top N most frequent categories
    top_categories = df[cat_col].value_counts().nlargest(top_n).index

    # Step 4: Calculate the mean for each of the top N categories.
    # observed=True keeps a categorical-dtype column from re-introducing the
    # filtered-out categories as rows with a NaN mean.
    in_top = df[cat_col].isin(top_categories)
    means = df[in_top].groupby(cat_col, observed=True)[num_col].mean().reset_index()

    return means.rename(columns={num_col: f'{num_col}_mean'})
```

File: range_computations.py:

```Python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def fill_and_encode(df, specific_cols):
    """Fill gaps in the given columns and one-hot encode the non-numeric ones.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        specific_cols: List of column names to process.

    Returns:
        A copy of ``df`` in which ``specific_cols`` were forward- then
        backward-filled, and every categorical, object or string column among
        them was replaced by dummy columns (first level dropped).

    Raises:
        ValueError: If any of ``specific_cols`` is missing.
    """
    # Validate that all specific_cols exist in the dataframe
    missing_columns = [col for col in specific_cols if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Columns {missing_columns} not found in DataFrame")

    df = df.copy()

    # Handle missing values by forward fill followed by backward fill
    df[specific_cols] = df[specific_cols].ffill().bfill()

    # Handle categorical columns with one-hot encoding
    for col in specific_cols:
        dtype = df[col].dtype
        # isinstance replaces the deprecated is_categorical_dtype helper; the
        # string check also covers text held in a dedicated string dtype
        # rather than in an object column.
        needs_encoding = (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if needs_encoding:
            df = pd.get_dummies(df, columns=[col], drop_first=True)

    return df

def normalize_product(df, col_a, col_b):
    """Add the standardised element-wise product of two numeric columns.

    The ``'normalized_product'`` column is written onto ``df`` itself, which
    is also returned.

    Args:
        df: Input DataFrame.
        col_a: First numeric column.
        col_b: Second numeric column.

    Returns:
        The same DataFrame with ``'normalized_product'`` added: the product
        minus its mean, divided by its standard deviation.

    Raises:
        ValueError: If a column is missing or either column is not numeric.
    """
    # Step 1: Existence is checked column by column, then types together.
    for name in (col_a, col_b):
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")

    both_numeric = pd.api.types.is_numeric_dtype(df[col_a]) and pd.api.types.is_numeric_dtype(df[col_b])
    if not both_numeric:
        raise ValueError(f"Both columns '{col_a}' and '{col_b}' must be numeric")

    # Step 2: Element-wise product.
    product = df[col_a] * df[col_b]

    # Step 3: Centre and scale the product.
    centered = product - product.mean()
    df["normalized_product"] = centered / product.std()

    return df

def flag_high_percentile(df, numerical_columns, percentile):
    """Flag values that lie above a given percentile of their column.

    Args:
        df: Input DataFrame (left unmodified).
        numerical_columns: List of column names to evaluate.
        percentile: Percentile between 0 and 100 (inclusive).

    Returns:
        A copy of ``df`` with one boolean ``'<col>_above_<percentile>'``
        column per evaluated column.

    Raises:
        ValueError: If ``percentile`` is outside the range 0 to 100.
    """
    if not 0 <= percentile <= 100:
        raise ValueError("Percentile must be between 0 and 100")

    fraction = percentile / 100.0
    flagged = df.copy()

    # Cut-offs are taken from the untouched input, flags go into the copy.
    for name in numerical_columns:
        cutoff = df[name].quantile(fraction)
        flagged[f'{name}_above_{percentile}'] = df[name].gt(cutoff)

    return flagged

def scale_within_categories(df, cat_col, num_col, scale_type):
    """Scale a numeric column separately inside each category.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        cat_col: Column defining the groups.
        num_col: Numeric column to scale.
        scale_type: ``'min-max'`` (MinMaxScaler) or ``'standard'``
            (StandardScaler).

    Returns:
        A copy of ``df`` with a ``'<num_col>_<scale_type>'`` column holding
        the per-group scaled values.

    Raises:
        ValueError: If a column is missing, ``num_col`` is not numeric, or
            ``scale_type`` is unknown.
    """
    if cat_col not in df.columns or num_col not in df.columns:
        raise ValueError(f"Columns '{cat_col}' or '{num_col}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[num_col]):
        raise ValueError(f"Column '{num_col}' must be numeric")

    result = df.copy()
    target_name = f"{num_col}_{scale_type}"

    if scale_type not in ('min-max', 'standard'):
        raise ValueError(f"Unknown scaling type: {scale_type}")
    scaler = MinMaxScaler() if scale_type == 'min-max' else StandardScaler()

    def _scale_group(values):
        # The scaler is re-fitted on every group, so groups are scaled independently.
        return scaler.fit_transform(values.values.reshape(-1, 1)).flatten()

    result[target_name] = result.groupby(cat_col)[num_col].transform(_scale_group)

    return result

def temporal_averages(df, date_col, target_col):
    """Add calendar features and the mean of ``target_col`` per calendar value.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        date_col: Column convertible to datetime.
        target_col: Column to average.

    Returns:
        A copy of ``df`` with ``date_col`` converted to datetime, the columns
        ``year``, ``month``, ``week``, ``day_of_week`` and ``quarter``, and a
        ``'<target_col>_avg_by_<feature>'`` column for each of them.
    """
    frame = df.copy()
    frame[date_col] = pd.to_datetime(frame[date_col])

    # Feature name -> how to read it from a datetime Series (insertion order
    # defines the column order of the result).
    extractors = {
        'year': lambda stamps: stamps.dt.year,
        'month': lambda stamps: stamps.dt.month,
        'week': lambda stamps: stamps.dt.isocalendar().week,
        'day_of_week': lambda stamps: stamps.dt.dayofweek,
        'quarter': lambda stamps: stamps.dt.quarter,
    }

    for feature, extract in extractors.items():
        frame[feature] = extract(frame[date_col])

    for feature in extractors:
        frame[f'{target_col}_avg_by_{feature}'] = frame.groupby(feature)[target_col].transform('mean')

    return frame

def create_pivot_table(df, cols_to_pivot, agg_func):
    """Pivot ``df`` on categorical columns, including margin totals.

    Args:
        df: Input DataFrame.
        cols_to_pivot: List of categorical-dtype column names. The first one
            becomes the row index, the remaining ones the pivoted columns.
        agg_func: Aggregation function forwarded to ``pivot_table``.

    Returns:
        The pivot table (built with ``margins=True``) with its index restored
        as a regular column.

    Raises:
        ValueError: If a column is missing or is not of categorical dtype.
    """
    # Validate presence of cols_to_pivot and ensure they are categorical.
    # isinstance on the dtype replaces the deprecated is_categorical_dtype helper.
    for col in cols_to_pivot:
        if col not in df.columns or not isinstance(df[col].dtype, pd.CategoricalDtype):
            raise ValueError(f"Column '{col}' not found or is not categorical in DataFrame")

    # Create pivot table
    pivot_df = df.pivot_table(index=cols_to_pivot[0], columns=cols_to_pivot[1:], aggfunc=agg_func, margins=True)

    # Convert pivot table to DataFrame
    pivot_df = pivot_df.reset_index()

    return pivot_df

def impute_with_rolling_mean(df, fill_columns, rolling_window, min_periods):
    """Fill gaps with a rolling mean and add rolling-mean and rate-of-change columns.

    All new and filled columns are written onto ``df`` itself, which is also
    returned.

    Args:
        df: Input DataFrame.
        fill_columns: List of column names to fill.
        rolling_window: Rolling window length.
        min_periods: Minimum number of observations required per window.

    Returns:
        The same DataFrame, with missing values of ``fill_columns`` replaced
        by the rolling mean at that position, plus ``'<col>_rolling_mean'``
        and ``'<col>_rate_of_change'`` (first difference of the rolling mean)
        for each column.
    """
    # The rolling mean is computed once, on the data as it was before filling.
    windowed = df[fill_columns].rolling(window=rolling_window, min_periods=min_periods)
    smoothed = windowed.mean()

    df[fill_columns] = df[fill_columns].fillna(smoothed)

    for name in fill_columns:
        series = smoothed[name]
        df[f'{name}_rolling_mean'] = series
        df[f'{name}_rate_of_change'] = series.diff()

    return df

def compute_column_ranges(df, range_cols):
    """Add a column holding the average value range of the given columns.

    The ``'average_range'`` column is written onto ``df`` itself, which is
    also returned.

    Args:
        df: Input DataFrame.
        range_cols: List of numeric column names.

    Returns:
        The same DataFrame with a constant ``'average_range'`` column: the
        mean of (max - min) over the distinct entries of ``range_cols``.

    Raises:
        ValueError: If a column is missing or not numeric.
    """
    # Step 1: Every column must exist and be numeric.
    for name in range_cols:
        usable = name in df.columns and pd.api.types.is_numeric_dtype(df[name])
        if not usable:
            raise ValueError(f"Column {name} is not present or not numerical")

    # Step 2: Range per column, keyed by name so repeated names count once.
    spans = {name: df[name].max() - df[name].min() for name in range_cols}

    # Steps 3 and 4: Average the ranges and broadcast the result to every row.
    df['average_range'] = sum(spans.values()) / len(spans)

    return df

def detect_and_remove_outliers(df, grouping_col, value_col):
    """Remove rows deviating from their group median by more than 3 * IQR.

    The input DataFrame is not modified.

    Args:
        df: Input DataFrame.
        grouping_col: Column defining the groups.
        value_col: Numeric column to screen.

    Returns:
        Tuple ``(filtered_df, summary)``. ``filtered_df`` holds the rows that
        were not flagged; ``summary`` has one row per group with at least one
        flagged entry and the columns ``grouping_col`` and ``flag_count``.
    """
    grouped = df.groupby(grouping_col)[value_col]

    # Step 1: Median and IQR per group, broadcast back to every row. transform
    # keeps the original index, so each row is compared with the statistics of
    # its own group rather than with a re-indexed merge result or with the
    # first group's IQR.
    group_median = grouped.transform('median')
    group_iqr = grouped.transform(lambda values: values.quantile(0.75) - values.quantile(0.25))

    # Step 2: Flag entries deviating by more than 3*IQR
    flagged = (df[value_col] - group_median).abs() > 3 * group_iqr

    # Step 3: Filter out flagged entries
    filtered_df = df[~flagged]

    # Step 4: Summary of flagged entries per group
    summary = df[flagged].groupby(grouping_col).size().reset_index(name='flag_count')

    return filtered_df, summary

def filter_high_frequency_binary_columns(df, cat_columns, threshold):
    """One-hot encode categorical columns and keep the mode indicators that are rare enough.

    Args:
        df: Input DataFrame.
        cat_columns: List of categorical-dtype column names.
        threshold: Maximum share of rows (0-1) the mode indicator may cover
            to be kept.

    Returns:
        DataFrame containing, for each input column, the dummy column of its
        most frequent value, provided that column's mean does not exceed
        ``threshold``.

    Raises:
        ValueError: If a column is missing or is not of categorical dtype.
    """
    # Step 1: Ensure all categorical columns are present in the dataframe.
    # isinstance on the dtype replaces the deprecated is_categorical_dtype helper.
    for col in cat_columns:
        if col not in df.columns or not isinstance(df[col].dtype, pd.CategoricalDtype):
            raise ValueError(f"Column {col} is not present or not categorical")

    # Step 2: Calculate mode value for each categorical column
    mode_values = {col: df[col].mode()[0] for col in cat_columns}

    # Step 3: Convert categorical columns to binary columns using one-hot encoding
    df_one_hot = pd.get_dummies(df, columns=cat_columns)

    # Step 4: Keep the mode indicator of each column when its frequency stays
    # within the threshold. The name is matched exactly: a prefix match would
    # also select other categories whose label merely starts with the mode's
    # label (mode 'a' vs. category 'ab').
    columns_to_keep = []
    for col, mode_val in mode_values.items():
        mode_column = f"{col}_{mode_val}"
        if mode_column in df_one_hot.columns and df_one_hot[mode_column].mean() <= threshold:
            columns_to_keep.append(mode_column)

    filtered_df = df_one_hot[columns_to_keep]

    return filtered_df

def filter_non_numeric_columns_by_uniqueness(df, non_numeric_columns, threshold=10):
    """Select the non-numeric columns that have at least ``threshold`` distinct values.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        non_numeric_columns: Candidate column names; names missing from ``df``
            and numeric columns are skipped.
        threshold: Minimum number of distinct values, a positive integer.
            Defaults to 10.

    Returns:
        DataFrame holding only the qualifying columns, in candidate order.

    Raises:
        ValueError: If ``threshold`` is not a positive integer.
    """
    if not isinstance(threshold, int) or threshold <= 0:
        raise ValueError("Threshold must be a positive integer.")

    frame = df.copy()

    selected = []
    for name in non_numeric_columns:
        if name not in frame.columns:
            continue
        if pd.api.types.is_numeric_dtype(frame[name]):
            continue
        if frame[name].nunique() >= threshold:
            selected.append(name)

    return frame[selected]
```

File: normalization.py:

```Python
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.cluster import KMeans

def conditional_encoding(df, category_col, target_col):
    """Frequency-encode a category column, scale the target per code, and add dummies.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        category_col: Column to frequency-encode.
        target_col: Column standardised within each frequency code.

    Returns:
        A copy of ``df`` with a ``'<category_col>_encoded'`` column holding
        each value's occurrence count, ``target_col`` replaced by its
        StandardScaler output computed per encoded value, and one dummy
        column per encoded value (prefixed with ``category_col``).
    """
    frame = df.copy()
    encoded_name = f'{category_col}_encoded'

    # Each category is represented by how often it occurs.
    occurrences = frame[category_col].value_counts().to_dict()
    frame[encoded_name] = frame[category_col].map(occurrences)

    scaler = StandardScaler()

    def _standardize(values):
        # Re-fitted on every group, so each frequency code is scaled on its own.
        return scaler.fit_transform(values.values.reshape(-1, 1)).flatten()

    frame[target_col] = frame.groupby(encoded_name)[target_col].transform(_standardize)

    # Indicator columns for the encoded values, appended on the right.
    indicators = pd.get_dummies(frame[encoded_name], prefix=category_col)

    return pd.concat([frame, indicators], axis=1)

def impute_by_group_mean(df, threshold, group_key):
    """Fill sparse numeric columns with the mean of each row's group.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        threshold: Share of missing values (0-1); numeric columns whose
            missing share is strictly above it are imputed.
        group_key: Column name, or list of column names, defining the groups.

    Returns:
        A copy of ``df`` in which the selected columns have their missing
        values replaced by the group mean.
    """
    df = df.copy()

    group_columns = list(group_key) if isinstance(group_key, (list, tuple)) else [group_key]

    # Step 1: Identify columns with more than threshold % missing values
    missing_share = df.isnull().mean()
    columns_to_impute = missing_share[missing_share > threshold].index

    for col in columns_to_impute:
        # A mean only exists for numeric data, and the grouping columns
        # themselves cannot be imputed from their own groups.
        if col in group_columns or not pd.api.types.is_numeric_dtype(df[col]):
            continue

        # Step 2: Compute group mean
        group_mean = df.groupby(group_key)[col].transform('mean')

        # Step 3: Impute missing values using group mean. Assign the filled
        # Series back instead of calling fillna(inplace=True) on a column
        # selection, which does not update the frame under Copy-on-Write.
        df[col] = df[col].fillna(group_mean)

    return df

def generate_n_grams(df, text_col, n_grams, min_freq):
    """Count the n-grams found in a text column.

    Args:
        df: Input DataFrame.
        text_col: Name of the column holding the documents.
        n_grams: Largest n-gram length; lengths 1 through ``n_grams`` are
            extracted.
        min_freq: Passed to ``CountVectorizer`` as ``min_df``.

    Returns:
        DataFrame with an ``n_gram`` column and a ``frequency`` column holding
        the total count of each n-gram over all documents.
    """
    counter = CountVectorizer(ngram_range=(1, n_grams), min_df=min_freq)
    document_term_counts = counter.fit_transform(df[text_col])

    # Column sums give the overall count per n-gram.
    totals = document_term_counts.sum(axis=0).A1
    vocabulary = counter.get_feature_names_out()

    return pd.DataFrame({'n_gram': vocabulary, 'frequency': totals})

def handle_categorical_and_numerical(df, cat_cols, num_cols):
    """One-hot encode categorical columns and replace numeric outliers by the median.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        cat_cols: List of categorical-dtype column names to encode.
        num_cols: List of numeric column names to clean.

    Returns:
        A copy of ``df`` with ``cat_cols`` replaced by dummy columns and, in
        every column of ``num_cols``, values further than three standard
        deviations from the mean replaced by the column median.

    Raises:
        ValueError: If a categorical column is missing or not of category
            dtype, or a numeric column is missing or not numeric.
    """
    frame = df.copy()

    for name in cat_cols:
        is_category = name in frame.columns and frame[name].dtype.name == 'category'
        if not is_category:
            raise ValueError(f"Column '{name}' must be categorical and exist in DataFrame")

    for name in num_cols:
        is_number = name in frame.columns and pd.api.types.is_numeric_dtype(frame[name])
        if not is_number:
            raise ValueError(f"Column '{name}' must be numeric and exist in DataFrame")

    frame = pd.get_dummies(frame, columns=cat_cols)

    for name in num_cols:
        center = frame[name].mean()
        spread = frame[name].std()
        replacement = frame[name].median()

        def _cap(value, center=center, spread=spread, replacement=replacement):
            # Statistics are bound as defaults so each column uses its own values.
            if abs(value - center) > 3 * spread:
                return replacement
            return value

        frame[name] = frame[name].apply(_cap)

    return frame

def impute_missing_values(df, impute_cols, method):
    """Impute missing values column by column with scikit-learn's SimpleImputer.

    The columns are overwritten on ``df`` itself, which is also returned.

    NOTE(review): this module defines ``impute_missing_values`` a second time
    further down; at import time the later definition replaces this one.

    Args:
        df: Input DataFrame.
        impute_cols: List of column names; each must exist and contain at
            least one missing value.
        method: ``'mean'``, ``'median'`` or ``'mode'`` for numeric columns;
            ``'most_frequent'`` for non-numeric columns.

    Returns:
        The same DataFrame with the listed columns imputed.

    Raises:
        ValueError: If a column is missing or has no gaps, ``method`` does not
            fit the column type, or gaps remain after imputation.
    """
    # Step 1: Every column must exist and actually have gaps.
    for name in impute_cols:
        has_gaps = name in df.columns and df[name].isnull().any()
        if not has_gaps:
            raise ValueError(f"Column {name} is not present or does not contain missing values")

    # Step 2: Pick the imputation strategy allowed for the column type.
    for name in impute_cols:
        if pd.api.types.is_numeric_dtype(df[name]):
            if method not in ['mean', 'median', 'mode']:
                raise ValueError("Invalid method for numeric columns. Choose from 'mean', 'median', 'mode'")
            strategy = 'most_frequent' if method == 'mode' else method
        else:
            if method != 'most_frequent':
                raise ValueError("Invalid method for categorical columns. Use 'most_frequent'")
            strategy = 'most_frequent'

        imputer = SimpleImputer(strategy=strategy)
        df[name] = imputer.fit_transform(df[[name]])

    # Step 3: Nothing may be left missing in the processed columns.
    if df[impute_cols].isnull().any().any():
        raise ValueError(f"Imputation failed, missing values remain in columns {impute_cols}")

    return df

def linear_regression_prediction(df, col1, col2):
    """Fit ``col2 ~ col1`` with a linear regression and add the fitted values.

    The ``'predicted_<col2>'`` column is written onto ``df`` itself, which is
    also returned.

    Args:
        df: Input DataFrame.
        col1: Numeric predictor column.
        col2: Numeric response column.

    Returns:
        Tuple ``(df, correlation)`` where ``correlation`` is the correlation
        between the two columns taken from ``DataFrame.corr``.

    Raises:
        ValueError: If a column is missing or not numeric.
    """
    # Existence is checked for both columns before any type check.
    for name in (col1, col2):
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")
    for name in (col1, col2):
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Column '{name}' must be numeric")

    correlation = df[[col1, col2]].corr().iloc[0, 1]

    features = df[[col1]].values.reshape(-1, 1)
    response = df[col2].values

    regressor = LinearRegression()
    regressor.fit(features, response)
    df[f'predicted_{col2}'] = regressor.predict(features)

    return df, correlation

def normalize_and_threshold_groups(df, category_col, value_col, threshold):
    """Standardise values per group and describe groups whose mean exceeds a threshold.

    The ``'normalized_value'`` column is written onto ``df`` itself, which is
    also returned.

    Args:
        df: Input DataFrame.
        category_col: Column defining the groups.
        value_col: Column to standardise within each group.
        threshold: Groups whose mean normalised value is strictly greater
            than this are included in the summary.

    Returns:
        Tuple ``(df, summary_stats)`` where ``summary_stats`` is the
        ``describe()`` output for the selected groups' mean normalised values.
    """
    def _standardize(values):
        return (values - values.mean()) / values.std()

    # Step 1: Standardise value_col inside each category.
    df['normalized_value'] = df.groupby(category_col)[value_col].transform(_standardize)

    # Step 2: Mean normalised value per category, keeping those above the threshold.
    per_group = df.groupby(category_col)['normalized_value'].mean()
    group_means = per_group.reset_index(name='mean_normalized_value')
    exceeds = group_means['mean_normalized_value'] > threshold

    # Step 3: Summary statistics of the retained groups.
    summary_stats = group_means[exceeds].describe()

    return df, summary_stats

def rolling_window_diff(df, sequence_col, window_size):
    """Add rolling statistics and the first difference of a sequence column.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        sequence_col: Name of the column to analyse.
        window_size: Rolling window length.

    Returns:
        A copy of ``df`` with ``'<col>_rolling_mean'``, ``'_rolling_std'``,
        ``'_rolling_min'`` and ``'_rolling_max'`` columns (gaps replaced by
        the mean of the respective statistic) and ``'<col>_diff'`` (first
        difference, with the leading gap set to 0).
    """
    df = df.copy()

    rolling = df[sequence_col].rolling(window=window_size)

    for statistic in ('mean', 'std', 'min', 'max'):
        values = getattr(rolling, statistic)()
        # Impute missing rolling statistics. The filled Series is assigned
        # directly instead of calling fillna(inplace=True) on a column
        # selection, which does not update the frame under Copy-on-Write.
        df[f'{sequence_col}_rolling_{statistic}'] = values.fillna(values.mean())

    # Differencing to remove trend
    df[f'{sequence_col}_diff'] = df[sequence_col].diff().fillna(0)

    return df

def impute_missing_values(df, cols_to_impute, method):
    """Fill missing values of the given columns with their mean, median or mode.

    NOTE(review): this module defines ``impute_missing_values`` earlier as
    well; this later definition is the one that remains bound to the name.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        cols_to_impute: List of column names to fill.
        method: ``'mean'``, ``'median'`` or ``'mode'``.

    Returns:
        A copy of ``df`` with the listed columns filled.

    Raises:
        ValueError: If ``method`` is not supported, or a listed column
            consists entirely of missing values.
    """
    df = df.copy()

    for col in cols_to_impute:
        if method not in ('mean', 'median', 'mode'):
            raise ValueError("Method must be either 'mean', 'median', or 'mode'")

        # Step 1: Identify gaps. Checked before the fill value is computed:
        # an all-missing column has no mode, so looking it up first would
        # fail before this clearer error could be raised.
        if df[col].isna().all():
            raise ValueError(f"Entire column '{col}' is NaN")

        if method == 'mean':
            fill_value = df[col].mean()
        elif method == 'median':
            fill_value = df[col].median()
        else:
            fill_value = df[col].mode()[0]

        # Step 2: Impute using specified method
        df[col] = df[col].fillna(fill_value)

    return df

def expand_categorical(df, categorical_cols, prefix_sep):
    """Replace categorical columns by dummy columns using a custom separator.

    Args:
        df: Input DataFrame.
        categorical_cols: List of categorical-dtype column names.
        prefix_sep: Separator placed between the column name and the category
            in the dummy column names.

    Returns:
        New DataFrame with the listed columns one-hot encoded.

    Raises:
        ValueError: If a column is missing or is not of categorical dtype.
    """
    # Step 1: Validate columns. isinstance on the dtype replaces the
    # deprecated is_categorical_dtype helper.
    for col in categorical_cols:
        if col not in df.columns or not isinstance(df[col].dtype, pd.CategoricalDtype):
            raise ValueError(f"Column '{col}' either not found or not categorical")

    # Step 2: Create dummies with specified prefix separator
    df = pd.get_dummies(df, columns=categorical_cols, prefix_sep=prefix_sep)

    return df

def add_cluster_labels(df, num_clusters, clustering_columns, random_state=None):
    """Label each row with its KMeans cluster.

    The ``'cluster_label'`` column is written onto ``df`` itself, which is
    also returned.

    Args:
        df: Input DataFrame.
        num_clusters: Number of clusters; must be a positive ``int``.
        clustering_columns: List of column names used as features.
        random_state: Optional seed forwarded to ``KMeans`` so that the
            labelling can be reproduced. ``None`` (the default) leaves the
            estimator unseeded.

    Returns:
        The same DataFrame with a ``'cluster_label'`` column added.

    Raises:
        ValueError: If ``num_clusters`` is not a positive integer.
    """
    if not isinstance(num_clusters, int) or num_clusters <= 0:
        raise ValueError("'num_clusters' must be a positive integer")

    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)
    cluster_labels = kmeans.fit_predict(df[clustering_columns])

    df['cluster_label'] = cluster_labels

    return df
```

File: target_binning.py:

```Python
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
import numpy as np

def generate_polynomial_and_dummy_features(df, cat_cols, target_col):
    """Replace categorical columns by dummies and the target by scaled polynomial terms.

    Args:
        df: Input DataFrame (left unmodified; a copy is used).
        cat_cols: List of column names to one-hot encode (first level
            dropped).
        target_col: Column expanded into degree-2 polynomial features.

    Returns:
        A copy of ``df`` in which ``cat_cols`` are replaced by dummy columns
        and ``target_col`` by standardised ``'<target_col>_poly_<i>'``
        columns.
    """
    df = df.copy()

    dummy_df = pd.get_dummies(df[cat_cols], drop_first=True)
    df = pd.concat([df.drop(columns=cat_cols), dummy_df], axis=1)

    poly = PolynomialFeatures(degree=2)
    poly_features = poly.fit_transform(df[[target_col]])
    poly_columns = [f"{target_col}_poly_{i}" for i in range(poly_features.shape[1])]

    # Reuse the frame's index: with a fresh default index the column-wise
    # concat below would align on labels and misplace (or blank out) rows
    # whenever df does not carry a plain 0..n-1 index.
    poly_df = pd.DataFrame(poly_features, columns=poly_columns, index=df.index)

    df = pd.concat([df, poly_df], axis=1).drop(columns=[target_col])

    scaler = StandardScaler()
    df[poly_columns] = scaler.fit_transform(df[poly_columns])

    return df

def feature_extraction_from_categories(df, category_col, value_col, new_col_prefix):
    """Flag rows whose category ranks in the top 10 by sum, mean or median.

    The three flag columns are written onto ``df`` itself, which is also
    returned.

    Args:
        df: Input DataFrame.
        category_col: Column defining the categories.
        value_col: Column whose per-category sum, mean and median are ranked.
        new_col_prefix: Prefix for the ``_top10_sum``, ``_top10_mean`` and
            ``_top10_median`` columns.

    Returns:
        The same DataFrame with three 0/1 integer columns added.

    Raises:
        ValueError: If either column is missing.
    """
    if category_col not in df.columns or value_col not in df.columns:
        raise ValueError(f"Columns '{category_col}' or '{value_col}' not found in DataFrame")

    per_category = df.groupby(category_col)[value_col]
    top_by_statistic = {
        'sum': per_category.sum().nlargest(10).index,
        'mean': per_category.mean().nlargest(10).index,
        'median': per_category.median().nlargest(10).index,
    }

    # Vectorised membership test instead of a Python-level lambda per row.
    for statistic, top_categories in top_by_statistic.items():
        df[f'{new_col_prefix}_top10_{statistic}'] = df[category_col].isin(top_categories).astype(int)

    return df

def normalize_numeric_columns(df, columns):
    """Z-score the given numeric columns, mapping undefined results to 0.

    Args:
        df: Input DataFrame. It is copied, so the caller's frame is untouched.
        columns: Names of the numeric columns to standardize.

    Returns:
        A copy of ``df`` whose listed columns are ``(x - mean) / std`` with
        NaN results replaced by 0.

    Raises:
        ValueError: If a column is missing or not numeric.
    """
    for col in columns:
        if col not in df.columns or not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Column: {col} is not numerical or does not exist")

    df = df.copy()

    for col in columns:
        standardized = (df[col] - df[col].mean()) / df[col].std()
        # A constant column gives 0/0 = NaN, and missing inputs stay NaN; both
        # become 0. Assigning the filled result back (instead of a chained
        # inplace fillna) keeps this working under pandas copy-on-write.
        df[col] = standardized.fillna(0)

    return df

def aggregate_by_category(df, id_column, category_column):
    """Summarize every remaining column per (id, category) pair.

    Args:
        df: Source DataFrame.
        id_column: First grouping column.
        category_column: Second grouping column.

    Returns:
        A DataFrame with one row per group, the two grouping columns, and
        ``{column}_{stat}`` columns for count, sum, mean and std.
    """
    summary_stats = ['count', 'sum', 'mean', 'std']
    summary = df.groupby([id_column, category_column]).agg(summary_stats)

    # Collapse the (column, stat) pairs into flat "column_stat" names.
    summary.columns = ['_'.join(pair).strip() for pair in summary.columns.to_flat_index()]

    return summary.reset_index()

def filter_and_summarize_datetime_columns(df, datetime_cols, operation_type, threshold):
    """Filter rows column by column on datetime frequency or range.

    Filters are applied cumulatively in the order of ``datetime_cols``.

    Args:
        df: Source DataFrame (not modified; filtered views are rebound locally).
        datetime_cols: Datetime columns to filter on.
        operation_type: ``"count"`` keeps rows whose value occurs at least
            ``threshold[col]`` times in the currently remaining rows;
            ``"range"`` keeps rows with ``start <= value <= end`` where
            ``threshold[col]`` is ``(start, end)``.
        threshold: Mapping from column name to the per-column threshold.

    Returns:
        Tuple of the filtered DataFrame and a summary DataFrame with one row
        per column (``column``, ``operation``, ``remaining_rows``).

    Raises:
        ValueError: If a column is missing or not datetime, or the operation
            type is unknown.
    """
    for col in datetime_cols:
        if col not in df.columns or not pd.api.types.is_datetime64_any_dtype(df[col]):
            raise ValueError(f"Column {col} is not present or not of datetime type")

    if operation_type not in ["count", "range"]:
        raise ValueError("Operation type must be either 'count' or 'range'")

    summary_report = []

    for col in datetime_cols:
        if operation_type == "count":
            counts = df[col].value_counts()
            frequent_values = counts.index[counts >= threshold[col]]
            # value_counts ignores NaT, so NaT rows never qualify; isin drops
            # them cleanly instead of failing on a missing key.
            df = df[df[col].isin(frequent_values)]
        else:
            start, end = threshold[col]
            df = df[(df[col] >= start) & (df[col] <= end)]

        summary_report.append(
            {"column": col, "operation": operation_type, "remaining_rows": len(df)}
        )

    return df, pd.DataFrame(summary_report)

def analyze_threshold_violations(df, threshold, columns_to_analyze):
    """Report, per column, the share of rows that fall below a threshold.

    Args:
        df: Source DataFrame.
        threshold: Value each cell is compared against with ``<``.
        columns_to_analyze: Numeric columns to inspect.

    Returns:
        A DataFrame indexed by column name with a single
        ``proportion_below_threshold`` column.

    Raises:
        ValueError: If a column is missing or not numeric.
    """
    for name in columns_to_analyze:
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Column '{name}' must be numeric")

    # The mean of a boolean mask is the fraction of True cells per column.
    below_share = df[columns_to_analyze].lt(threshold).mean()

    return pd.DataFrame(below_share, columns=['proportion_below_threshold'])

def compute_tf_idf(df, text_col, word_list):
    """Append TF-IDF scores for a fixed vocabulary to a copy of the frame.

    Args:
        df: Input DataFrame. It is copied, so the caller's frame is untouched.
        text_col: Column holding the documents to vectorize.
        word_list: Vocabulary handed to ``TfidfVectorizer``; one output column
            is produced per term.

    Returns:
        A copy of ``df`` with one TF-IDF column per vocabulary term appended.
    """
    df = df.copy()

    vectorizer = TfidfVectorizer(vocabulary=word_list)
    tf_idf_matrix = vectorizer.fit_transform(df[text_col])

    # Carry the source index over so the column-wise concat lines rows up even
    # when df does not have a default RangeIndex.
    tf_idf_df = pd.DataFrame(
        tf_idf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index,
    )

    return pd.concat([df, tf_idf_df], axis=1)

def feature_interaction_with_mappings(df, col_mappings, target_col):
    """Map columns through lookup tables and fit Ridge on their pairwise products.

    Each column named in ``col_mappings`` is replaced in ``df`` by its mapped
    values, a ``{left}_x_{right}`` product column is added for every pair of
    mapped columns, and a default ``Ridge`` model is fitted on those product
    columns against ``target_col``. The input frame is modified in place.

    Args:
        df: DataFrame to transform in place.
        col_mappings: Mapping from column name to the value mapping applied
            with ``Series.map``.
        target_col: Regression target column.

    Returns:
        Tuple of the fitted model and the list of interaction column names.

    Raises:
        ValueError: If the target or a mapped column is missing.
    """
    if target_col not in df.columns:
        raise ValueError(f"Column {target_col} is not present in the dataframe")

    for name, lookup in col_mappings.items():
        if name not in df.columns:
            raise ValueError(f"Column {name} is not present in the dataframe")
        df[name] = df[name].map(lookup)

    mapped = list(col_mappings.keys())
    interaction_terms = []

    # Every unordered pair of mapped columns gets one product column.
    for position, left in enumerate(mapped):
        for right in mapped[position + 1:]:
            product_name = f"{left}_x_{right}"
            df[product_name] = df[left] * df[right]
            interaction_terms.append(product_name)

    model = Ridge()
    model.fit(df[interaction_terms], df[target_col])

    return model, interaction_terms

def robust_scale_and_polynomial_transform(df, feature_cols, target_col):
    """Robust-scale the features and expand them into degree-3 polynomial terms.

    Args:
        df: Source DataFrame; it is read but not modified.
        feature_cols: Numeric feature columns to scale and expand.
        target_col: Numeric target column copied into the result unchanged.

    Returns:
        A DataFrame indexed like ``df`` holding the polynomial feature columns
        (no bias term) plus ``target_col``.

    Raises:
        ValueError: If a feature or the target is missing or not numeric.
    """
    for col in feature_cols:
        if col not in df.columns or not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Column {col} is not present or not numerical")

    if target_col not in df.columns or not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column {target_col} is not present or not numerical")

    # Scale into a local array rather than overwriting the caller's columns.
    scaled_features = RobustScaler().fit_transform(df[feature_cols])

    poly = PolynomialFeatures(degree=3, include_bias=False)
    poly_features = poly.fit_transform(scaled_features)

    # Keep df's index so the target assignment below aligns row by row; a
    # default RangeIndex would yield NaN targets for any other input index.
    poly_df = pd.DataFrame(
        poly_features,
        columns=poly.get_feature_names_out(feature_cols),
        index=df.index,
    )
    poly_df[target_col] = df[target_col]

    return poly_df

def bin_target_variable(df, target_variable, bins):
    """Cut a numeric column into equal-width bins spanning its min..max.

    Args:
        df: DataFrame to annotate; the binned column is added in place.
        target_variable: Numeric column to bin.
        bins: Number of equal-width bins.

    Returns:
        Tuple of the DataFrame (with ``{target_variable}_binned`` holding
        integer bin positions) and the list of bin edges.

    Raises:
        ValueError: If the column is missing or not numeric.
    """
    if target_variable not in df.columns:
        raise ValueError(f"Column {target_variable} not found in DataFrame")

    values = df[target_variable]
    if not pd.api.types.is_numeric_dtype(values):
        raise ValueError(f"Column {target_variable} must be of numeric type")

    # bins + 1 evenly spaced edges from the smallest to the largest value.
    edges = np.linspace(values.min(), values.max(), bins + 1)

    df[target_variable + '_binned'] = pd.cut(
        values, bins=edges, labels=False, include_lowest=True
    )

    return df, edges.tolist()

def conditional_column_creation(df, conditions, new_col):
    """Add a boolean column that is True where every condition holds.

    Args:
        df: DataFrame to annotate; the new column is added in place.
        conditions: Iterable of ``(column, operation, value)`` tuples, where
            ``operation`` is one of ``==``, ``>``, ``>=``, ``<``, ``<=``, ``!=``.
        new_col: Name of the boolean column to create.

    Returns:
        The same DataFrame with ``new_col`` added.

    Raises:
        ValueError: If a condition is not a 3-tuple, names a missing column,
            or uses an unsupported operation.
    """
    comparisons = {
        '==': lambda series, value: series == value,
        '>': lambda series, value: series > value,
        '>=': lambda series, value: series >= value,
        '<': lambda series, value: series < value,
        '<=': lambda series, value: series <= value,
        '!=': lambda series, value: series != value,
    }

    for condition in conditions:
        if not (isinstance(condition, tuple) and len(condition) == 3):
            raise ValueError("Each condition must be a tuple (column, operation, value).")

    # Start from an all-True mask on df's own index; a default RangeIndex would
    # misalign with the comparisons whenever df is indexed differently.
    boolean_series = pd.Series(np.ones(len(df), dtype=bool), index=df.index)

    for col, op, val in conditions:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not in DataFrame")
        compare = comparisons.get(op) if isinstance(op, str) else None
        if compare is None:
            raise ValueError(f"Invalid operation '{op}' encountered.")
        boolean_series &= compare(df[col], val)

    df[new_col] = boolean_series

    return df
```

File: percentage_processing.py:

```Python
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

def assign_quantile_labels(df, quantile_ranges, col_to_quantile):
    """Label each row with the quantile bucket its value falls into.

    Args:
        df: DataFrame to annotate; ``quantile_label`` is added in place.
        quantile_ranges: Increasing quantile levels in [0, 1] used as bucket
            boundaries, e.g. ``[0, 0.25, 0.5, 0.75, 1]``.
        col_to_quantile: Numeric column to bucket.

    Returns:
        The same DataFrame with a categorical ``quantile_label`` column whose
        labels are ``Q1`` .. ``Qn`` for ``n = len(quantile_ranges) - 1``.

    Raises:
        ValueError: If the column is missing or not numerical.
    """
    if col_to_quantile not in df.columns:
        raise ValueError(f"Column {col_to_quantile} does not exist in DataFrame")

    if not np.issubdtype(df[col_to_quantile].dtype, np.number):
        raise ValueError(f"Column {col_to_quantile} is not numerical")

    quantiles = df[col_to_quantile].quantile(quantile_ranges).values

    # One label per bucket, so any number of quantile levels works rather than
    # exactly five edges (which still gives Q1..Q4 as before).
    labels = [f"Q{position}" for position in range(1, len(quantiles))]

    df['quantile_label'] = pd.cut(
        df[col_to_quantile], bins=quantiles, labels=labels, include_lowest=True
    )

    return df

def compute_rolling_window_avg(df, id_col, timestamp_col, target_col, window_size):
    """Compute a per-id rolling mean of a target ordered by timestamp.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is untouched.
        id_col: Column identifying each series.
        timestamp_col: Column parsed with ``pd.to_datetime`` and used for ordering.
        target_col: Column to average.
        window_size: Rolling window passed to ``rolling``.

    Returns:
        A copy sorted by id and timestamp with ``rolling_avg_{target_col}`` added.

    Raises:
        ValueError: If any of the three columns is missing.
    """
    required = (id_col, timestamp_col, target_col)
    if not all(name in df.columns for name in required):
        raise ValueError(f"Columns {id_col}, {timestamp_col}, and {target_col} must be present in DataFrame")

    ordered = df.copy()
    ordered[timestamp_col] = pd.to_datetime(ordered[timestamp_col])
    ordered = ordered.sort_values(by=[id_col, timestamp_col])

    windowed_mean = ordered.groupby(id_col)[target_col].rolling(window=window_size).mean()
    # Drop the group level so the result aligns with the frame's row index.
    ordered[f"rolling_avg_{target_col}"] = windowed_mean.reset_index(level=0, drop=True)

    return ordered

def split_and_summarize(df, threshold, target_col):
    """Split rows at a threshold and summarize the numeric columns of each side.

    Args:
        df: Source DataFrame.
        threshold: Rows with ``target_col >= threshold`` form "Group 1"; the
            rest form "Group 2".
        target_col: Boolean or numeric column used for the split.

    Returns:
        ``{'Group 1': {...}, 'Group 2': {...}}`` where each inner dict maps
        ``'mean'``, ``'median'`` and ``'std'`` to a column -> value dict.

    Raises:
        ValueError: If the column is missing or neither boolean nor numeric.
    """
    if target_col not in df.columns:
        raise ValueError(f"Column '{target_col}' is not found in the DataFrame")
    if not pd.api.types.is_numeric_dtype(df[target_col]) and not pd.api.types.is_bool_dtype(df[target_col]):
        raise ValueError(f"Column '{target_col}' must be boolean or numeric")

    def _describe(group):
        # numeric_only=True keeps text or other non-numeric columns from
        # making the reductions raise.
        return {
            'mean': group.mean(numeric_only=True).to_dict(),
            'median': group.median(numeric_only=True).to_dict(),
            'std': group.std(numeric_only=True).to_dict(),
        }

    group_1 = df[df[target_col] >= threshold]
    group_2 = df[df[target_col] < threshold]

    return {'Group 1': _describe(group_1), 'Group 2': _describe(group_2)}

def target_encode_categories(df, cat_cols, target_col):
    """Replace each categorical column with the per-category mean of the target.

    Where the mean encoding is missing for a row, the per-category standard
    deviation is used as the fallback value.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is untouched.
        cat_cols: Categorical columns to encode.
        target_col: Column whose per-category statistics form the encoding.

    Returns:
        A copy of ``df`` with the listed columns replaced by their encodings.
    """
    encoded = df.copy()

    for name in cat_cols:
        per_category = encoded.groupby(name)[target_col]
        category_means = per_category.mean()
        category_spreads = per_category.std()

        mean_values = encoded[name].map(category_means)
        fallback_values = encoded[name].map(category_spreads)
        encoded[name] = mean_values.fillna(fallback_values)

    return encoded

def forward_fill_and_interaction(df, columns, step):
    """Forward-fill columns, then add scaled, log and pairwise-product features.

    The fill and the ``_scaled`` / ``_log`` columns are written into ``df``
    itself; the product columns only appear in the returned frame.

    Args:
        df: DataFrame to transform.
        columns: List of numeric columns to process.
        step: Maximum number of consecutive NaNs to forward-fill.

    Returns:
        ``df`` concatenated with one ``{a}_x_{b}`` column per column pair.
    """
    df[columns] = df[columns].ffill(limit=step)

    for name in columns:
        series = df[name]
        df[f'{name}_scaled'] = (series - series.mean()) / series.std()
        df[f'{name}_log'] = np.log1p(series)

    products = pd.DataFrame(index=df.index)
    for position, left in enumerate(columns):
        for right in columns[position + 1:]:
            products[f'{left}_x_{right}'] = df[left] * df[right]

    return pd.concat([df, products], axis=1)

def filter_text_by_frequency(df, text_col, min_freq):
    """Drop words that occur fewer than ``min_freq`` times across the column.

    Args:
        df: DataFrame to annotate; the filtered column is added in place.
        text_col: Column of whitespace-separated text.
        min_freq: Minimum corpus-wide occurrence count for a word to be kept.

    Returns:
        The same DataFrame with ``{text_col}_filtered`` added.

    Raises:
        ValueError: If ``text_col`` is missing.
    """
    if text_col not in df.columns:
        raise ValueError(f"Column '{text_col}' not found in DataFrame")

    # Word counts over the whole column, not per row.
    vocabulary = Counter(' '.join(df[text_col]).split())

    def keep_frequent(text):
        kept = [token for token in text.split() if vocabulary[token] >= min_freq]
        return ' '.join(kept)

    df[f'{text_col}_filtered'] = df[text_col].apply(keep_frequent)

    return df

def generate_n_grams(df, text_col, n_gram_range):
    """Append n-gram count columns built from a text column.

    Args:
        df: Source DataFrame.
        text_col: String column to vectorize.
        n_gram_range: ``(min_n, max_n)`` tuple passed to ``CountVectorizer``.

    Returns:
        A new DataFrame (index reset to a RangeIndex) with one count column per
        extracted n-gram appended.

    Raises:
        ValueError: If the column is missing or not of string type.
    """
    if text_col not in df.columns:
        raise ValueError(f"Text column '{text_col}' is not in the dataframe.")
    if not pd.api.types.is_string_dtype(df[text_col]):
        raise ValueError(f"Column '{text_col}' must be of string type.")

    counter = CountVectorizer(ngram_range=n_gram_range)
    counts = counter.fit_transform(df[text_col]).toarray()
    count_frame = pd.DataFrame(counts, columns=counter.get_feature_names_out())

    # Both frames share a fresh RangeIndex, so the concat is positional.
    return pd.concat([df.reset_index(drop=True), count_frame], axis=1)

def ewma_target(df, id_col, date_col, target_col):
    """Add a per-id exponentially weighted mean (span 10) of the target.

    Args:
        df: Source DataFrame; a sorted copy is returned and ``df`` is untouched.
        id_col: Column identifying each series.
        date_col: Datetime column used for ordering within each id.
        target_col: Numeric column to smooth.

    Returns:
        ``df`` sorted by id and date with an ``EWMA_Target`` column added.

    Raises:
        ValueError: If a column is missing, the date column is not datetime,
            or the target column is not numeric.
    """
    required = (id_col, date_col, target_col)
    if any(name not in df.columns for name in required):
        raise ValueError("One or more specified columns not found in the DataFrame")

    if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        raise ValueError(f"Column '{date_col}' must be of datetime type")

    if not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column '{target_col}' must be numeric")

    def _smooth(values):
        return values.ewm(span=10, adjust=False).mean()

    ordered = df.sort_values(by=[id_col, date_col])
    ordered['EWMA_Target'] = ordered.groupby(id_col)[target_col].transform(_smooth)

    return ordered

def handle_outliers(df, reference_col):
    """Replace IQR-scaled outliers in every numeric column but the reference.

    A value is an outlier when ``|value - median| / IQR`` exceeds 3; it is
    replaced by the column median. Columns whose IQR is 0 are left alone.

    Args:
        df: Source DataFrame (not modified).
        reference_col: Numeric column that is validated but never altered.

    Returns:
        A copy of ``df`` with outliers replaced.

    Raises:
        ValueError: If ``reference_col`` is missing or not numeric.
    """
    reference_ok = reference_col in df.columns and pd.api.types.is_numeric_dtype(df[reference_col])
    if not reference_ok:
        raise ValueError(f"Column '{reference_col}' not found or not of numerical type")

    modified_df = df.copy()
    candidates = [name for name in df.select_dtypes(include='number').columns if name != reference_col]

    for name in candidates:
        lower_quartile = df[name].quantile(0.25)
        upper_quartile = df[name].quantile(0.75)
        spread = upper_quartile - lower_quartile

        if spread == 0:
            continue

        center = df[name].median()
        scaled_distance = ((df[name] - center) / spread).abs()
        modified_df.loc[scaled_distance > 3, name] = center

    return modified_df

def compute_weighted_average(df, group_col, weight_col, target_col):
    """Compute the weighted average of a target per group, highest first.

    Args:
        df: Source DataFrame (not modified).
        group_col: Column to group by.
        weight_col: Column of weights.
        target_col: Column of values to average.

    Returns:
        A DataFrame with ``group_col`` and ``weighted_average`` columns, sorted
        by ``weighted_average`` descending with a fresh RangeIndex. An empty
        input yields an empty frame with those two columns.

    Raises:
        ValueError: If any of the three columns is missing.
    """
    missing_columns = [col for col in [group_col, weight_col, target_col] if col not in df.columns]
    if missing_columns:
        raise ValueError(f"The following columns are not in the DataFrame: {missing_columns}")

    keys = df[group_col]
    weights = df[weight_col]

    # Two grouped sums replace the per-group Python lambda: same
    # sum(w * x) / sum(w) result, without the deprecated groupby.apply over
    # grouping columns and without failing on empty input.
    weighted_totals = (weights * df[target_col]).groupby(keys).sum()
    weight_totals = weights.groupby(keys).sum()

    weighted_average_df = (weighted_totals / weight_totals).rename('weighted_average').reset_index()
    weighted_average_df.columns = [group_col, 'weighted_average']

    return weighted_average_df.sort_values(by='weighted_average', ascending=False).reset_index(drop=True)

def replace_high_percentage_with_sum(df, percentage_col, threshold):
    """Overwrite above-threshold percentages with the row sum of the other numeric columns.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is untouched.
        percentage_col: Column compared against ``threshold``.
        threshold: Rows with ``percentage_col > threshold`` are replaced.

    Returns:
        A copy of ``df`` where the flagged rows of ``percentage_col`` hold the
        sum of the remaining float/int columns of that row.
    """
    result = df.copy()

    exceeds = result[percentage_col] > threshold

    numeric_names = result.select_dtypes(include=[float, int]).columns
    companions = numeric_names.drop(percentage_col)

    result.loc[exceeds, percentage_col] = result.loc[exceeds, companions].sum(axis=1)

    return result
```

File: frequency_encoding.py:

```Python
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

def group_and_compute_operation(df, id_col, value_col, operation):
    """Aggregate a numeric value column per numeric id.

    Args:
        df: Source DataFrame.
        id_col: Numeric column to group by.
        value_col: Numeric column to aggregate.
        operation: One of ``"sum"``, ``"mean"``, ``"max"``, ``"min"``.

    Returns:
        A DataFrame with ``id_col`` and the aggregated ``value_col``.

    Raises:
        ValueError: If a column is missing or non-numeric, or the operation is
            not supported.
    """
    id_ok = id_col in df.columns and pd.api.types.is_numeric_dtype(df[id_col])
    if not id_ok:
        raise ValueError(f"ID col {id_col} is not present or not of numeric type")

    value_ok = value_col in df.columns and pd.api.types.is_numeric_dtype(df[value_col])
    if not value_ok:
        raise ValueError(f"Value col {value_col} is not present or not of numeric type")

    valid_operations = ["sum", "mean", "max", "min"]
    if operation not in valid_operations:
        raise ValueError(f"Operation must be one of {valid_operations}")

    # Each valid operation name is also the name of the groupby reduction.
    per_id = df.groupby(id_col)[value_col]
    reducer = getattr(per_id, operation)

    return reducer().reset_index()

def encode_categorical_values(df, categorical_columns, method):
    """Encode categorical columns with one-hot indicators or integer labels.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is untouched.
        categorical_columns: Columns to encode.
        method: ``'onehot'`` replaces each column with ``{col}_{value}`` dummy
            columns; ``'label'`` replaces it with ``LabelEncoder`` codes.

    Returns:
        The encoded copy of ``df``.

    Raises:
        ValueError: If ``method`` is unknown or a column is missing.
    """
    if method != 'onehot' and method != 'label':
        raise ValueError("Invalid method. Choose from 'onehot' or 'label'.")

    encoded = df.copy()
    use_onehot = method == 'onehot'

    for name in categorical_columns:
        if name not in encoded.columns:
            raise ValueError(f"Column {name} is not in the dataframe")

        if not use_onehot:
            encoded[name] = LabelEncoder().fit_transform(encoded[name])
            continue

        indicators = pd.get_dummies(encoded[name], prefix=name)
        encoded = pd.concat([encoded, indicators], axis=1).drop(name, axis=1)

    return encoded

def replace_with_frequency(df, cat_cols):
    """Add a ``{col}_freq`` column with each value's occurrence count.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is untouched.
        cat_cols: Columns to frequency-encode.

    Returns:
        A copy of ``df`` with one ``_freq`` column per listed column; the
        original columns are kept.
    """
    encoded = df.copy()

    for name in cat_cols:
        occurrences = encoded[name].value_counts().to_dict()
        encoded[name + '_freq'] = encoded[name].map(occurrences)

    return encoded

def apply_weighted_sum(df, label_col, value_col, weight):
    """Scale the value columns by a weight and add their row-wise sum.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is untouched.
        label_col: Column that must exist; it is only validated here.
        value_col: Iterable of numeric column names to scale.
        weight: Multiplier applied to every listed column.

    Returns:
        A copy of ``df`` with scaled value columns and a ``weighted_sum`` column.

    Raises:
        ValueError: If the label or a value column is missing, or a value
            column is not numeric.
    """
    if label_col not in df.columns:
        raise ValueError(f"Column '{label_col}' not found in DataFrame")

    for name in value_col:
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Column '{name}' must be numeric")

    weighted = df.copy()
    for name in value_col:
        weighted[name] = weighted[name] * weight

    weighted['weighted_sum'] = weighted[value_col].sum(axis=1)

    return weighted

def compute_rolling_average(df, price_col, window_size):
    """Standardize a price column and add its rolling mean.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is untouched.
        price_col: Numeric column; replaced in the copy by its z-score.
        window_size: Rolling window passed to ``rolling``.

    Returns:
        A copy of ``df`` with the standardized price column and a
        ``Rolling_Avg_Price`` column whose warm-up gaps are filled with the
        mean of the rolling averages.

    Raises:
        ValueError: If ``price_col`` is not numeric.
    """
    if not pd.api.types.is_numeric_dtype(df[price_col]):
        raise ValueError(f"Column '{price_col}' must be numeric")

    df = df.copy()
    df[price_col] = (df[price_col] - df[price_col].mean()) / df[price_col].std()

    rolling_avg = df[price_col].rolling(window=window_size).mean()
    # Fill before assigning: a chained inplace fillna on df[...] does not write
    # back to the frame under pandas copy-on-write.
    df['Rolling_Avg_Price'] = rolling_avg.fillna(rolling_avg.mean())

    return df

def add_term_presence_column(df, text_column, new_column_name, terms_list):
    """Flag rows whose text contains at least one of the given terms as a whole token.

    Args:
        df: DataFrame to annotate; the flag column is added in place.
        text_column: Column of whitespace-separated text (object or pandas
            string dtype).
        new_column_name: Name of the boolean column to create.
        terms_list: Terms to look for; matching is exact per token.

    Returns:
        The same DataFrame with ``new_column_name`` added.

    Raises:
        ValueError: If the column is missing or is not a text column.
    """
    if text_column not in df.columns:
        raise ValueError(f"Text column '{text_column}' not found in DataFrame")
    text_values = df[text_column]
    # Accept pandas string dtypes too, not only object-dtype columns.
    if not (pd.api.types.is_object_dtype(text_values) or pd.api.types.is_string_dtype(text_values)):
        raise ValueError(f"Text column '{text_column}' must be of object type")

    terms = list(terms_list)

    def _has_any_term(text):
        # Tokenize once per row instead of once per term.
        tokens = set(text.split())
        return any(term in tokens for term in terms)

    df[new_column_name] = text_values.apply(_has_any_term)

    return df

def hierarchical_aggregation_and_scaling(df, cat_col, agg_dict):
    """Aggregate per group and standardize the aggregated columns.

    Args:
        df: Source DataFrame.
        cat_col: Column name, or list of column names, to group by.
        agg_dict: Aggregation spec passed to ``groupby(...).agg``.

    Returns:
        A DataFrame indexed by the group keys whose columns are the
        standardized aggregates; multi-level names are flattened to
        ``{column}_{stat}``.
    """
    agg_df = df.groupby(cat_col).agg(agg_dict)

    agg_df.columns = [
        '_'.join(col).strip() if isinstance(col, tuple) else col
        for col in agg_df.columns
    ]

    # Scale the aggregates while the keys are still in the index. Resetting the
    # index and skipping "the first column" only works for one group key; with
    # several keys the extra key columns would be fed to the scaler.
    scaled = StandardScaler().fit_transform(agg_df)

    return pd.DataFrame(scaled, columns=agg_df.columns, index=agg_df.index)

def string_categorization_and_target_scaling(df, string_cols, categories, target_col):
    """Turn string columns into normalized substring indicators and min-max scale the target.

    For each string column and each category, a ``{col}_{category}`` column
    marks rows whose text contains the category; the string column is then
    dropped. Each indicator is divided by its column sum, and
    ``{target_col}_scaled`` holds the min-max scaled target. ``df`` is
    modified in place.

    Args:
        df: DataFrame to transform in place.
        string_cols: Columns to convert.
        categories: Substrings to look for.
        target_col: Column to min-max scale.

    Returns:
        The same DataFrame after transformation.

    Raises:
        ValueError: If a string column is missing.
    """
    for col in string_cols:
        if col not in df.columns:
            raise ValueError(f"Column {col} is not in the dataframe.")

        source = df[col]
        for category in categories:
            def contains(value, needle=category):
                return 1 if needle in str(value) else 0

            df[f"{col}_{category}"] = source.apply(contains)

        df.drop(columns=[col], inplace=True)

    indicator_names = [f"{col}_{category}" for col in string_cols for category in categories]
    for name in indicator_names:
        df[name] = df[name] / df[name].sum()

    lowest = df[target_col].min()
    highest = df[target_col].max()
    df[f"{target_col}_scaled"] = (df[target_col] - lowest) / (highest - lowest)

    return df

def limit_categories(df, category_cols, top_n):
    """Keep the ``top_n`` most frequent values per column and relabel the rest "Other".

    Args:
        df: DataFrame to transform; the listed columns are overwritten in place.
        category_cols: Columns to limit.
        top_n: Number of most frequent values to keep per column.

    Returns:
        The same DataFrame with the listed columns relabeled.

    Raises:
        ValueError: If any listed column is missing.
    """
    absent = [name for name in category_cols if name not in df.columns]
    if absent:
        raise ValueError(f"The following columns are not in the dataframe: {absent}")

    for name in category_cols:
        keep = df[name].value_counts().nlargest(top_n).index

        def collapse(value, keep=keep):
            return value if value in keep else "Other"

        df[name] = df[name].apply(collapse)

    return df

def split_and_bin_column(df, col_to_split, max_percent, new_col_prefix):
    """Create 0/1 indicator columns for a capped number of distinct values.

    At most ``int(max_percent * len(df))`` distinct values get an indicator,
    taken in order of first appearance.

    Args:
        df: DataFrame to annotate; indicator columns are added in place.
        col_to_split: String column whose distinct values are expanded.
        max_percent: Fraction of the row count that caps the number of
            indicator columns.
        new_col_prefix: Prefix of the ``{prefix}_{value}`` columns.

    Returns:
        The same DataFrame with the indicator columns added.

    Raises:
        ValueError: If the column is missing or not a string/object column.
    """
    usable = col_to_split in df.columns and pd.api.types.is_string_dtype(df[col_to_split])
    if not usable:
        raise ValueError(f"Column '{col_to_split}' is not present or not a string/object type")

    distinct_values = df[col_to_split].unique()
    cap = int(max_percent * len(df))
    if len(distinct_values) > cap:
        distinct_values = distinct_values[:cap]

    for value in distinct_values:
        df[f"{new_col_prefix}_{value}"] = df[col_to_split].eq(value).astype('int64')

    return df

def aggregate_and_weigh_statistics(df, target_col, group_cols):
    """Attach size-weighted, standardized group statistics of the target to each row.

    Per group the mean, median and std of ``target_col`` are divided by the
    group size, standardized across groups with ``StandardScaler``, and merged
    back onto the rows.

    Args:
        df: Source DataFrame (not modified; a merged frame is returned).
        target_col: Column to summarize.
        group_cols: List of grouping columns.

    Returns:
        ``df`` left-merged with ``weighted_mean``, ``weighted_median`` and
        ``weighted_std`` columns.
    """
    per_group = df.groupby(group_cols)

    group_stats = per_group[target_col].agg(['mean', 'median', 'std']).reset_index()
    group_sizes = per_group.size().reset_index(name='size')
    group_stats = group_stats.merge(group_sizes, on=group_cols)

    stats_to_normalize = []
    for stat in ('mean', 'median', 'std'):
        weighted_name = f'weighted_{stat}'
        group_stats[weighted_name] = group_stats[stat] / group_stats['size']
        stats_to_normalize.append(weighted_name)

    group_stats[stats_to_normalize] = StandardScaler().fit_transform(group_stats[stats_to_normalize])

    return df.merge(group_stats[group_cols + stats_to_normalize], on=group_cols, how='left')
```

File: feature_engineering.py:

```Python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import zscore
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def aggregate_and_rank(df, groupby_columns, agg_columns):
    """Merge per-group mean/median/sum back onto the rows and rank the first aggregate column.

    Args:
        df: Source DataFrame (not modified; a merged frame is returned).
        groupby_columns: List of grouping columns.
        agg_columns: List of columns to aggregate; the first one is also
            ranked within each group (descending, ties by order of appearance).

    Returns:
        ``df`` with ``{col}_mean``, ``{col}_median``, ``{col}_sum`` columns and
        a ``{agg_columns[0]}_rank`` column.

    Raises:
        ValueError: If any grouping or aggregation column is missing.
    """
    absent = [name for name in groupby_columns + agg_columns if name not in df.columns]
    if absent:
        raise ValueError(f"Column {absent[0]} is not present in the dataframe")

    summary = df.groupby(groupby_columns).agg(
        {name: ['mean', 'median', 'sum'] for name in agg_columns}
    )
    summary.columns = ['_'.join(pair).strip() for pair in summary.columns.values]

    enriched = df.merge(summary.reset_index(), on=groupby_columns, how='left')

    ranked = agg_columns[0]
    enriched[ranked + '_rank'] = enriched.groupby(groupby_columns)[ranked].rank(
        method='first', ascending=False
    )

    return enriched

def filter_encode_categories(df, cat_columns, min_support):
    """Drop rare categories and one-hot encode the remaining ones.

    Args:
        df: Source DataFrame; it is not modified.
        cat_columns: Text/object columns to encode.
        min_support: Minimum relative frequency (0..1) a category needs to get
            its own indicator column; rarer values get no indicator.

    Returns:
        A new DataFrame where each listed column is replaced by its
        ``{col}_{value}`` indicator columns.

    Raises:
        ValueError: If a column is missing or is not a text/object column.
    """
    for col in cat_columns:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' is not found in the DataFrame")
        # Accept pandas string dtypes too, not only object-dtype columns.
        if not (pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])):
            raise ValueError(f"Column '{col}' must be of object type")

    for col in cat_columns:
        shares = df[col].value_counts(normalize=True)
        supported = shares[shares >= min_support].index

        # Mask rare values in a local Series rather than writing them back into
        # the caller's frame; get_dummies ignores the resulting missing values.
        kept = df[col].where(df[col].isin(supported))

        one_hot_encoded = pd.get_dummies(kept, prefix=col)
        df = pd.concat([df.drop(col, axis=1), one_hot_encoded], axis=1)

    return df

def apply_rolling_statistics(df, window_size, min_periods):
    """Add rolling means for numeric columns and rolling non-null counts for datetime columns.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is untouched.
        window_size: Rolling window size.
        min_periods: Minimum observations required in a window.

    Returns:
        A copy of ``df`` with ``{col}_rolling_mean_{window_size}`` columns for
        numeric columns and ``{col}_rolling_count_{window_size}`` columns for
        datetime columns.

    Raises:
        ValueError: If the frame is empty or has neither numeric nor datetime
            columns.
    """
    if df.empty:
        raise ValueError("Input dataframe is empty")
    numeric_columns = df.select_dtypes(include=['number']).columns.tolist()
    datetime_columns = df.select_dtypes(include=['datetime']).columns.tolist()
    if not numeric_columns and not datetime_columns:
        raise ValueError("Dataframe must have at least one numeric or datetime column")

    df = df.copy()

    for col in numeric_columns:
        df[f'{col}_rolling_mean_{window_size}'] = (
            df[col].rolling(window=window_size, min_periods=min_periods).mean()
        )

    for col in datetime_columns:
        # Rolling windows are not implemented for datetime64 values, so count
        # over a presence indicator (1.0 where a timestamp exists, NaN otherwise).
        present = df[col].notna()
        indicator = present.astype(float).where(present)
        df[f'{col}_rolling_count_{window_size}'] = (
            indicator.rolling(window=window_size, min_periods=min_periods).count()
        )

    return df

def create_lagged_features(df, date_col, features, periods):
    """Add lagged copies of numeric features after sorting by date.

    Args:
        df: Source DataFrame (not modified).
        date_col: Datetime column used to order the rows.
        features: Numeric columns to lag.
        periods: Iterable of shift distances; one ``{feature}_lag_{period}``
            column is created per combination.

    Returns:
        A date-sorted copy with the lag columns added and every row that
        contains any NaN removed.

    Raises:
        ValueError: If the date column is missing or not datetime, or a feature
            is missing or not numeric.
    """
    if date_col not in df.columns:
        raise ValueError(f"Column '{date_col}' not found in DataFrame")
    if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        raise ValueError(f"Column '{date_col}' must be of datetime type")

    for name in features:
        if name not in df.columns:
            raise ValueError(f"Feature column '{name}' not found in DataFrame")
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Feature column '{name}' must be numeric")

    ordered = df.sort_values(by=date_col)

    for period in periods:
        for name in features:
            ordered[f"{name}_lag_{period}"] = ordered[name].shift(period)

    return ordered.dropna()

def bin_numerical_data(df, numerical_cols, criteria_col):
    """Quartile-bin numeric columns for the rows above the criteria median.

    Args:
        df: DataFrame to annotate; ``{col}_binned`` columns are added in place.
        numerical_cols: Columns to bin into 4 quantile buckets.
        criteria_col: Column whose median selects the rows to bin (strictly
            greater than the median).

    Returns:
        The same DataFrame; rows at or below the median have no bin value.
    """
    above_median = df[criteria_col] > df[criteria_col].median()
    selected = df[above_median]

    for name in numerical_cols:
        df.loc[selected.index, f"{name}_binned"] = pd.qcut(selected[name], 4, labels=False)

    return df

def stratified_partition(df, partition_cols, train_ratio):
    """Split a frame into train and test parts stratified on the given columns.

    Args:
        df: Source DataFrame; a copy is split, so ``df`` is untouched.
        partition_cols: List of columns whose joint values drive stratification.
        train_ratio: Fraction of rows for the training part, strictly between
            0 and 1.

    Returns:
        Tuple ``(train_df, test_df)``.

    Raises:
        ValueError: If a partition column is missing or ``train_ratio`` is out
            of range.
    """
    frame = df.copy()

    for name in partition_cols:
        if name not in frame.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")

    ratio_is_valid = 0 < train_ratio < 1
    if not ratio_is_valid:
        raise ValueError("train_ratio must be a float between 0 and 1")

    held_out_share = 1 - train_ratio
    train_df, test_df = train_test_split(
        frame, test_size=held_out_share, stratify=frame[partition_cols]
    )

    return train_df, test_df

def vectorize_text_columns(df, text_columns, tfidf_max_features, ngrams):
    """Append TF-IDF feature columns for each text column.

    Args:
        df: Source DataFrame (not modified; a concatenated frame is returned).
        text_columns: String columns to vectorize, each with its own vectorizer.
        tfidf_max_features: ``max_features`` passed to ``TfidfVectorizer``.
        ngrams: ``ngram_range`` tuple passed to ``TfidfVectorizer``.

    Returns:
        ``df`` with ``{col}_tfidf_{i}`` columns appended for every text column.

    Raises:
        ValueError: If a column is missing or not of string type.
    """
    for col in text_columns:
        if col not in df.columns or not pd.api.types.is_string_dtype(df[col]):
            raise ValueError(f"Column {col} is not present or not of string type")

    for col in text_columns:
        vectorizer = TfidfVectorizer(max_features=tfidf_max_features, ngram_range=ngrams)
        tfidf_matrix = vectorizer.fit_transform(df[col].values.astype('U'))

        # Reuse df's index so the column-wise concat stays row-aligned even when
        # df does not have a default RangeIndex.
        tfidf_df = pd.DataFrame(
            tfidf_matrix.toarray(),
            columns=[f'{col}_tfidf_{i}' for i in range(tfidf_matrix.shape[1])],
            index=df.index,
        )
        df = pd.concat([df, tfidf_df], axis=1)

    return df

def add_z_score(df, target_col, new_col_name):
    """Add a column holding the scipy z-score of a numeric column.

    Args:
        df: DataFrame to annotate; the new column is added in place.
        target_col: Numeric column to standardize.
        new_col_name: Name of the z-score column.

    Returns:
        The same DataFrame with ``new_col_name`` added.

    Raises:
        ValueError: If the column is missing or not numeric.
    """
    column_present = target_col in df.columns
    if not column_present:
        raise ValueError(f"Column '{target_col}' not found in DataFrame")

    source = df[target_col]
    if not pd.api.types.is_numeric_dtype(source):
        raise ValueError(f"Column '{target_col}' must be numeric")

    standardized = zscore(source)
    df[new_col_name] = standardized

    return df

def add_outlier_indicators(df, outlier_std_threshold):
    """Add a 0/1 ``{col}_outliers`` flag for every numeric column.

    A value is flagged when its absolute z-score exceeds the threshold.

    Args:
        df: DataFrame to annotate; flag columns are added in place.
        outlier_std_threshold: Z-score magnitude above which a value is flagged.

    Returns:
        The same DataFrame with the flag columns added.
    """
    # Snapshot the numeric columns first so the flags added below are not
    # themselves scanned.
    numeric_names = list(df.select_dtypes(include='number').columns)

    for name in numeric_names:
        column = df[name]
        deviation = (column - column.mean()) / column.std()
        df[f'{name}_outliers'] = np.where(deviation.abs() > outlier_std_threshold, 1, 0)

    return df

def impute_and_reduce(df, numerical_cols, target_col):
    """KNN-impute and standardize numeric columns, then append two principal components.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is untouched.
        numerical_cols: Numeric columns to impute, scale and project.
        target_col: Accepted for interface compatibility; not used here.

    Returns:
        Tuple of the transformed frame (index reset, with ``PC1`` and ``PC2``
        appended) and the PCA explained-variance ratios.
    """
    result = df.copy()

    result[numerical_cols] = KNNImputer(n_neighbors=5).fit_transform(result[numerical_cols])
    result[numerical_cols] = StandardScaler().fit_transform(result[numerical_cols])

    reducer = PCA(n_components=2)
    components = pd.DataFrame(
        reducer.fit_transform(result[numerical_cols]), columns=['PC1', 'PC2']
    )

    # Both frames get a fresh RangeIndex so the concat is positional.
    result = pd.concat(
        [result.reset_index(drop=True), components.reset_index(drop=True)], axis=1
    )

    return result, reducer.explained_variance_ratio_

def augment_stock_features(stock_data, feature_list, window):
    """Add min-max normalized rolling mean and rolling std columns per feature.

    Args:
        stock_data: DataFrame to annotate; the new columns are added in place.
        feature_list: Columns to compute rolling statistics for.
        window: Rolling window size.

    Returns:
        The same DataFrame with ``{feature}_rolling_mean`` and
        ``{feature}_rolling_std`` columns, each scaled to its own min..max range.
    """
    def _min_max(series):
        return (series - series.min()) / (series.max() - series.min())

    for feature in feature_list:
        windowed = stock_data[feature].rolling(window=window)
        average = windowed.mean()
        spread = windowed.std()

        stock_data[f"{feature}_rolling_mean"] = _min_max(average)
        stock_data[f"{feature}_rolling_std"] = _min_max(spread)

    return stock_data
```

File: time_series_transformations.py:

```Python
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

def impute_and_clean(df, feature_cols, method):
    """Fill missing feature values with a column statistic, then drop leftover NaN rows.

    Args:
        df: DataFrame to clean; it is modified in place.
        feature_cols: Columns to impute.
        method: ``'mean'``, ``'median'`` or ``'mode'`` (first mode per column).

    Returns:
        The same DataFrame with features imputed and rows still containing NaN
        in ``feature_cols`` removed.

    Raises:
        ValueError: If a feature column is missing or the method is unknown.
    """
    for name in feature_cols:
        if name not in df.columns:
            raise ValueError(f"Feature column {name} does not exist in the DataFrame")

    fill_value_for = {
        'mean': lambda frame: frame.mean(),
        'median': lambda frame: frame.median(),
        'mode': lambda frame: frame.mode().iloc[0],
    }
    if method not in fill_value_for:
        raise ValueError("Method must be one of 'mean', 'median', or 'mode'")

    features = df[feature_cols]
    df[feature_cols] = features.fillna(fill_value_for[method](features))

    # Columns with no usable statistic (e.g. entirely NaN) still hold NaN here.
    df.dropna(subset=feature_cols, inplace=True)

    return df

def chi_square_analysis(df, target_col, categorical_cols):
    """Run a chi-square test of each categorical column against the target.

    Args:
        df: Source DataFrame.
        target_col: Column cross-tabulated against each categorical column.
        categorical_cols: Columns to test.

    Returns:
        A DataFrame indexed by column name with a ``p_value_transformed``
        column holding ``-log(p)`` of each test.

    Raises:
        ValueError: If the target or a categorical column is missing.
    """
    if target_col not in df.columns:
        raise ValueError(f"Target column {target_col} is not present in the DataFrame")
    for name in categorical_cols:
        if name not in df.columns:
            raise ValueError(f"Categorical column {name} is not present in the DataFrame")

    chi2_df = pd.DataFrame(index=categorical_cols, columns=['p_value_transformed'])
    target = df[target_col]

    for name in categorical_cols:
        observed = pd.crosstab(df[name], target)
        _, p_value, _, _ = chi2_contingency(observed)
        chi2_df.at[name, 'p_value_transformed'] = -np.log(p_value)

    return chi2_df

def detect_anomalies(df, target_col, threshold, window=5):
    """Flag values that deviate from their rolling mean by more than ``threshold`` rolling stds.

    Args:
        df: DataFrame to annotate; an ``anomaly`` column is added in place.
        target_col: Numeric column to inspect.
        threshold: Multiplier applied to the rolling standard deviation.
        window: Rolling window size; defaults to 5.

    Returns:
        The same DataFrame with a boolean ``anomaly`` column. Rows without a
        full window are never flagged.

    Raises:
        ValueError: If ``target_col`` is missing or not numeric.
    """
    if target_col not in df.columns or not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column {target_col} is not present or not numerical")

    # Keep the rolling statistics in locals: parking them in temporary
    # 'rolling_mean' / 'rolling_std' columns would overwrite and then drop any
    # caller columns that happen to use those names.
    windowed = df[target_col].rolling(window=window)
    rolling_mean = windowed.mean()
    rolling_std = windowed.std()

    df['anomaly'] = (df[target_col] - rolling_mean).abs() > threshold * rolling_std

    return df

def add_holiday_indicator(df, datetime_col, holidays):
    """Add an ``is_holiday`` flag for timestamps present in the holiday list.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is untouched.
        datetime_col: Column parsed with ``pd.to_datetime``.
        holidays: Date-likes parsed with ``pd.to_datetime``; matching is on
            exact timestamp equality.

    Returns:
        A copy of ``df`` with the parsed datetime column and a boolean
        ``is_holiday`` column.

    Raises:
        ValueError: If ``datetime_col`` is missing.
    """
    if datetime_col not in df.columns:
        raise ValueError(f"Column '{datetime_col}' not found in DataFrame")

    flagged = df.copy()
    timestamps = pd.to_datetime(flagged[datetime_col])
    holiday_stamps = pd.to_datetime(holidays)

    flagged[datetime_col] = timestamps
    flagged['is_holiday'] = timestamps.isin(holiday_stamps)

    return flagged

def transform_time_series(df, categorical_cols, continuous_cols, time_col):
    """Validate columns, derive calendar features and z-score the numerics.

    Args:
        df: Source DataFrame; it is not modified.
        categorical_cols: Columns that must exist and hold categorical,
            object or string data.
        continuous_cols: Columns that must exist and be numeric; they are
            replaced by their z-scores (sample standard deviation).
        time_col: Column parsed with ``pd.to_datetime``.

    Returns:
        A copy of ``df`` with ``time_col`` as datetime, added ``day``,
        ``month``, ``year`` and ``day_of_week`` columns, and normalised
        continuous columns.

    Raises:
        ValueError: If a column is missing or has the wrong kind of dtype.
    """
    # Step 1: Validate column existence and types
    for col in categorical_cols:
        if col not in df.columns:
            raise ValueError(f"Categorical column '{col}' not found in DataFrame")
        dtype = df[col].dtype
        # The previous check used isinstance(dtype, (CategoricalDtype, object)),
        # which is true for every Python object and so never rejected anything.
        is_categorical_like = (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if not is_categorical_like:
            raise ValueError(f"Column '{col}' must be categorical")

    for col in continuous_cols:
        if col not in df.columns:
            raise ValueError(f"Continuous column '{col}' not found in DataFrame")
        if not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Column '{col}' must be numeric")

    if time_col not in df.columns:
        raise ValueError(f"Time column '{time_col}' not found in DataFrame")

    df = df.copy()

    # Step 2: Parse date column and create new time-related columns
    df[time_col] = pd.to_datetime(df[time_col])
    timestamps = df[time_col].dt
    df['day'] = timestamps.day
    df['month'] = timestamps.month
    df['year'] = timestamps.year
    df['day_of_week'] = timestamps.dayofweek

    # Step 3: Z-score normalization (a constant column has std 0 and becomes NaN)
    df[continuous_cols] = df[continuous_cols].apply(lambda x: (x - x.mean()) / x.std())

    return df

def grouped_cumulative_features(df, group_by_column, target_column, k=3):
    """Add per-group cumulative statistics and a top-k marker.

    Rows are sorted by ``group_by_column`` then ``target_column`` and
    re-indexed from zero. Within each group the running sum, expanding mean
    and expanding standard deviation of ``target_column`` are added, and the
    ``k`` rows with the largest running sum per group are marked with 1 in
    ``<target_column>_top_k`` (0 elsewhere).

    Args:
        df: Source DataFrame; a sorted copy is returned.
        group_by_column: Column defining the groups.
        target_column: Numeric column to accumulate.
        k: Number of rows per group to mark as top-k.

    Returns:
        The sorted DataFrame with the four added columns.
    """
    cumsum_col = f"{target_column}_cumsum"
    mean_col = f"{target_column}_expanding_mean"
    std_col = f"{target_column}_expanding_std"
    top_k_col = f"{target_column}_top_k"

    ordered = df.sort_values(by=[group_by_column, target_column]).reset_index(drop=True)
    per_group = ordered.groupby(group_by_column)
    target_by_group = per_group[target_column]

    ordered[cumsum_col] = target_by_group.cumsum()

    # expanding() yields a (group, row) MultiIndex; drop the group level so the
    # result aligns with the frame's row index.
    expanding_mean = target_by_group.expanding().mean()
    ordered[mean_col] = expanding_mean.reset_index(level=0, drop=True)
    expanding_std = target_by_group.expanding().std()
    ordered[std_col] = expanding_std.reset_index(level=0, drop=True)

    # Level 1 of the nlargest index carries the row labels of the winners.
    largest_cumsums = per_group[cumsum_col].nlargest(k)
    winning_rows = largest_cumsums.index.get_level_values(1)
    ordered[top_k_col] = 0
    ordered.loc[winning_rows, top_k_col] = 1

    return ordered

def haversine(lon1, lat1, lon2, lat2):
    """
    Return the great-circle distance in kilometres between two points given
    as (longitude, latitude) pairs in decimal degrees, using the Haversine
    formula with an Earth radius of 6371 km.
    """
    earth_radius_km = 6371.0

    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2
    cos_lat1 = np.cos(np.radians(lat1))
    cos_lat2 = np.cos(np.radians(lat2))

    hav = np.sin(half_dlat)**2 + cos_lat1 * cos_lat2 * np.sin(half_dlon)**2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

    return earth_radius_km * central_angle

def extract_date_features(df, date_col):
    """Parse a date column and add year/month/day and day-offset features.

    The column is converted in place with ``pd.to_datetime`` and four columns
    are added: ``<date_col>_year``, ``<date_col>_month``, ``<date_col>_day``
    and ``<date_col>_days_from_ref`` (whole days since 2000-01-01).

    Args:
        df: DataFrame to modify in place.
        date_col: Name of the column holding date-like values.

    Returns:
        The same DataFrame object with the added columns.

    Raises:
        ValueError: If the column is missing or cannot be parsed as datetime.
            For parse failures the original exception is attached as the
            cause.
    """
    # Report a missing column explicitly instead of as a conversion failure.
    if date_col not in df.columns:
        raise ValueError(f"Column '{date_col}' not found in DataFrame")

    # Step 1: Validate if date_col contains date-like objects
    try:
        parsed = pd.to_datetime(df[date_col])
    except Exception as e:
        # Chain the original error so the traceback keeps the parsing context.
        raise ValueError(f"The column '{date_col}' cannot be converted to datetime.\nError: {e}") from e
    df[date_col] = parsed

    # Step 2: Extract year, month, and day as separate columns
    df[f'{date_col}_year'] = parsed.dt.year
    df[f'{date_col}_month'] = parsed.dt.month
    df[f'{date_col}_day'] = parsed.dt.day

    # Step 3: Create a column for the difference in days from a fixed reference date
    reference_date = pd.Timestamp('2000-01-01')
    df[f'{date_col}_days_from_ref'] = (parsed - reference_date).dt.days

    return df

def correlate_numeric_with_category(df, category_col, numerical_col):
    """Attach the category indicator most correlated with a numeric column.

    ``category_col`` is one-hot encoded, each indicator is correlated with
    ``numerical_col``, and the indicator with the largest absolute
    correlation is stored as ``correlated_with_<numerical_col>``.

    Args:
        df: DataFrame to modify in place.
        category_col: Name of the categorical column to encode.
        numerical_col: Name of the numeric column to correlate against.

    Returns:
        The same DataFrame object with the added indicator column.
    """
    indicators = pd.get_dummies(df[category_col], prefix=category_col)

    abs_correlations = indicators.corrwith(df[numerical_col]).abs()
    strongest_indicator = abs_correlations.idxmax()

    output_col = f'correlated_with_{numerical_col}'
    df[output_col] = indicators[strongest_indicator]

    return df

def handle_outliers(df, col_name, outlier_method, threshold=3.0):
    """Replace outliers in a numeric column with the median of the rest.

    Outliers are detected either by z-score (distance from the mean greater
    than ``threshold`` sample standard deviations) or by IQR fences
    (``threshold`` times the inter-quartile range beyond Q1/Q3). Detected
    outliers, together with any values that were already missing, are
    replaced by the median of the remaining values.

    Args:
        df: DataFrame to modify in place.
        col_name: Name of the numeric column to clean.
        outlier_method: Either ``'z_score'`` or ``'iqr'``.
        threshold: Multiplier for the standard deviation or the IQR.

    Returns:
        The same DataFrame object with ``col_name`` cleaned.

    Raises:
        ValueError: If the column is missing, not numeric, or the method is
            unknown.
    """
    if col_name not in df.columns:
        raise ValueError(f"Column '{col_name}' not found in the DataFrame")

    if not pd.api.types.is_numeric_dtype(df[col_name]):
        raise ValueError(f"The column '{col_name}' must be numeric")

    col_data = df[col_name]

    if outlier_method == 'z_score':
        mean = col_data.mean()
        std_dev = col_data.std()
        outliers = (col_data - mean).abs() > (threshold * std_dev)
    elif outlier_method == 'iqr':
        q1 = col_data.quantile(0.25)
        q3 = col_data.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (threshold * iqr)
        upper_bound = q3 + (threshold * iqr)
        outliers = (col_data < lower_bound) | (col_data > upper_bound)
    else:
        raise ValueError("outlier_method must be either 'z_score' or 'iqr'")

    # Build the cleaned Series and assign it back. The former chained
    # `df[col].fillna(..., inplace=True)` acts on a temporary under pandas
    # Copy-on-Write and leaves df untouched, and writing NaN into an integer
    # column through .loc is deprecated; mask() upcasts safely instead.
    blanked = col_data.mask(outliers)
    df[col_name] = blanked.fillna(blanked.median())

    return df

def create_lag_feature(df, id_col, value_col, lag):
    """Add a per-id lagged copy of a column and its first difference.

    Args:
        df: Source DataFrame; it is not modified.
        id_col: Column identifying the groups within which values are shifted.
        value_col: Column to lag.
        lag: Number of rows to shift within each group.

    Returns:
        A copy of ``df`` with two added columns: ``lagged`` (the shifted
        values with gaps filled by cubic interpolation) and
        ``lagged_derivative`` (row-to-row difference of ``lagged``).
    """
    df = df.copy()

    # Step 1: Create a lag feature for each unique identifier
    lagged = df.groupby(id_col)[value_col].shift(lag)

    # Step 2: Interpolate missing values using cubic interpolation.
    # The result is reassigned rather than using a chained
    # `df['lagged'].interpolate(inplace=True)`, which operates on a temporary
    # under pandas Copy-on-Write and would leave the gaps unfilled.
    # Note: interpolation runs over the whole column, not per identifier.
    lagged = lagged.interpolate(method='cubic')
    df['lagged'] = lagged

    # Step 3: Compute the first derivative of the lagged values
    df['lagged_derivative'] = lagged.diff()

    return df
```

File: datetime_extraction.py:

```Python
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def add_hierarchical_cumsum_rank(df, hierarchy_cols, value_col):
    """Add a running sum and a rank of ``value_col`` within each hierarchy group.

    Args:
        df: DataFrame to modify in place.
        hierarchy_cols: Columns whose value combinations define the groups.
        value_col: Numeric column to accumulate and rank.

    Returns:
        The same DataFrame object with ``<value_col>_cumsum`` and
        ``<value_col>_rank`` added.

    Raises:
        ValueError: If any column is missing or ``value_col`` is not numeric.
    """
    # Hierarchy columns are checked first, then the value column; all share
    # the same error message format.
    for required in [*hierarchy_cols, value_col]:
        if required not in df.columns:
            raise ValueError(f"Column '{required}' not found in DataFrame")

    value_is_numeric = pd.api.types.is_numeric_dtype(df[value_col])
    if not value_is_numeric:
        raise ValueError(f"Column '{value_col}' must be numeric")

    values_by_group = df.groupby(hierarchy_cols)[value_col]

    df[f'{value_col}_cumsum'] = values_by_group.cumsum()
    df[f'{value_col}_rank'] = values_by_group.rank()

    return df

def interpolate_low_values(df, target_col, low_v_thresh):
    """Replace values below a threshold with linearly interpolated ones.

    Values strictly below ``low_v_thresh`` are treated as gaps and rebuilt
    from their neighbours by linear interpolation. Low values at either end
    of the column take the nearest remaining value. Values that were already
    missing stay missing. If every value is low, nothing is left to
    interpolate from and the column becomes NaN.

    Args:
        df: DataFrame to modify in place.
        target_col: Name of the numeric column to repair.
        low_v_thresh: Values strictly below this are replaced.

    Returns:
        The same DataFrame object with ``target_col`` repaired.

    Raises:
        ValueError: If ``target_col`` is missing or not numeric.
    """
    # Step 1: Ensure the column is present and numerical
    if target_col not in df.columns or not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column {target_col} is not present or not numerical")

    # Step 2: Identify low value records
    values = df[target_col]
    low_values_mask = values < low_v_thresh
    if not low_values_mask.any():
        return df

    # Step 3: Apply interpolation.
    # The low values must be blanked first: interpolate() only fills missing
    # entries, so interpolating the untouched column changed nothing.
    interpolated = values.mask(low_values_mask).interpolate(limit_direction='both')
    # Only the low positions are replaced; everything else is kept as it was.
    df[target_col] = values.where(~low_values_mask, interpolated)

    return df

def extract_datetime_components(df, datetime_col):
    """Split a datetime column into year, month, day, hour and minute columns.

    Args:
        df: Source DataFrame; it is not modified.
        datetime_col: Name of a column that already has a datetime dtype.

    Returns:
        A copy of ``df`` with ``year``, ``month``, ``day``, ``hour`` and
        ``minute`` columns added.

    Raises:
        ValueError: If the column is missing or is not a datetime column.
        RuntimeError: If any generated column contains NaN values.
    """
    is_datetime_column = (
        datetime_col in df.columns
        and pd.api.types.is_datetime64_any_dtype(df[datetime_col])
    )
    if not is_datetime_column:
        raise ValueError(f"Column '{datetime_col}' not found or is not a datetime column in DataFrame")

    components = ['year', 'month', 'day', 'hour', 'minute']

    result = df.copy()
    for component in components:
        result[component] = getattr(result[datetime_col].dt, component)

    # All components are generated before any of them is checked.
    for component in components:
        has_missing = result[component].isna().any()
        if has_missing:
            raise RuntimeError(f"Generated column '{component}' contains NaN values which is unexpected")

    return result

def resample_and_interpolate(df, time_col, new_freq):
    """Resample a time-indexed frame to a new frequency with linear fill.

    Args:
        df: Source DataFrame; it is not modified.
        time_col: Name of the column holding timestamps (parsed with
            ``pd.to_datetime``).
        new_freq: Target frequency string passed to ``resample``.

    Returns:
        A new DataFrame at the requested frequency with ``time_col`` restored
        as a regular column and a boolean ``was_missing`` column that is True
        for timestamps absent from the original data.
    """
    # Work on a copy: the previous in-place set_index() removed time_col from
    # the caller's frame, so a second call on the same frame failed.
    indexed = df.copy()
    indexed[time_col] = pd.to_datetime(indexed[time_col])
    indexed = indexed.set_index(time_col)

    resampled_df = indexed.resample(new_freq).interpolate(method='linear')
    original_index = indexed.index

    resampled_df['was_missing'] = ~resampled_df.index.isin(original_index)

    return resampled_df.reset_index()

def numerical_clustering(df, numerical_cols, categories):
    """Cluster each numeric column on its own with K-means.

    Every column in ``numerical_cols`` is validated first; then each one is
    clustered independently into ``categories`` clusters (fixed random seed
    42) and the labels are stored in ``<col>_cluster``.

    Args:
        df: Source DataFrame; it is not modified.
        numerical_cols: Names of the numeric columns to cluster.
        categories: Number of clusters per column.

    Returns:
        A copy of ``df`` with one ``<col>_cluster`` column per input column.

    Raises:
        ValueError: If a column is missing or not numeric.
    """
    for name in numerical_cols:
        usable = name in df.columns and pd.api.types.is_numeric_dtype(df[name])
        if not usable:
            raise ValueError(f"Column '{name}' must exist and be of numeric type")

    clustered = df.copy()
    for name in numerical_cols:
        model = KMeans(n_clusters=categories, random_state=42)
        labels = model.fit_predict(clustered[[name]])
        clustered[f'{name}_cluster'] = labels

    return clustered

def encode_and_merge_categorical(df, categorical_threshold):
    """One-hot encode object columns selected by their unique-value ratio.

    An object-dtype column is encoded when its number of distinct values
    divided by the number of rows is greater than ``categorical_threshold``.
    The selected columns are dropped and their one-hot columns appended.

    Args:
        df: Source DataFrame; it is not modified.
        categorical_threshold: Unique-value ratio a column must exceed to be
            encoded.

    Returns:
        A new DataFrame with the selected columns replaced by one-hot columns.
        If the frame is empty or no column qualifies, an unchanged copy is
        returned.
    """
    row_count = len(df)
    # An empty frame would divide by zero when computing the unique ratio.
    if row_count == 0:
        return df.copy()

    # NOTE(review): the comparison selects columns whose ratio is ABOVE the
    # threshold (i.e. high-cardinality columns). Kept as-is for compatibility;
    # confirm with callers whether '<' was intended.
    categorical_cols = [
        col
        for col in df.select_dtypes(include='object').columns
        if df[col].nunique() / row_count > categorical_threshold
    ]
    # Nothing to encode: skip get_dummies on a zero-column selection.
    if not categorical_cols:
        return df.copy()

    one_hot_encoded_cols = pd.get_dummies(df[categorical_cols])
    remaining = df.drop(columns=categorical_cols)

    return pd.concat([remaining, one_hot_encoded_cols], axis=1)

def bin_and_compute_statistics(df, category_col, numeric_cols, bin_col, bins):
    """Bin a numeric column and summarise other numeric columns per bin.

    ``bin_col`` is cut into ``bins`` and, for every bin, the mean, standard
    deviation and percentage share of the column total are computed for each
    column in ``numeric_cols``. Bins without rows are kept in the result.

    Args:
        df: DataFrame to analyse; a ``bin`` column is added to it in place.
        category_col: Column that must exist with a categorical dtype (it is
            validated but not otherwise used here).
        numeric_cols: Numeric columns to summarise.
        bin_col: Numeric column to cut into bins.
        bins: Bin specification passed to ``pd.cut``.

    Returns:
        DataFrame with one row per bin and the columns ``bin``,
        ``<col>_mean``, ``<col>_std`` and ``<col>_percent``.

    Raises:
        ValueError: If a column is missing or has the wrong dtype.
    """
    # Step 1: Validate columns
    # isinstance on the dtype replaces pd.api.types.is_categorical_dtype,
    # which is deprecated in recent pandas releases.
    if category_col not in df.columns or not isinstance(df[category_col].dtype, pd.CategoricalDtype):
        raise ValueError(f"Category column {category_col} is not present or not categorical")

    for col in numeric_cols:
        if col not in df.columns or not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Numeric column {col} is not present or not numeric")

    if bin_col not in df.columns or not pd.api.types.is_numeric_dtype(df[bin_col]):
        raise ValueError(f"Bin column {bin_col} is not present or not numeric")

    # Step 2: Perform binning and calculate statistics
    df['bin'] = pd.cut(df[bin_col], bins=bins)
    # observed=False is spelled out so empty bins stay in the output regardless
    # of the pandas version's default for categorical group keys.
    per_bin = df.groupby('bin', observed=False)[numeric_cols]
    bin_stats = per_bin.agg(['mean', 'std'])

    # Flatten columns
    bin_stats.columns = ['_'.join(col).strip() for col in bin_stats.columns.values]

    # Step 3: Calculate percentage contribution of each bin's numeric values
    total_values = df[numeric_cols].sum()
    bin_totals = per_bin.sum()
    bin_percentages = bin_totals.div(total_values).multiply(100).add_suffix('_percent')

    result_df = pd.concat([bin_stats, bin_percentages], axis=1)

    return result_df.reset_index()

def discretize_and_calculate_stats(df, target_col, bins):
    """Cut a numeric column into quantile bins and summarise numeric columns.

    Args:
        df: Source DataFrame; it is not modified.
        target_col: Numeric column to discretise with ``pd.qcut``.
        bins: Number of quantiles (or quantile edges) passed to ``pd.qcut``.

    Returns:
        DataFrame with one row per quantile bin: a ``bin`` column followed by
        ``<col>_mean_binned`` and ``<col>_std_binned`` for every numeric
        column of ``df``.

    Raises:
        ValueError: If ``target_col`` is missing or not numeric.
    """
    if target_col not in df.columns:
        raise ValueError(f"Column '{target_col}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column '{target_col}' must be numeric")

    quantile_bin = pd.qcut(df[target_col], bins).rename('quantile_bin')

    # Aggregate numeric columns only: mean/std are undefined for text columns,
    # so aggregating the whole frame fails as soon as one is present.
    numeric_part = df.select_dtypes(include='number')
    # A pre-existing 'quantile_bin' column would clash with the group key.
    numeric_part = numeric_part.drop(columns='quantile_bin', errors='ignore')

    # observed=False keeps every quantile bin in the output on all pandas versions.
    statistics_df = numeric_part.groupby(quantile_bin, observed=False).agg(['mean', 'std'])
    statistics_df.columns = ['_'.join(col) + '_binned' for col in statistics_df.columns]

    return statistics_df.reset_index().rename(columns={'quantile_bin': 'bin'})

def apply_moving_average(df, window_size):
    """Add simple and linearly weighted moving averages for numeric columns.

    For each numeric column present when the function is called, two columns
    are added in place:

    * ``<col>_ma``  - rolling mean over ``window_size`` rows, computed from
      the first row onwards (``min_periods=1``).
    * ``<col>_wma`` - rolling average weighted 1..``window_size`` (the most
      recent row weighs most); NaN until a full window is available.

    Args:
        df: DataFrame to modify in place.
        window_size: Number of rows in each rolling window.

    Returns:
        The same DataFrame object with the added columns.
    """
    source_columns = list(df.select_dtypes(include='number').columns)

    for name in source_columns:
        series = df[name]

        simple_ma = series.rolling(window=window_size, min_periods=1).mean()
        df[name + '_ma'] = simple_ma

        weights = np.arange(1, window_size + 1)

        def _weighted_mean(window_values):
            return np.dot(window_values, weights) / weights.sum()

        df[name + '_wma'] = series.rolling(window=window_size).apply(_weighted_mean, raw=True)

    return df

def cluster_data(df, target_columns, n_clusters):
    """Min-max scale the target columns and assign K-means cluster labels.

    Args:
        df: DataFrame to modify in place; ``target_columns`` are overwritten
            with their scaled values and a ``cluster`` column is added.
        target_columns: Numeric columns used as clustering features.
        n_clusters: Number of clusters (K-means uses random seed 0).

    Returns:
        The same DataFrame object with scaled features and ``cluster`` labels.

    Raises:
        ValueError: If a target column is missing or not numeric.
    """
    # Validate every feature before touching the frame
    for name in target_columns:
        if name not in df.columns:
            raise ValueError(f"Target column '{name}' is not found in the DataFrame")
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Target column '{name}' must be numeric")

    # Rescale the features with Min-Max scaling
    scaled_features = MinMaxScaler().fit_transform(df[target_columns])
    df[target_columns] = scaled_features

    # Cluster on the scaled features
    model = KMeans(n_clusters=n_clusters, random_state=0)
    df['cluster'] = model.fit_predict(df[target_columns])

    return df

def filter_with_conditions(df, condition_list):
    """Filter rows by a list of ``'column:op:value'`` conditions (AND-ed).

    Supported operations are ``eq``, ``neq``, ``gt``, ``lt``, ``ge`` and
    ``le``. Only the first two colons separate the fields, so the value itself
    may contain colons (for example ``'time:eq:10:30'``).

    Ordering comparisons always convert the value to ``float``. For ``eq`` and
    ``neq`` the value is converted to ``float`` when the column is numeric
    (non-boolean) and the text parses as a number; otherwise it is compared as
    text.

    Args:
        df: Source DataFrame; it is not modified.
        condition_list: Iterable of condition strings.

    Returns:
        A new DataFrame containing the rows that satisfy every condition.

    Raises:
        ValueError: If a condition is malformed, names an unknown column or
            operation, or an ordering comparison has a non-numeric value.
    """
    # Maps each supported operation to the matching Series comparison method.
    comparison_methods = {
        'eq': 'eq',
        'neq': 'ne',
        'gt': 'gt',
        'lt': 'lt',
        'ge': 'ge',
        'le': 'le',
    }
    equality_ops = ('eq', 'neq')

    df = df.copy()

    for condition in condition_list:
        # maxsplit=2 keeps any further ':' inside the value.
        parts = condition.split(':', 2)
        if len(parts) != 3:
            raise ValueError(f"Condition '{condition}' must have the form 'column:op:value'")
        col, op, value = parts

        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")
        if op not in comparison_methods:
            raise ValueError(f"Unknown operation: {op}")

        column = df[col]
        if op in equality_ops:
            operand = value
            # Comparing a numeric column with the raw text never matches, so
            # 'eq' returned nothing and 'neq' returned everything.
            if pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column):
                try:
                    operand = float(value)
                except ValueError:
                    operand = value
        else:
            operand = float(value)

        keep = getattr(column, comparison_methods[op])(operand)
        df = df[keep]

    return df
```

--------------------------------------------------------------------------------------------------

Problem Statement: Parameters:
- sales_df: pandas.DataFrame # DataFrame containing sales data which includes 'customer_id' and 'product_id'
- cust_df: pandas.DataFrame # DataFrame containing customer information
- prod_df: pandas.DataFrame # DataFrame containing product information
- target_column: str # The numeric column in sales_df to be binned
- bucket_count: int # The number of bins to create for the target_column
- categorical_cols: list of str # List of categorical columns for frequency encoding

Objectives:
- Confirm that both 'customer_id' and 'product_id' exist in sales_df and are valid.
- Merge sales_df with cust_df on 'customer_id' and prod_df on 'product_id' to include additional customer and product data in sales_df.
- Utilize the 'sales_amount' column in sales_df to compute total sales for each customer and each product line, generating separate summary dataframes for both.
- Validate the target_column parameter to ensure it exists in sales_df and is numeric, then create specified bins for this column.
- Perform frequency encoding on the categorical_cols by replacing original categorical values with their respective frequencies in sales_df.
- Combine the modified sales_df with the summary dataframes generated in step 3 into a final output structure, ensuring clarity and usability of the data for downstream analysis.

Return Values:
- enriched_sales_df: pandas.DataFrame # Modified sales_df containing additional customer, product data, and frequency encoded columns
- customer_sales_summary: pandas.DataFrame # Summary DataFrame showing total sales for each customer
- product_sales_summary: pandas.DataFrame # Summary DataFrame showing total sales for each product

The name of the function you create should be process_sales_data

--------------------------------------------------------------------------------------------------

Answer Code:
def process_sales_data(sales_df, cust_df, prod_df, target_column, bucket_count, categorical_cols):
    """Enrich, bin and frequency-encode sales data.

    The work is delegated to three helpers defined elsewhere:
    ``sales_enrichment`` (returns the enriched frame plus customer and product
    summaries), ``bin_numeric_values`` and ``replace_with_frequency``. Their
    exact behaviour is not visible from this function.

    Args:
        sales_df: Sales data; must contain 'customer_id' and 'product_id'.
        cust_df: Customer information passed to ``sales_enrichment``.
        prod_df: Product information passed to ``sales_enrichment``.
        target_column: Numeric column of the enriched frame to bin.
        bucket_count: Number of bins passed to ``bin_numeric_values``.
        categorical_cols: Columns passed to ``replace_with_frequency``.

    Returns:
        Tuple ``(enriched_sales_df, customer_sales_summary,
        product_sales_summary)``.

    Raises:
        ValueError: If the id columns are missing from ``sales_df``, or if
            ``target_column`` is missing from the enriched frame or is not
            numeric.
    """
    # Step 1: Validate required columns in sales_df
    required_ids = ('customer_id', 'product_id')
    if not all(col in sales_df.columns for col in required_ids):
        raise ValueError("sales_df must contain 'customer_id' and 'product_id' columns")

    # Step 2: Enrich sales_df with customer and product data
    enrichment = sales_enrichment(sales_df, cust_df, prod_df)
    enriched, customer_sales_summary, product_sales_summary = enrichment

    # Step 3: Validate and bin the target_column
    if target_column not in enriched.columns:
        raise ValueError(f"Column '{target_column}' not found in DataFrame")
    target_is_numeric = pd.api.types.is_numeric_dtype(enriched[target_column])
    if not target_is_numeric:
        raise ValueError(f"Column '{target_column}' must be numeric")
    binned = bin_numeric_values(enriched, target_column, bucket_count)

    # Step 4: Perform frequency encoding on specified categorical columns
    encoded = replace_with_frequency(binned, categorical_cols)

    return encoded, customer_sales_summary, product_sales_summary

--------------------------------------------------------------------------------------------------

Test Code:
# Test script for process_sales_data: builds small sales/customer/product
# frames, calls the function, recomputes the expected outputs inline with plain
# pandas, and asserts that the returned frames equal the expected ones.

# Import statements if required
import pandas as pd

# Import the function under test from the project module
from sales_processing import process_sales_data

# Initialize input parameters
sales_df = pd.DataFrame({
    'customer_id': [1, 2, 1, 3],
    'product_id': [101, 102, 103, 101],
    'sales_amount': [100, 200, 150, 300]
})

cust_df = pd.DataFrame({
    'customer_id': [1, 2, 3],
    'customer_name': ['Alice', 'Bob', 'Charlie']
})

prod_df = pd.DataFrame({
    'product_id': [101, 102, 103],
    'product_name': ['Product A', 'Product B', 'Product C']
})

target_column = 'sales_amount'
bucket_count = 3
categorical_cols = ['customer_id', 'product_id']

# Call function with input parameters
return_enriched_sales_df, return_customer_sales_summary, return_product_sales_summary = process_sales_data(
    sales_df, cust_df, prod_df, target_column, bucket_count, categorical_cols
)

# Step-by-step run-through of function to obtain intermediate outputs:

# Step 1
# Explanation: Validate the presence of required columns in sales_df
if 'customer_id' not in sales_df.columns or 'product_id' not in sales_df.columns:
    raise ValueError("sales_df must contain 'customer_id' and 'product_id' columns")

# Step 2
# Explanation: Enrich sales_df with customer and product information.
# Left joins keep every sales row, even one without a matching customer/product.
enriched_sales_df = sales_df.merge(cust_df, on='customer_id', how='left')
enriched_sales_df = enriched_sales_df.merge(prod_df, on='product_id', how='left')

# Step 3
# Explanation: Validate and bin the target_column
if target_column not in enriched_sales_df.columns:
    raise ValueError(f"Column '{target_column}' not found in DataFrame")

if not pd.api.types.is_numeric_dtype(enriched_sales_df[target_column]):
    raise ValueError(f"Column '{target_column}' must be numeric")

# An integer `bins` makes pd.cut split the observed value range into that many
# equal-width intervals; they are labelled bin_1 .. bin_<bucket_count>.
bins = pd.cut(enriched_sales_df[target_column], bins=bucket_count, labels=[f'bin_{i}' for i in range(1, bucket_count + 1)])
enriched_sales_df[f'{target_column}_bin'] = bins

# Step 4
# Explanation: Perform frequency encoding on specified categorical columns.
# The counts go into new '<col>_freq' columns; the original columns are kept.
for col in categorical_cols:
    freq = enriched_sales_df[col].value_counts().to_dict()
    enriched_sales_df[col + '_freq'] = enriched_sales_df[col].map(freq)

# Calculate summaries: total sales_amount per customer and per product
customer_sales_summary = enriched_sales_df.groupby('customer_id')['sales_amount'].sum().reset_index()
product_sales_summary = enriched_sales_df.groupby('product_id')['sales_amount'].sum().reset_index()

# Final Expected Output:
correct_enriched_sales_df = enriched_sales_df
correct_customer_sales_summary = customer_sales_summary
correct_product_sales_summary = product_sales_summary

# Assert statements (compulsory) to check if the function returns the correct values.
# DataFrame.equals needs the same shape, values and column dtypes, so these only
# pass if the function produces exactly the frames built above.
# NOTE(review): this presumes the helpers behind process_sales_data use the same
# column names ('<target>_bin', '<col>_freq') and bin labels - confirm.
assert return_enriched_sales_df.equals(correct_enriched_sales_df)
assert return_customer_sales_summary.equals(correct_customer_sales_summary)
assert return_product_sales_summary.equals(correct_product_sales_summary)

print('All-Pass')

--------------------------------------------------------------------------------------------------

