Question No: 4
Context Size: 18729

Codebase:

File: feature_engineering.py:

```Python
import pandas as pd
import numpy as np

def polynomial_feature_selection(df, dependent_var, independent_vars):
    """Keep raw/squared/cubed predictors strongly correlated with the target.

    For every name in ``independent_vars`` a ``<name>_squared`` and a
    ``<name>_cubed`` column are derived on a copy of ``df``. Each of the three
    candidates is kept when the absolute value of its correlation with
    ``dependent_var`` is at least 0.7.

    Returns:
        A DataFrame holding the kept candidate columns (in evaluation order)
        followed by ``dependent_var``.
    """
    min_abs_corr = 0.7
    work = df.copy()
    kept = []

    for base in independent_vars:
        squared_name = f'{base}_squared'
        cubed_name = f'{base}_cubed'
        work[squared_name] = work[base] ** 2
        work[cubed_name] = work[base] ** 3

        # A NaN correlation (e.g. a constant candidate) never passes the test.
        kept.extend(
            name
            for name in (base, squared_name, cubed_name)
            if abs(work[name].corr(work[dependent_var])) >= min_abs_corr
        )

    return work[kept + [dependent_var]]

def count_keywords(df, text_col, keyword_list):
    """Count case-insensitive literal occurrences of each keyword per row.

    Args:
        df: Input DataFrame; it is not modified.
        text_col: Name of the column holding the text to search.
        keyword_list: Iterable of keyword strings.

    Returns:
        A copy of ``df`` with one ``count_<keyword>`` column per keyword.

    Raises:
        ValueError: If ``text_col`` is not a column of ``df``.
    """
    import re

    if text_col not in df.columns:
        raise ValueError(f"Column '{text_col}' not found in DataFrame")

    df = df.copy()
    # Lower-case the text once instead of once per keyword.
    lowered_text = df[text_col].str.lower()

    for keyword in keyword_list:
        # Series.str.count interprets its argument as a regular expression;
        # escape it so keywords such as "c++" or "a.b" are matched literally.
        pattern = re.escape(keyword.lower())
        df[f'count_{keyword}'] = lowered_text.str.count(pattern)

    return df

def extract_datetime_components(df, datetime_col):
    """Add ``year``, ``month``, ``day``, ``hour`` and ``minute`` columns.

    The components are read from ``datetime_col`` on a copy of ``df``.

    Raises:
        ValueError: If ``datetime_col`` is absent or not a datetime column.
        RuntimeError: If any generated component column contains NaN
            (which happens when the source column holds missing timestamps).
    """
    column_is_usable = (
        datetime_col in df.columns
        and pd.api.types.is_datetime64_any_dtype(df[datetime_col])
    )
    if not column_is_usable:
        raise ValueError(f"Column '{datetime_col}' not found or is not a datetime column in DataFrame")

    result = df.copy()
    parts = ('year', 'month', 'day', 'hour', 'minute')

    for part in parts:
        result[part] = getattr(result[datetime_col].dt, part)

    # Validate only after every component has been generated.
    for part in parts:
        if result[part].isna().any():
            raise RuntimeError(f"Generated column '{part}' contains NaN values which is unexpected")

    return result

def bootstrap_confidence_intervals(df, cat_col, num_col, n_boot=1000, ci=95, random_state=None):
    """Bootstrap a confidence interval of the mean of ``num_col`` per category.

    Args:
        df: Input DataFrame (not modified).
        cat_col: Name of a column with categorical dtype.
        num_col: Name of a numeric column.
        n_boot: Number of bootstrap resamples per category.
        ci: Confidence level in percent (e.g. 95).
        random_state: Optional seed (or anything accepted by
            ``numpy.random.default_rng``) to make the resampling reproducible.

    Returns:
        DataFrame with one row per observed category and the columns
        ``cat_col``, ``<num_col>_mean``, ``<ci>_ci_low`` and ``<ci>_ci_high``.

    Raises:
        ValueError: If a column is missing or has the wrong type, or if a
            category has fewer than two non-missing values.
    """
    # Step 1: Validate the categorical and numerical columns.
    # isinstance on the dtype replaces the deprecated is_categorical_dtype.
    if cat_col not in df.columns or not isinstance(df[cat_col].dtype, pd.CategoricalDtype):
        raise ValueError(f"Column {cat_col} is not present or not of categorical type")
    if num_col not in df.columns or not pd.api.types.is_numeric_dtype(df[num_col]):
        raise ValueError(f"Column {num_col} is not present or not of numerical type")

    rng = np.random.default_rng(random_state)
    tail = (100 - ci) / 2
    results = []

    # Step 2: Perform bootstrap resampling.
    # Rows with a missing category are skipped: `== NaN` never matches, so
    # they would otherwise always trigger the "not enough data" error.
    for cat_value in df[cat_col].dropna().unique():
        values = df.loc[df[cat_col] == cat_value, num_col].dropna().to_numpy(dtype=float)
        if len(values) < 2:
            raise ValueError(f"Not enough data for {cat_value} in column {cat_col}")

        boot_means = [
            rng.choice(values, size=len(values), replace=True).mean()
            for _ in range(n_boot)
        ]

        # Calculate confidence intervals
        lower_bound, upper_bound = np.percentile(boot_means, [tail, 100 - tail])

        results.append({
            cat_col: cat_value,
            f'{num_col}_mean': np.mean(boot_means),
            f'{ci}_ci_low': lower_bound,
            f'{ci}_ci_high': upper_bound
        })

    return pd.DataFrame(results)

def add_outlier_indicators(df, outlier_std_threshold):
    """Flag values whose |z-score| exceeds ``outlier_std_threshold``.

    For each numeric column ``c`` a 0/1 column ``c_outliers`` is added.
    Note: the columns are added to ``df`` itself, which is also returned.
    """
    # Snapshot the names first: the loop adds new (numeric) columns.
    numeric_names = list(df.select_dtypes(include='number').columns)

    for name in numeric_names:
        series = df[name]
        deviation = (series - series.mean()) / series.std()
        exceeds = np.abs(deviation) > outlier_std_threshold
        df[f'{name}_outliers'] = np.where(exceeds, 1, 0)

    return df

def resample_and_aggregate(df, datetime_col, freq):
    """Resample ``df`` on ``datetime_col`` and compute sum/count/mean per bin.

    The aggregated column names are flattened to ``<column>_<agg>``. A
    ``duration_days`` column holds the number of days between consecutive
    bin labels (0 for the first bin). The input frame is not modified.
    """
    frame = df.copy()
    frame[datetime_col] = pd.to_datetime(frame[datetime_col])

    aggregated = frame.resample(freq, on=datetime_col).agg(['sum', 'count', 'mean'])

    # Collapse the (column, aggregation) MultiIndex into flat names.
    flat_names = []
    for parts in aggregated.columns.values:
        flat_names.append('_'.join(parts).strip())
    aggregated.columns = flat_names

    bin_gaps = aggregated.index.to_series().diff()
    aggregated['duration_days'] = bin_gaps.dt.days.fillna(0)

    return aggregated.reset_index()

def handle_outliers(df, col_name, outlier_method, threshold=3.0):
    """Replace outliers in ``col_name`` with the median of the remaining values.

    Args:
        df: DataFrame to clean. The column is updated on ``df`` itself and
            ``df`` is returned.
        col_name: Name of a numeric column.
        outlier_method: ``'z_score'`` (distance from the mean greater than
            ``threshold`` standard deviations) or ``'iqr'`` (outside
            ``[Q1 - threshold*IQR, Q3 + threshold*IQR]``).
        threshold: Multiplier used by the chosen method.

    Returns:
        ``df`` with outliers (and pre-existing missing values) in ``col_name``
        replaced by the median of the non-outlier values.

    Raises:
        ValueError: If the column is missing or not numeric, or if
            ``outlier_method`` is unknown.
    """
    if col_name not in df.columns:
        raise ValueError(f"Column '{col_name}' not found in the DataFrame")

    if not pd.api.types.is_numeric_dtype(df[col_name]):
        raise ValueError(f"The column '{col_name}' must be numeric")

    col_data = df[col_name]

    if outlier_method == 'z_score':
        mean = col_data.mean()
        std_dev = col_data.std()
        outliers = (col_data - mean).abs() > (threshold * std_dev)
    elif outlier_method == 'iqr':
        Q1 = col_data.quantile(0.25)
        Q3 = col_data.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - (threshold * IQR)
        upper_bound = Q3 + (threshold * IQR)
        outliers = (col_data < lower_bound) | (col_data > upper_bound)
    else:
        raise ValueError("outlier_method must be either 'z_score' or 'iqr'")

    # Build the cleaned column and assign it back explicitly. The previous
    # `df[col].fillna(..., inplace=True)` acted on a temporary object and is a
    # no-op under pandas Copy-on-Write; `mask` also upcasts integer columns
    # cleanly instead of writing NaN into them through `.loc`.
    masked = col_data.mask(outliers)
    df[col_name] = masked.fillna(masked.median())

    return df

def detect_discontinuous_blocks(df, time_col, group_col):
    df[time_col] = pd.to_datetime(df[time_col])
    df = df.sort_values(by=[group_col, time_col])
    
    df['time_diff'] = df.groupby(group_col)[time_col].diff()
    threshold = pd.Timedelta(days=1)
    df['is_new_block'] = df['time_diff'] > threshold
    
    return df

def compute_cov_and_corr(df, col1, col2):
    """Return ``(covariance, correlation)`` of two numeric columns.

    Raises:
        ValueError: If either column is missing or not numeric.
    """
    both_present = col1 in df.columns and col2 in df.columns
    if not both_present:
        raise ValueError("One or both columns do not exist in the DataFrame")

    first, second = df[col1], df[col2]
    both_numeric = (
        pd.api.types.is_numeric_dtype(first)
        and pd.api.types.is_numeric_dtype(second)
    )
    if not both_numeric:
        raise ValueError("Both columns must be of numeric type")

    return first.cov(second), first.corr(second)

def find_highly_correlated_columns(df, min_corr):
    """List column pairs whose absolute correlation exceeds ``min_corr``.

    Args:
        df: DataFrame containing only numeric columns.
        min_corr: Strict lower bound on the absolute correlation.

    Returns:
        A list of ``(name_a, name_b)`` tuples, each sorted internally, one per
        unordered pair, in column order of the correlation matrix.

    Raises:
        ValueError: If any column is not numeric.
    """
    # Validate that all columns are numeric
    if not all(pd.api.types.is_numeric_dtype(df[col]) for col in df.columns):
        raise ValueError("All columns must be numeric")

    # Calculate correlation matrix
    corr_matrix = df.corr()
    names = list(corr_matrix.columns)
    strength = corr_matrix.abs().to_numpy()

    # Walk only the upper triangle: each unordered pair is visited once, so no
    # set-based de-duplication is needed and the output order is deterministic
    # (iterating a set of string tuples varies between interpreter runs).
    correlated_pairs = []
    for i, first in enumerate(names):
        for j in range(i + 1, len(names)):
            # NaN correlations compare False and are skipped.
            if strength[i, j] > min_corr:
                correlated_pairs.append(tuple(sorted((first, names[j]))))

    return correlated_pairs

def rolling_window_features(df, window_size, target_col):
    """Add rolling mean/std/sum features of ``target_col`` on a copy of ``df``.

    The rolling window uses ``min_periods=1``; undefined standard deviations
    are reported as 0. After the features are computed, ``target_col`` itself
    is shifted down by ``window_size`` rows in the returned frame.

    Raises:
        ValueError: If ``target_col`` is missing or not numeric.
    """
    if target_col not in df.columns:
        raise ValueError(f"Column '{target_col}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column '{target_col}' must be of numeric type")

    result = df.copy()
    windowed = result[target_col].rolling(window=window_size, min_periods=1)

    summaries = (
        ('mean', windowed.mean()),
        ('std', windowed.std().fillna(0)),
        ('sum', windowed.sum()),
    )
    for label, values in summaries:
        result[f'{target_col}_rolling_{label}'] = values

    # The source column is shifted last so the features above use raw values.
    result[target_col] = result[target_col].shift(window_size)

    return result
```

File: data_normalization.py:

```Python
import pandas as pd
import numpy as np

def impute_with_rolling_mean(df, fill_columns, rolling_window, min_periods):
    """Fill missing values with a rolling mean and expose the mean itself.

    For each column in ``fill_columns`` missing entries are replaced by the
    rolling mean at that position, and two columns are added:
    ``<col>_rolling_mean`` and ``<col>_rate_of_change`` (first difference of
    the rolling mean). The rolling mean is computed before any filling.
    Note: ``df`` is modified and returned.
    """
    roller = df[fill_columns].rolling(window=rolling_window, min_periods=min_periods)
    window_avg = roller.mean()

    df[fill_columns] = df[fill_columns].fillna(window_avg)

    for name in fill_columns:
        avg_series = window_avg[name]
        df[f'{name}_rolling_mean'] = avg_series
        df[f'{name}_rate_of_change'] = avg_series.diff()

    return df

def compute_weighted_averages(df, columns_list, weights):
    """Add weighted-average and deviation columns and summarise deviations.

    For every column ``c`` in ``columns_list``:
      * ``c_weighted_avg`` is the mean of ``df[c] * weights[c]`` (a constant),
      * ``c_deviation`` is ``df[c]`` minus that constant.

    Returns:
        ``(frame, summary)`` where ``frame`` is a modified copy of ``df`` and
        ``summary`` maps ``c_deviation_mean_abs_dev`` to the mean absolute
        deviation of column ``c``.

    Raises:
        ValueError: If a column has no entry in ``weights``.
    """
    result = df.copy()

    # Pass 1: weighted averages (all of them precede the deviation columns).
    for name in columns_list:
        if name not in weights:
            raise ValueError(f"No weights provided for column: {name}")
        result[f"{name}_weighted_avg"] = (result[name] * weights[name]).mean()

    # Pass 2: deviations from the weighted average.
    for name in columns_list:
        result[f"{name}_deviation"] = result[name] - result[f"{name}_weighted_avg"]

    # Pass 3: mean absolute deviation per column.
    dev_summary = {
        f"{name}_deviation_mean_abs_dev": result[f"{name}_deviation"].abs().mean()
        for name in columns_list
    }

    return result, dev_summary

def filter_outliers_iqr(df, threshold):
    """Blank out IQR outliers and drop rows with too many missing values.

    In every numeric column, values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``
    are replaced with NaN. Rows whose fraction of missing values (across all
    columns) is greater than ``threshold`` are then removed.

    Args:
        df: Input DataFrame; it is not modified.
        threshold: Maximum allowed fraction of missing values per row.

    Returns:
        The filtered DataFrame.
    """
    # Work on a copy: masking the caller's frame in place while returning a
    # different, filtered object left the input silently altered.
    cleaned = df.copy()

    for col in cleaned.select_dtypes(include=[np.number]).columns:
        values = cleaned[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        cleaned[col] = values.mask((values < lower_bound) | (values > upper_bound), np.nan)

    return cleaned[cleaned.isnull().mean(axis=1) <= threshold]

def add_holiday_indicator(df, datetime_col, holidays):
    """Add a boolean ``is_holiday`` column based on calendar dates.

    Args:
        df: Input DataFrame (not modified).
        datetime_col: Column with timestamps; converted with ``pd.to_datetime``
            in the returned copy.
        holidays: List-like of holiday dates (anything ``pd.to_datetime``
            parses).

    Returns:
        A copy of ``df`` with ``datetime_col`` converted and ``is_holiday``
        set to True for rows whose calendar date is a holiday.

    Raises:
        ValueError: If ``datetime_col`` is not a column of ``df``.
    """
    if datetime_col not in df.columns:
        raise ValueError(f"Column '{datetime_col}' not found in DataFrame")

    df = df.copy()
    df[datetime_col] = pd.to_datetime(df[datetime_col])

    # Compare on the date only. An exact-timestamp comparison marks
    # "2024-12-25 10:30" as a non-holiday because the parsed holiday is
    # midnight of that day.
    holiday_dates = pd.DatetimeIndex(pd.to_datetime(holidays)).normalize()
    df['is_holiday'] = df[datetime_col].dt.normalize().isin(holiday_dates)

    return df

def multi_level_groupby(df, group_columns, agg_spec):
    """Group ``df`` by ``group_columns`` and aggregate according to ``agg_spec``.

    Returns:
        The aggregated frame with the group keys restored as columns.

    Raises:
        ValueError: If any grouping column is missing from ``df``.
    """
    absent = []
    for name in group_columns:
        if name not in df.columns:
            absent.append(name)
    if absent:
        raise ValueError(f"Columns {absent} not found in DataFrame")

    summary = df.copy().groupby(group_columns).agg(agg_spec)
    return summary.reset_index()

def map_values_to_reference_dict(df, reference_dict):
    """Translate column values through per-column lookup mappings.

    ``reference_dict`` maps a column name to the mapping passed to
    ``Series.map``; values without a mapping become NaN.
    Note: ``df`` is modified and returned. Columns are processed in dictionary
    order, so columns handled before a missing one stay mapped when the
    error is raised.

    Raises:
        ValueError: If a key of ``reference_dict`` is not a column of ``df``.
    """
    for name, lookup in reference_dict.items():
        if name not in df.columns:
            raise ValueError(f"The column '{name}' specified in the reference dictionary is not present in the dataframe")
        translated = df[name].map(lookup)
        df[name] = translated.fillna(np.nan)

    return df

def compute_average_time_between_events(df, id_col, event_col, timestamp_col):
    """Average gap, in minutes, between consecutive events of each id.

    Rows are ordered by ``[id_col, timestamp_col]`` and the mean difference
    between successive timestamps is computed per id. ``event_col`` is only
    checked for presence.

    Returns:
        DataFrame with the columns ``id_col`` and ``avg_time_diff_minutes``
        (NaN for ids with a single event).

    Raises:
        ValueError: If a column is missing or ``timestamp_col`` is not of
            datetime type.
    """
    # Step 1: Validate the inputs (same order as the arguments).
    for required in (id_col, event_col, timestamp_col):
        if required not in df.columns:
            raise ValueError(f"Column '{required}' not found in the DataFrame")
    if not pd.api.types.is_datetime64_any_dtype(df[timestamp_col]):
        raise ValueError(f"Column '{timestamp_col}' must be of datetime type")

    # Step 2: Sort the DataFrame (sort_values returns a new frame)
    ordered = df.sort_values(by=[id_col, timestamp_col])

    # Step 3: Compute time difference in minutes for consecutive events
    gaps = ordered.groupby(id_col)[timestamp_col].diff()
    ordered['time_diff_minutes'] = gaps.dt.total_seconds() / 60.0

    # Step 4: Compute average time between events
    per_id = ordered.groupby(id_col)['time_diff_minutes'].mean().reset_index()
    return per_id.rename(columns={'time_diff_minutes': 'avg_time_diff_minutes'})

def encode_and_merge_categorical(df, categorical_threshold):
    """One-hot encode object columns whose uniqueness ratio is high enough.

    An object column is encoded when ``nunique / len(df)`` is strictly greater
    than ``categorical_threshold``. Encoded source columns are dropped and the
    dummy columns are appended at the end of the returned frame.
    """
    row_count = len(df)
    to_encode = []
    for name in df.select_dtypes(include='object').columns:
        if df[name].nunique() / row_count > categorical_threshold:
            to_encode.append(name)

    dummies = pd.get_dummies(df[to_encode])
    remainder = df.drop(columns=to_encode)

    return pd.concat([remainder, dummies], axis=1)

def identify_high_variance_categorical_columns(df, categorical_columns, target_column, variance_threshold):
    """Return the categorical columns with a high within-group target variance.

    A column qualifies when the largest per-group variance of
    ``target_column`` (grouping by that column) exceeds
    ``variance_threshold``.

    Raises:
        ValueError: If any listed column or the target is missing.
    """
    required = categorical_columns + [target_column]
    if any(name not in df.columns for name in required):
        raise ValueError("One or more specified columns are not present in the dataframe")

    return [
        name
        for name in categorical_columns
        if df.groupby(name)[target_column].var().max() > variance_threshold
    ]

def normalize_and_threshold_binary(df, target_col, threshold):
    """Min-max normalise ``target_col`` and derive a binary indicator.

    Args:
        df: Input DataFrame (not modified).
        target_col: Name of a numeric column.
        threshold: Cut-off applied to the normalised values.

    Returns:
        A copy of ``df`` with ``<target_col>_normalized`` (values in [0, 1])
        and ``<target_col>_binary`` (True where the normalised value is
        strictly greater than ``threshold``).

    Raises:
        ValueError: If ``target_col`` is missing or not numeric.
    """
    if target_col not in df.columns:
        raise ValueError(f"Column '{target_col}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column '{target_col}' must be numeric")

    df = df.copy()

    min_val = df[target_col].min()
    range_val = df[target_col].max() - min_val

    # A constant column has a zero range; dividing by it produced an all-NaN
    # result. Use a unit divisor so such a column normalises to 0.
    divisor = range_val if range_val != 0 else 1

    df[target_col + '_normalized'] = (df[target_col] - min_val) / divisor

    df[target_col + '_binary'] = df[target_col + '_normalized'] > threshold

    return df

def outlier_detection_and_correction(df, value_col):
    """Replace IQR outliers in ``value_col`` with the column median.

    Values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` count as outliers. The
    median used as replacement is computed from the original column,
    outliers included.

    Args:
        df: Input DataFrame; it is not modified.
        value_col: Name of the column to clean.

    Returns:
        ``(cleaned_df, outlier_count)``.

    Raises:
        ValueError: If ``value_col`` is not a column of ``df``.
    """
    if value_col not in df.columns:
        raise ValueError(f"Column '{value_col}' not found in the DataFrame")

    values = df[value_col]

    # Step 2: Identify outliers using IQR method
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    is_outlier = (values < lower_bound) | (values > upper_bound)

    # Step 3: Replace outliers with the median value.
    # A standalone mask on a copy is used so the caller's frame no longer ends
    # up with a stray 'is_outlier' column and overwritten values.
    cleaned = df.copy()
    cleaned[value_col] = values.mask(is_outlier, values.median())

    # Step 4: Return cleaned df and outlier count
    return cleaned, is_outlier.sum()
```

File: text_processing.py:

```Python
import pandas as pd
from scipy.stats import zscore
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import f_classif
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np

def create_lag_feature(df, id_col, value_col, lag):
    """Add a per-id lagged copy of ``value_col`` and its first difference.

    Args:
        df: Input DataFrame (not modified).
        id_col: Column identifying the entity each row belongs to.
        value_col: Numeric column to lag.
        lag: Number of rows to shift within each id.

    Returns:
        A copy of ``df`` with ``lagged`` (shifted values, gaps filled by cubic
        interpolation along the row index, which requires SciPy) and
        ``lagged_derivative`` (row-to-row difference of ``lagged``).
    """
    df = df.copy()

    # Step 1: Create a lag feature for each unique identifier
    lagged = df.groupby(id_col)[value_col].shift(lag)

    # Step 2: Interpolate missing values using cubic interpolation.
    # The result is assigned explicitly: calling interpolate(inplace=True) on
    # `df['lagged']` operates on a temporary object and leaves the frame
    # unchanged under pandas Copy-on-Write.
    # NOTE(review): interpolation runs over the whole column, i.e. across id
    # boundaries — confirm this is intended.
    df['lagged'] = lagged.interpolate(method='cubic')

    # Step 3: Compute the first derivative of the lagged values
    df['lagged_derivative'] = df['lagged'].diff()

    return df

def preprocess_numerical_columns(df, numerical_columns, fill_strategy):
    """Impute, min-max scale and z-score the given numeric columns.

    Missing values are filled according to ``fill_strategy`` (``'mean'``,
    ``'median'``, ``'mode'`` or ``'constant'``, the latter using -999). The
    columns are then rescaled to [0, 1] and a ``<col>_zscore`` column is
    added for each of them. Note: ``df`` is modified and returned.

    Raises:
        ValueError: If a column is missing or non-numeric, or if
            ``fill_strategy`` is not recognised.
    """
    # Step 1: Validate columns
    for name in numerical_columns:
        usable = name in df.columns and pd.api.types.is_numeric_dtype(df[name])
        if not usable:
            raise ValueError(f"Column {name} is not present or not of numerical type")

    # Step 2: Handle missing values
    selection = df[numerical_columns]
    if fill_strategy == 'mean':
        replacement = selection.mean()
    elif fill_strategy == 'median':
        replacement = selection.median()
    elif fill_strategy == 'mode':
        # mode() may return several rows; the first one is used.
        replacement = selection.mode().iloc[0]
    elif fill_strategy == 'constant':
        replacement = -999  # Assumes a placeholder constant
    else:
        raise ValueError("Invalid fill_strategy specified")

    df[numerical_columns] = selection.fillna(replacement)

    # Step 3: Normalize using min-max scaling
    filled = df[numerical_columns]
    lowest = filled.min()
    highest = filled.max()
    df[numerical_columns] = (filled - lowest) / (highest - lowest)

    # Step 4: Z-score normalization (of the already min-max scaled values)
    for name in numerical_columns:
        df[f'{name}_zscore'] = zscore(df[name])

    return df

def anova_f_score(df, categorical_col, target_col):
    """Compute an F-score for every one-hot level of ``categorical_col``.

    The categorical column is one-hot encoded and the resulting indicator
    columns are passed to ``f_classif`` together with ``target_col``.

    Returns:
        DataFrame with the columns ``Variable`` (indicator name) and
        ``F-Score``.

    Raises:
        ValueError: If a column is missing, ``categorical_col`` is not of
            string type, or ``target_col`` is not numeric.
    """
    for required in (categorical_col, target_col):
        if required not in df.columns:
            raise ValueError(f"Column {required} not found in DataFrame")
    if not pd.api.types.is_string_dtype(df[categorical_col]):
        raise ValueError(f"Column {categorical_col} must be of string type")
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column {target_col} must be of numeric type")

    encoder = OneHotEncoder()
    indicator_values = encoder.fit_transform(df[[categorical_col]]).toarray()
    indicator_names = encoder.get_feature_names_out([categorical_col])
    indicators = pd.DataFrame(indicator_values, columns=indicator_names)

    scores = f_classif(indicators, df[target_col])[0]

    return pd.DataFrame({'Variable': indicators.columns, 'F-Score': scores})

def label_early_dates(df, col):
    """Flag rows whose date falls in the first 10% of the covered date range.

    ``col`` is converted to datetime if necessary. The cut-off is the earliest
    date plus 10% of the range length in whole days (truncated); rows on or
    before the cut-off get ``is_early = True``.
    Note: ``df`` is modified and returned.

    Raises:
        ValueError: If ``col`` is not a column of ``df``.
    """
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in DataFrame")

    already_datetime = pd.api.types.is_datetime64_any_dtype(df[col])
    if not already_datetime:
        df[col] = pd.to_datetime(df[col])

    earliest = df[col].min()
    span_in_days = (df[col].max() - earliest).days
    cutoff = earliest + pd.to_timedelta(int(0.1 * span_in_days), unit='d')

    df['is_early'] = df[col] <= cutoff

    return df

def discretize_and_calculate_stats(df, target_col, bins):
    """Bin ``target_col`` into quantiles and summarise every column per bin.

    Returns:
        DataFrame with a ``bin`` column (the quantile interval) and, for each
        column of ``df``, ``<col>_mean_binned`` and ``<col>_std_binned``.

    Raises:
        ValueError: If ``target_col`` is missing or not numeric.
    """
    if target_col not in df.columns:
        raise ValueError(f"Column '{target_col}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column '{target_col}' must be numeric")

    binned = df.copy()
    binned['quantile_bin'] = pd.qcut(binned[target_col], bins)

    summary = binned.groupby('quantile_bin').agg(['mean', 'std'])

    # Flatten the (column, statistic) MultiIndex.
    flat_names = []
    for parts in summary.columns:
        flat_names.append('_'.join(parts) + '_binned')
    summary.columns = flat_names

    summary = summary.reset_index()
    return summary.rename(columns={'quantile_bin': 'bin'})

def groupby_metrics(df, groupby_columns, metrics_dict):
    """Aggregate ``df`` per group using the metrics in ``metrics_dict``.

    Returns:
        The aggregated frame with the grouping keys as regular columns.

    Raises:
        ValueError: For the first grouping column that is not in ``df``.
    """
    # Ensure columns for grouping exist
    for name in groupby_columns:
        if name in df.columns:
            continue
        raise ValueError(f"Group by column '{name}' is not found in the DataFrame")

    # Group and compute metrics
    aggregated = df.groupby(groupby_columns).agg(metrics_dict)
    return aggregated.reset_index()

def compute_tf_idf(df, text_col, word_list):
    """Append TF-IDF weights for the words in ``word_list`` to ``df``.

    Args:
        df: Input DataFrame (not modified).
        text_col: Column containing the documents.
        word_list: Vocabulary passed to ``TfidfVectorizer``; one output column
            is produced per vocabulary term.

    Returns:
        A copy of ``df`` with one additional column per vocabulary term.
    """
    df = df.copy()

    # Step 1: Tokenize the text
    vectorizer = TfidfVectorizer(vocabulary=word_list)
    tf_idf_matrix = vectorizer.fit_transform(df[text_col])

    # Step 2: Extract TF-IDF values.
    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and add all-NaN rows) whenever df
    # was filtered, sorted or otherwise carries a non-default index.
    tf_idf_df = pd.DataFrame(
        tf_idf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index,
    )

    # Step 3: Append TF-IDF values to the original DataFrame
    df = pd.concat([df, tf_idf_df], axis=1)

    return df

def replace_high_percentage_with_sum(df, percentage_col, threshold):
    """Overwrite large percentages with the row sum of the other numerics.

    Rows where ``percentage_col`` is strictly greater than ``threshold`` get
    that cell replaced by the sum of all other float/int columns of the row.
    The input frame is not modified.
    """
    result = df.copy()

    # Step 1: rows whose percentage exceeds the threshold
    exceeds = result[percentage_col] > threshold

    # Step 2: per-row sum of every other numeric column for those rows
    numeric_names = result.select_dtypes(include=[float, int]).columns
    companions = numeric_names.drop(percentage_col)
    row_totals = result.loc[exceeds, companions].sum(axis=1)

    # Step 3: write the sums back into the percentage column
    result.loc[exceeds, percentage_col] = row_totals

    return result

def create_interaction_terms(df, categorical_col, target_col):
    """Add one ``<category>_x_<target>`` interaction column per category.

    Each new column equals ``target_col`` on rows belonging to the category
    and 0 elsewhere. Note: the columns are added to ``df`` itself, which is
    also returned.

    Args:
        df: DataFrame to extend.
        categorical_col: Name of a column with categorical dtype.
        target_col: Name of the column multiplied with each indicator.

    Raises:
        ValueError: If ``categorical_col`` is missing or not categorical.
    """
    # Step 1: Validate 'categorical_col'.
    # An isinstance check on the dtype replaces the deprecated
    # pd.api.types.is_categorical_dtype.
    if categorical_col not in df.columns or not isinstance(df[categorical_col].dtype, pd.CategoricalDtype):
        raise ValueError(f"Column '{categorical_col}' must be present and of categorical type in DataFrame")

    # Step 2: One-hot encode the categorical column
    one_hot_encoded = pd.get_dummies(df[categorical_col], prefix=categorical_col)

    # Step 3: Create interaction terms
    for col in one_hot_encoded.columns:
        df[f'{col}_x_{target_col}'] = one_hot_encoded[col] * df[target_col]

    return df

def tfidf_svd(df, text_col):
    """Project the TF-IDF representation of ``text_col`` onto two components.

    Returns:
        A copy of ``df`` with ``svd_component_1`` and ``svd_component_2``
        holding the two ``TruncatedSVD`` coordinates of each row's text.
    """
    result = df.copy()

    # TF-IDF Vectorization
    term_weights = TfidfVectorizer().fit_transform(result[text_col])

    # Reduce dimensionality using SVD
    reduced = TruncatedSVD(n_components=2).fit_transform(term_weights)

    component_names = ('svd_component_1', 'svd_component_2')
    for position, name in enumerate(component_names):
        result[name] = reduced[:, position]

    return result

def equal_frequency_binning(df, target_col, n_bins):
    """Assign equal-frequency bins and report mean/std of each bin.

    A ``bin`` column with integer bin codes is added to ``df`` itself
    (duplicate bin edges are dropped), then ``target_col`` is summarised.

    Returns:
        DataFrame with the columns ``bin``, ``mean`` and ``std``.

    Raises:
        ValueError: If ``target_col`` is missing or not numeric.
    """
    # Validate presence and type of target column
    usable = target_col in df.columns and pd.api.types.is_numeric_dtype(df[target_col])
    if not usable:
        raise ValueError(f"Column '{target_col}' is either missing or not numeric in the DataFrame")

    # Create equal-frequency bins
    df['bin'] = pd.qcut(df[target_col], q=n_bins, labels=False, duplicates='drop')

    # Calculate mean and std for each bin
    per_bin = df.groupby('bin')[target_col].agg(['mean', 'std'])
    return per_bin.reset_index()
```

File: scaling_operations.py:

```Python
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def label_encode_and_one_hot(df, target_col, encode_cols):
    """Label-encode the target and one-hot encode the given feature columns.

    ``target_col`` is replaced on ``df`` by integer labels; the returned frame
    additionally has ``encode_cols`` expanded into dummy columns with the
    first level of each dropped.

    Raises:
        ValueError: If ``target_col`` is missing or not of object dtype, or if
            any column of ``encode_cols`` is missing.
    """
    # Step 1: Validate presence and types of columns
    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' is not found in the DataFrame")
    if not pd.api.types.is_object_dtype(df[target_col]):
        raise ValueError(f"Target column '{target_col}' must be of object type")

    for name in encode_cols:
        if name in df.columns:
            continue
        raise ValueError(f"Column '{name}' is not found in the DataFrame")

    # Step 2: Apply label encoding to the target column
    df[target_col] = LabelEncoder().fit_transform(df[target_col])

    # Step 3: Apply one-hot encoding to encode columns
    return pd.get_dummies(df, columns=encode_cols, prefix=encode_cols, drop_first=True)

def vectorize_text_columns(df, text_columns, tfidf_max_features, ngrams):
    """Append TF-IDF feature columns for each text column.

    Args:
        df: Input DataFrame.
        text_columns: Names of string columns to vectorise.
        tfidf_max_features: ``max_features`` for ``TfidfVectorizer``.
        ngrams: ``ngram_range`` tuple for ``TfidfVectorizer``.

    Returns:
        A new DataFrame with columns ``<col>_tfidf_<i>`` appended for every
        text column.

    Raises:
        ValueError: If a column is missing or not of string type.
    """
    # Step 1: Validate columns
    for col in text_columns:
        if col not in df.columns or not pd.api.types.is_string_dtype(df[col]):
            raise ValueError(f"Column {col} is not present or not of string type")

    # Step 2: Vectorize using TF-IDF
    for col in text_columns:
        vectorizer = TfidfVectorizer(max_features=tfidf_max_features, ngram_range=ngrams)
        tfidf_matrix = vectorizer.fit_transform(df[col].values.astype('U'))

        feature_names = [f'{col}_tfidf_{i}' for i in range(tfidf_matrix.shape[1])]
        # Carry over df's index: concat(axis=1) aligns on labels, so a fresh
        # RangeIndex would misalign rows for any non-default index.
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names, index=df.index)
        df = pd.concat([df, tfidf_df], axis=1)

    return df

def interpolate_low_values(df, target_col, low_v_thresh):
    """Replace values below ``low_v_thresh`` by linear interpolation.

    Values strictly below the threshold are treated as gaps and filled by
    interpolating between the neighbouring values. Pre-existing missing values
    are left missing, and low values at the very start of the column (with no
    earlier value to interpolate from) become NaN.
    Note: ``df`` is modified and returned.

    Raises:
        ValueError: If ``target_col`` is missing or not numeric.
    """
    # Step 1: Ensure the column is present and numerical
    if target_col not in df.columns or not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column {target_col} is not present or not numerical")

    # Step 2: Identify low value records
    low_values_mask = df[target_col] < low_v_thresh

    # Step 3: Apply interpolation.
    # interpolate() only fills NaN, so the low values must be blanked first;
    # interpolating the untouched column (as before) returned it unchanged.
    interpolated = df[target_col].mask(low_values_mask).interpolate()
    df[target_col] = df[target_col].where(~low_values_mask, interpolated)

    return df

def apply_column_transformations(df, column_specs):
    """Apply a per-column transformation described by ``column_specs``.

    Args:
        df: DataFrame to transform; it is modified and returned.
        column_specs: Mapping of column name to ``'scale'`` (StandardScaler),
            ``'normalize'`` (MinMaxScaler) or a callable applied element-wise.

    Raises:
        ValueError: If a column is missing, a transformation is not
            recognised, or the result contains NaN or infinite values.
    """
    # Step 1: Validate every spec up front so that a bad entry is reported
    # before any column has been modified.
    for col in column_specs:
        if col not in df.columns:
            raise ValueError(f"Column {col} specified in column_specs is not present in the DataFrame")
    for col, transform in column_specs.items():
        if not (callable(transform) or transform in ('scale', 'normalize')):
            raise ValueError(f"Invalid transformation specified for column {col}")

    # Step 2: Apply transformations
    for col, transform in column_specs.items():
        if callable(transform):
            df[col] = df[col].apply(transform)
        elif transform == 'scale':
            df[col] = StandardScaler().fit_transform(df[[col]])
        else:
            df[col] = MinMaxScaler().fit_transform(df[[col]])

    # Step 3: Ensure no NaN or inf values are generated.
    # np.isinf raises TypeError on object/string data, so only the numeric
    # columns are inspected for infinities.
    numeric_part = df.select_dtypes(include='number')
    if df.isnull().any().any() or np.isinf(numeric_part).any().any():
        raise ValueError("Transformation resulted in NaN or inf values")

    return df

def impute_and_reduce(df, numerical_cols, target_col):
    """KNN-impute, standardise and PCA-reduce the numeric columns.

    ``numerical_cols`` are imputed with a 5-neighbour ``KNNImputer``,
    standardised, and projected onto two principal components that are
    appended as ``PC1`` and ``PC2``. ``target_col`` is accepted but not used.

    Returns:
        ``(frame, explained_variance_ratio)`` where ``frame`` is a copy of
        ``df`` with a fresh 0..n-1 index.
    """
    result = df.copy()

    imputed = KNNImputer(n_neighbors=5).fit_transform(result[numerical_cols])
    result[numerical_cols] = imputed
    result[numerical_cols] = StandardScaler().fit_transform(result[numerical_cols])

    reducer = PCA(n_components=2)
    components = pd.DataFrame(
        reducer.fit_transform(result[numerical_cols]),
        columns=['PC1', 'PC2'],
    )

    # Both frames get a positional index so the concat aligns row by row.
    result = result.reset_index(drop=True)
    components = components.reset_index(drop=True)
    combined = pd.concat([result, components], axis=1)

    return combined, reducer.explained_variance_ratio_

def category_percentile_summary(df, category_col, numerical_col, target_percentile):
    """List categories whose quantile exceeds the overall quantile.

    The ``target_percentile`` quantile of ``numerical_col`` is computed per
    category and compared with the same quantile over the whole column.

    Returns:
        ``(df, summary)`` where ``df`` is the unchanged input and ``summary``
        has the columns ``category_col`` and
        ``<target_percentile>_percentile_value`` for the qualifying categories.
    """
    # Step 1: quantile within each category
    per_category = (
        df.groupby(category_col)[numerical_col]
        .quantile(target_percentile)
        .reset_index()
    )

    # Step 2: keep categories above the overall quantile
    overall = df[numerical_col].quantile(target_percentile)
    above_overall = per_category[per_category[numerical_col] > overall]

    # Step 3: rename the value column for the summary
    value_name = f'{target_percentile}_percentile_value'
    summary_df = above_overall.rename(columns={numerical_col: value_name})

    # Step 4: return the input frame together with the summary
    return df, summary_df

def category_based_scaling(df, category_col, num_cols, scaler_type):
    """Scale numeric columns separately within each category.

    Args:
        df: Input DataFrame (not modified).
        category_col: Column defining the groups.
        num_cols: Numeric columns to scale.
        scaler_type: ``"standard"`` or ``"minmax"``.

    Returns:
        ``(scaled_df, metadata_df)``; the metadata has one row per category
        with the keys ``category``, ``scaler`` and ``scaled_columns``.

    Raises:
        ValueError: If a column is missing or non-numeric, or if
            ``scaler_type`` is unknown.
    """
    # Step 1: Validate columns
    if category_col not in df.columns:
        raise ValueError("Category column not found in DataFrame")

    for name in num_cols:
        usable = name in df.columns and pd.api.types.is_numeric_dtype(df[name])
        if not usable:
            raise ValueError(f"Numeric column {name} not found or not numeric in DataFrame")

    # Step 2: Validate scaler type
    if scaler_type not in ["standard", "minmax"]:
        raise ValueError("Invalid scaler type. Use 'standard' or 'minmax'")

    # Step 3: Apply scaling per category (a fresh scaler is fitted per group)
    make_scaler = StandardScaler if scaler_type == "standard" else MinMaxScaler
    scaled = df.copy()
    records = []

    for category in scaled[category_col].unique():
        rows = scaled[category_col] == category
        scaled.loc[rows, num_cols] = make_scaler().fit_transform(scaled.loc[rows, num_cols])

        # Step 4: Store metadata
        records.append({
            "category": category,
            "scaler": scaler_type,
            "scaled_columns": num_cols
        })

    return scaled, pd.DataFrame(records)

def extract_date_features(df, date_col):
    """Derive year/month/day and a day offset from ``date_col``.

    ``date_col`` is converted to datetime on ``df`` itself, and the columns
    ``<date_col>_year``, ``<date_col>_month``, ``<date_col>_day`` and
    ``<date_col>_days_from_ref`` (whole days since 2000-01-01) are added.
    ``df`` is returned.

    Raises:
        ValueError: If the column cannot be converted to datetime (this
            includes the column not existing).
    """
    # Step 1: Validate if date_col contains date-like objects
    try:
        df[date_col] = pd.to_datetime(df[date_col])
    except Exception as e:
        raise ValueError(f"The column '{date_col}' cannot be converted to datetime.\nError: {e}")

    # Step 2: Extract year, month, and day as separate columns
    for part in ('year', 'month', 'day'):
        df[f'{date_col}_{part}'] = getattr(df[date_col].dt, part)

    # Step 3: Difference in days from a fixed reference date
    elapsed = df[date_col] - pd.Timestamp('2000-01-01')
    df[f'{date_col}_days_from_ref'] = elapsed.dt.days

    return df

def filter_groups_by_mean_threshold(df, group_col, agg_col, threshold):
    """Flag rows belonging to groups whose mean exceeds ``threshold``.

    No rows are removed: a boolean ``filtered`` column is merged onto ``df``,
    True for every row of a group whose mean of ``agg_col`` is strictly
    greater than ``threshold``.

    Raises:
        ValueError: If a column is missing or ``agg_col`` is not numeric.
    """
    # Step 1: Validate columns
    both_present = group_col in df.columns and agg_col in df.columns
    if not both_present:
        raise ValueError(f"Column {group_col} or {agg_col} is not present in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[agg_col]):
        raise ValueError(f"Column {agg_col} is not numeric")

    # Step 2: Calculate group statistics
    per_group = df.groupby(group_col)[agg_col].agg(['mean', 'std']).reset_index()

    # Step 3: Mark groups by mean and attach the flag to every row
    per_group['filtered'] = per_group['mean'] > threshold
    flags = per_group[[group_col, 'filtered']]

    return pd.merge(df, flags, on=group_col)

def encode_and_calculate_mean(df, cat_col, target_col, encode_map):
    """Encode a categorical column and attach the per-code target mean.

    ``cat_col`` is mapped through ``encode_map`` on a copy of ``df`` and a
    ``target_mean`` column with the mean of ``target_col`` for each encoded
    value is merged in.

    Raises:
        ValueError: If ``cat_col`` is missing or not of object dtype, or if
            ``target_col`` is missing or not numeric.
    """
    # Step 1: Validate columns
    cat_ok = cat_col in df.columns and pd.api.types.is_object_dtype(df[cat_col])
    if not cat_ok:
        raise ValueError(f"Column '{cat_col}' must be present and of object type in DataFrame")
    target_ok = target_col in df.columns and pd.api.types.is_numeric_dtype(df[target_col])
    if not target_ok:
        raise ValueError(f"Column '{target_col}' must be present and of numeric type in DataFrame")

    encoded = df.copy()

    # Step 2: Encode the categorical column
    encoded[cat_col] = encoded[cat_col].map(encode_map)

    # Step 3: Mean of the target column for each encoded value
    group_means = encoded.groupby(cat_col)[target_col].mean()
    target_mean_df = group_means.reset_index(name='target_mean')

    # Step 4: Merge the means back onto the rows
    return pd.merge(encoded, target_mean_df, on=cat_col, how='left')

def label_encode_and_z_score(df, cat_cols, numerical_cols):
    """Label-encode categorical columns and clip z-scored numeric columns.

    Each column of ``cat_cols`` is cast to string and replaced by integer
    labels; each column of ``numerical_cols`` is standardised with its sample
    standard deviation and clipped to [-3, 3].
    Note: ``df`` is modified and returned.
    """
    for name in cat_cols:
        as_text = df[name].astype(str)
        df[name] = LabelEncoder().fit_transform(as_text)

    for name in numerical_cols:
        values = df[name]
        standardized = (values - values.mean()) / values.std()
        df[name] = standardized.clip(lower=-3, upper=3)

    return df
```

File: group_analysis.py:

```Python
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import zscore

def compute_time_differences(df, datetime_col, grp_col):
    """Add date parts and the day gap to the previous row of the same group.

    On a copy of ``df`` the columns ``year``, ``month`` and ``day`` are
    extracted from ``datetime_col`` and ``time_diff`` holds the whole-day
    difference to the preceding row within ``grp_col`` (in the frame's
    current row order; NaN for the first row of each group).

    Raises:
        ValueError: If a column is missing or ``datetime_col`` is not of
            datetime type.
    """
    for required in (datetime_col, grp_col):
        if required not in df.columns:
            raise ValueError(f"Column '{required}' not found in DataFrame")
    if not pd.api.types.is_datetime64_any_dtype(df[datetime_col]):
        raise ValueError(f"Column '{datetime_col}' must be of datetime type")

    result = df.copy()
    for part in ('year', 'month', 'day'):
        result[part] = getattr(result[datetime_col].dt, part)

    gaps = result.groupby(grp_col)[datetime_col].diff()
    result['time_diff'] = gaps.dt.days

    return result

def assign_quantile_labels(df, quantile_ranges, col_to_quantile):
    """Label each row ``Q1``..``Q4`` according to quantile-based bins.

    The quantiles given by ``quantile_ranges`` are used as bin edges; since
    four labels are assigned, five edges are expected. The lowest edge is
    inclusive. The ``quantile_label`` column is added to ``df`` itself.

    Raises:
        ValueError: If ``col_to_quantile`` is missing or not numerical.
    """
    # Step 1: Ensure specified column exists and is numerical
    if col_to_quantile not in df.columns:
        raise ValueError(f"Column {col_to_quantile} does not exist in DataFrame")

    column_dtype = df[col_to_quantile].dtype
    if not np.issubdtype(column_dtype, np.number):
        raise ValueError(f"Column {col_to_quantile} is not numerical")

    # Step 2: Use the requested quantiles as bin edges
    edges = df[col_to_quantile].quantile(quantile_ranges).values

    # Step 3: Create a new column for quantile labels
    df['quantile_label'] = pd.cut(
        df[col_to_quantile],
        bins=edges,
        labels=['Q1', 'Q2', 'Q3', 'Q4'],
        include_lowest=True,
    )

    return df

def consecutive_date_differences(df, date_cols):
    """Day differences between each pair of neighbouring date columns.

    For consecutive names ``a, b`` in ``date_cols`` a column ``a_to_b_days``
    is added on a copy of ``df``. Negative and undefined differences are
    reported as 0.
    """
    result = df.copy()

    # Step 1: Convert date columns to datetime if not already
    for name in date_cols:
        result[name] = pd.to_datetime(result[name])

    # Step 2: Differences between consecutive date columns
    for earlier, later in zip(date_cols[:-1], date_cols[1:]):
        day_gap = (result[later] - result[earlier]).dt.days

        # Step 3: Anything that is not >= 0 (negative or NaN) becomes 0
        result[f'{earlier}_to_{later}_days'] = day_gap.where(day_gap >= 0, 0)

    return result

def forward_fill_and_interaction(df, columns, step):
    """Forward-fill columns, derive scaled/log features and pairwise products.

    ``columns`` are forward-filled for at most ``step`` consecutive gaps on
    ``df`` itself; ``<col>_scaled`` (z-score with sample std) and
    ``<col>_log`` (``log1p``) are added, and the returned frame additionally
    contains ``<a>_x_<b>`` for every unordered pair of ``columns``.
    """
    df[columns] = df[columns].ffill(limit=step)

    for name in columns:
        values = df[name]
        df[f'{name}_scaled'] = (values - values.mean()) / values.std()
        df[f'{name}_log'] = np.log1p(values)

    # Products of each column with every later column in the list.
    pairwise = pd.DataFrame(index=df.index)
    for position in range(len(columns)):
        left = columns[position]
        for right in columns[position + 1:]:
            pairwise[f'{left}_x_{right}'] = df[left] * df[right]

    return pd.concat([df, pairwise], axis=1)

def z_score_per_segment(df, segment_col, numerical_cols):
    """Compute z-scores of numeric columns separately within each segment.

    For every distinct value of ``segment_col`` and every column in
    ``numerical_cols`` a ``z_{col}`` column is filled with
    ``scipy.stats.zscore`` of that segment's values.

    Args:
        df: Input DataFrame (not modified; a copy is returned).
        segment_col: Column whose distinct values define the segments.
        numerical_cols: List of numeric column names to standardise.

    Returns:
        A copy of ``df`` with the added ``z_{col}`` columns.

    Raises:
        ValueError: If any referenced column is missing.
    """
    # Step 1: Validate columns
    for col in [segment_col] + numerical_cols:
        if col not in df.columns:
            raise ValueError(f"Column {col} is not present in the dataframe")

    # Imported here because this module's top-level imports provide only
    # pandas and numpy; without it the ``stats`` name is undefined.
    from scipy import stats

    segmented_df = df.copy()

    # Step 2: Calculate z-scores within each segment
    for segment_value in segmented_df[segment_col].unique():
        in_segment = segmented_df[segment_col] == segment_value
        for col in numerical_cols:
            # Plain array so the assignment is positional within the mask.
            z_scores = np.asarray(stats.zscore(segmented_df.loc[in_segment, col]))
            segmented_df.loc[in_segment, f'z_{col}'] = z_scores

    return segmented_df

def categorical_summary(df, datetime_column, categories):
    """Build count/mean/std summaries of all numeric columns per category.

    ``df`` is modified in place: ``datetime_column`` is parsed to datetime and
    ``year``, ``month`` and ``day`` columns are added. These derived columns
    are numeric and therefore part of the summaries.

    Args:
        df: DataFrame to summarise.
        datetime_column: Column to parse with ``pd.to_datetime``.
        categories: Iterable of column names to group by, one table each.

    Returns:
        Dict mapping each category column name to its aggregated DataFrame.
    """
    # Step 1: Parse the timestamp column and derive calendar parts
    df[datetime_column] = pd.to_datetime(df[datetime_column])
    for part in ('year', 'month', 'day'):
        df[part] = getattr(df[datetime_column].dt, part)

    # Step 2: One aggregated table per grouping column
    numeric_columns = df.select_dtypes(include=['number']).columns
    aggregations = {name: ['count', 'mean', 'std'] for name in numeric_columns}

    return {category: df.groupby(category).agg(aggregations) for category in categories}

def sample_and_interact(df, sample_size):
    """Sample rows and add standardised pairwise interaction features.

    A fixed-seed sample of ``sample_size`` rows is drawn and re-indexed. For
    every ordered pair of distinct *original* columns ``(a, b)`` a column
    ``a_x_b`` holding their product is added and then standardised to zero
    mean and unit (sample) standard deviation.

    The set of base columns is frozen before any interaction is created, so
    interaction columns are never multiplied with each other, and only the
    columns created here are standardised (a pre-existing column that merely
    contains ``_x_`` in its name is left alone).

    Args:
        df: Input DataFrame; all columns must support multiplication.
        sample_size: Number of rows to sample.

    Returns:
        The sampled DataFrame with the interaction columns appended.
    """
    df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

    # Step 1: Create pairwise feature interactions over the original columns
    base_columns = list(df.columns)
    created = []
    for col1 in base_columns:
        for col2 in base_columns:
            if col1 != col2:
                interaction_col = col1 + '_x_' + col2
                df[interaction_col] = df[col1] * df[col2]
                created.append(interaction_col)

    # Step 2: Standardize the interaction terms created above
    for col in created:
        df[col] = (df[col] - df[col].mean()) / df[col].std()

    return df

def mark_rare_categories(df, categorical_columns, rare_threshold):
    """Replace infrequent category values with the label ``'Rare'``.

    A value is rare when its relative frequency in the column is strictly
    below ``rare_threshold``. The listed columns of ``df`` are overwritten in
    place.

    Args:
        df: DataFrame to modify.
        categorical_columns: Column names to process.
        rare_threshold: Relative-frequency cut-off.

    Returns:
        The same ``df`` with rare values relabelled.

    Raises:
        ValueError: If any listed column is missing.
    """
    for name in categorical_columns:
        if name not in df.columns:
            raise ValueError("One or more categorical columns not found in DataFrame")

    for name in categorical_columns:
        shares = df[name].value_counts(normalize=True)
        rare_values = shares[shares < rare_threshold].index

        def relabel(value, rare_values=rare_values):
            if value in rare_values:
                return 'Rare'
            return value

        df[name] = df[name].apply(relabel)

    return df

def clip_outliers(df, cols_to_clip, lower_quantile, upper_quantile):
    """Clip numeric columns to the range spanned by two of their quantiles.

    Args:
        df: Input DataFrame (not modified; a copy is returned).
        cols_to_clip: Names of the numeric columns to clip.
        lower_quantile: Probability whose quantile is the lower bound.
        upper_quantile: Probability whose quantile is the upper bound.

    Returns:
        A copy of ``df`` with the listed columns clipped.

    Raises:
        ValueError: If a listed column is missing or not numeric.
    """
    # Validate every column up front so nothing is clipped on bad input.
    for name in cols_to_clip:
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Column '{name}' must be numeric")

    clipped = df.copy()
    for name in cols_to_clip:
        values = clipped[name]
        floor = values.quantile(lower_quantile)
        ceiling = values.quantile(upper_quantile)
        clipped[name] = values.clip(floor, ceiling)

    return clipped

def grouped_timeseries_analysis(df, group_col, time_col, value_col):
    """Add per-group cumulative sum and EWMA, then drop per-group outliers.

    Rows are sorted by group and time. Within each group a running total
    (``cumulative_sum``) and an exponentially weighted mean with span 10
    (``ewm``) of ``value_col`` are added. Rows whose within-group z-score
    (population standard deviation, ``ddof=0``) exceeds 3 in absolute value
    are removed.

    A z-score that is undefined (a single-row or constant group divides by a
    zero standard deviation) is not treated as an outlier, so such rows are
    kept.

    Args:
        df: Input DataFrame (not modified; sorting returns a new frame).
        group_col: Column identifying the groups.
        time_col: Column giving the ordering within a group.
        value_col: Numeric column to analyse.

    Returns:
        The sorted, filtered DataFrame with ``cumulative_sum`` and ``ewm``.
    """
    # Sort the dataframe by group_col and time_col
    df = df.sort_values(by=[group_col, time_col])
    grouped = df.groupby(group_col)[value_col]

    # Compute all per-group series before attaching any of them
    cumulative_sum = grouped.cumsum()
    ewm = grouped.transform(lambda x: x.ewm(span=10, adjust=False).mean())
    # Computed with pandas directly: no ``zscore`` helper is imported here.
    z_scores = grouped.transform(lambda x: (x - x.mean()) / x.std(ddof=0))

    df['cumulative_sum'] = cumulative_sum
    df['ewm'] = ewm

    # NaN z-scores compare False, so undefined scores are retained.
    is_outlier = z_scores.abs() > 3
    return df[~is_outlier]

def merge_and_concatenate(df, cols_to_merge, id_col):
    """Collapse rows sharing an id by space-joining each column's values.

    The frame is melted to long form and pivoted back so that each id has one
    row and each merged column, renamed ``{col}_merged``, holds the
    space-separated string form of its non-missing values.

    Args:
        df: Input DataFrame.
        cols_to_merge: List of column names whose values are concatenated.
        id_col: Column identifying rows that belong together.

    Returns:
        A flat DataFrame with ``id_col`` and one ``*_merged`` column per input.

    Raises:
        ValueError: If ``id_col`` or any column to merge is missing.
    """
    required = [id_col] + cols_to_merge
    missing_columns = [name for name in required if name not in df.columns]
    if missing_columns:
        raise ValueError(f"The following columns are not in the DataFrame: {missing_columns}")

    def join_present(values):
        # Skip missing entries; everything else is joined by its string form.
        return ' '.join(str(item) for item in values if pd.notna(item))

    long_form = pd.melt(
        df,
        id_vars=[id_col],
        value_vars=cols_to_merge,
        var_name='original_col',
        value_name='value',
    )
    wide = long_form.pivot_table(
        index=id_col, columns='original_col', values='value', aggfunc=join_present
    )
    wide.columns = [f"{name}_merged" for name in wide.columns]

    # Flatten the id back into an ordinary column
    return wide.reset_index()
```

File: anomaly_detection.py:

```Python
import pandas as pd
from collections import Counter
import re
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
import numpy as np
from sklearn.model_selection import train_test_split

def rolling_avg_multi_patterns(df, col_patterns):
    """Add 3-, 5- and 10-row rolling means for columns matching any pattern.

    A column matches when any string in ``col_patterns`` is a substring of
    its name. For each match, ``{col}_rolling_{window}`` columns are added to
    ``df`` in place.

    Args:
        df: DataFrame to extend.
        col_patterns: Substrings used to select columns.

    Returns:
        The same ``df`` with the rolling-mean columns.

    Raises:
        ValueError: If no column matches.
    """
    matching_cols = []
    for name in df.columns:
        for pattern in col_patterns:
            if pattern in name:
                matching_cols.append(name)
                break

    if not matching_cols:
        raise ValueError("No matching columns found")

    for name in matching_cols:
        source = df[name]
        for window in (3, 5, 10):
            df[f'{name}_rolling_{window}'] = source.rolling(window=window).mean()

    return df

def filter_by_thresholds(df, target_col, thresholds):
    """Keep only rows whose values are strictly above per-column thresholds.

    Args:
        df: Input DataFrame (not modified).
        target_col: Column that must exist and be numeric; it is validated
            but not otherwise used by the filtering.
        thresholds: Dict mapping numeric column names to exclusive minimums.

    Returns:
        Dict with ``removed_count`` (rows filtered out) and ``filtered_df``.

    Raises:
        ValueError: If a column is missing or non-numeric, or ``thresholds``
            is not a dict.
    """
    def ensure_numeric_column(frame, name):
        if name not in frame.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")
        if not pd.api.types.is_numeric_dtype(frame[name]):
            raise ValueError(f"Column '{name}' must be numeric")

    ensure_numeric_column(df, target_col)
    if not isinstance(thresholds, dict):
        raise ValueError("Thresholds must be a dictionary with column names as keys and threshold values as values")

    rows_before = len(df)
    kept = df
    for name, minimum in thresholds.items():
        ensure_numeric_column(kept, name)
        kept = kept[kept[name] > minimum]

    return {"removed_count": rows_before - len(kept), "filtered_df": kept}

def extract_top_words(df, text_column, top_n_words):
    """Add binary ``contains_{word}`` features for the most frequent words.

    Text is lower-cased and tokenised with the regex ``\\b\\w+\\b``. The same
    token lists are used both for counting and for the per-row flags, so a
    word followed by punctuation ("hello,") is recognised in both steps.
    Non-string cells (e.g. missing values) contribute no tokens.

    Args:
        df: DataFrame to extend (modified in place).
        text_column: Name of the string column to analyse.
        top_n_words: Number of most frequent words to turn into features.

    Returns:
        The same ``df`` with one 0/1 ``contains_{word}`` column per top word.

    Raises:
        ValueError: If the column is missing or not of string type.
    """
    # Step 1: Validate column
    if text_column not in df.columns or not pd.api.types.is_string_dtype(df[text_column]):
        raise ValueError(f"Column {text_column} is not present or not of string type")

    # Step 2: Tokenize each row once
    word_pattern = re.compile(r'\b\w+\b')
    tokens_per_row = [
        word_pattern.findall(text.lower()) if isinstance(text, str) else []
        for text in df[text_column]
    ]

    # Step 3: Count words incrementally (no repeated list concatenation)
    word_counts = Counter()
    for tokens in tokens_per_row:
        word_counts.update(tokens)
    top_words = [word for word, _ in word_counts.most_common(top_n_words)]

    # Step 4: Create binary features from the same tokens used for counting
    row_vocabularies = [set(tokens) for tokens in tokens_per_row]
    for word in top_words:
        df[f'contains_{word}'] = [int(word in vocabulary) for vocabulary in row_vocabularies]

    return df

def mark_and_count_holidays(df, date_col, holidays):
    if date_col not in df.columns:
        raise ValueError(f"Column '{date_col}' not found in DataFrame")
    if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        raise ValueError(f"Column '{date_col}' must be of datetime type")
    
    holidays = pd.to_datetime(holidays)
    df['is_holiday'] = df[date_col].isin(holidays)
    
    df['weekdays_holiday_count'] = df.apply(
        lambda row: sum(
            (row[date_col].date() == holiday.date()) and (holiday.weekday() < 5)
            for holiday in holidays
        ), axis=1)
    
    return df

def detect_rolling_anomalies(df, date_col, value_col):
    """Flag values deviating more than 3 sigma from the preceding week.

    ``rolling_mean`` and ``rolling_std`` are 7-row rolling statistics that
    include the current row. The anomaly test compares each value against
    the statistics of the *previous* 7 rows. Testing a point against a
    window that contains it cannot work: in a sample of n points no value
    lies more than (n - 1) / sqrt(n) sample standard deviations from the
    mean (about 2.27 for n = 7), so a 3-sigma band would never be exceeded.

    The caller's ``date_col`` is parsed to datetime in place; the returned
    frame is a sorted copy.

    Args:
        df: Input DataFrame.
        date_col: Column parseable by ``pd.to_datetime``; defines the order.
        value_col: Numeric column to monitor.

    Returns:
        DataFrame sorted by date with ``rolling_mean``, ``rolling_std`` and a
        boolean ``anomaly`` column (False while no full baseline exists).

    Raises:
        ValueError: If either column is missing.
    """
    # Validate columns
    missing_columns = [col for col in [date_col, value_col] if col not in df.columns]
    if missing_columns:
        raise ValueError(f"The following columns are not in the DataFrame: {missing_columns}")

    # Parse dates and sort
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(by=date_col)

    # Rolling calculations
    window = df[value_col].rolling(window=7)
    df['rolling_mean'] = window.mean()
    df['rolling_std'] = window.std()

    # Baseline = statistics of the 7 rows before the current one
    baseline_mean = df['rolling_mean'].shift(1)
    baseline_std = df['rolling_std'].shift(1)
    upper = baseline_mean + 3 * baseline_std
    lower = baseline_mean - 3 * baseline_std

    # Detect anomalies (comparisons against a NaN baseline are False)
    df['anomaly'] = (df[value_col] > upper) | (df[value_col] < lower)

    return df

def robust_scale_and_polynomial_transform(df, feature_cols, target_col):
    """Robust-scale features and expand them into degree-3 polynomial terms.

    The feature columns of ``df`` are overwritten in place with their
    ``RobustScaler`` output. The returned frame holds the polynomial features
    (no bias term) plus the target column and shares ``df``'s index, so the
    target stays aligned with its row even when ``df`` does not have a
    default 0..n-1 index.

    Args:
        df: Input DataFrame (its feature columns are rescaled in place).
        feature_cols: List of numeric feature column names.
        target_col: Name of the numeric target column.

    Returns:
        DataFrame of polynomial features and the target column.

    Raises:
        ValueError: If a feature or the target is missing or not numeric.
    """
    # Step 1: Ensure column presence and type checks
    for col in feature_cols:
        if col not in df.columns or not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Column {col} is not present or not numerical")

    if target_col not in df.columns or not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column {target_col} is not present or not numerical")

    # Step 2: Apply robust scaling to the feature columns
    scaler = RobustScaler()
    df[feature_cols] = scaler.fit_transform(df[feature_cols])

    # Step 3: Polynomial feature transformation
    poly = PolynomialFeatures(degree=3, include_bias=False)
    poly_features = poly.fit_transform(df[feature_cols])

    # Reuse df's index: assigning the target aligns on index labels, so a
    # fresh RangeIndex would misplace or blank out target values.
    poly_df = pd.DataFrame(
        poly_features,
        columns=poly.get_feature_names_out(feature_cols),
        index=df.index,
    )
    poly_df[target_col] = df[target_col]

    return poly_df

def impute_groupby_statistic(df, category_col, value_col, method):
    """Fill missing values of a numeric column with its group mean or median.

    Only missing entries of ``value_col`` are replaced; observed values are
    kept. The statistic is computed per ``category_col`` group from the
    non-missing values of that group.

    Args:
        df: Input DataFrame (not modified; a copy is returned).
        category_col: Column defining the groups.
        value_col: Numeric column to impute.
        method: ``"mean"`` or ``"median"``.

    Returns:
        A copy of ``df`` with missing ``value_col`` entries filled.

    Raises:
        ValueError: If a column is missing, ``value_col`` is not numeric, or
            ``method`` is unsupported.
    """
    # Step 1: Validate columns
    if category_col not in df.columns:
        raise ValueError(f"Category column '{category_col}' must be present")
    if value_col not in df.columns or not pd.api.types.is_numeric_dtype(df[value_col]):
        raise ValueError(f"Value column '{value_col}' must be present and numeric")

    # Step 2: Validate method
    if method not in ["mean", "median"]:
        raise ValueError("Method must be either 'mean' or 'median'")

    df = df.copy()

    # Step 3: Compute the statistic for each group, broadcast to every row
    group_statistic = df.groupby(category_col)[value_col].transform(method)

    # Step 4: Fill only the gaps; overwriting the whole column would discard
    # every observed value.
    df[value_col] = df[value_col].fillna(group_statistic)

    return df

def discretize_by_percentile(df, target_col, percentiles):
    """Bin a numeric column using percentile-based edges.

    Edges are the requested percentiles of ``target_col`` computed over its
    non-missing values. The bin number (0-based) is written to
    ``{target_col}_percentile_bin`` on ``df`` in place; rows with a missing
    value get a missing bin.

    Args:
        df: DataFrame to extend.
        target_col: Name of the numeric column to discretise.
        percentiles: Increasing percentiles in [0, 100] used as bin edges.

    Returns:
        Tuple ``(df, bin_edges)`` where ``bin_edges`` is the array of edges.

    Raises:
        ValueError: If the column is missing or not numeric.
    """
    # Step 1: Validate target column
    if target_col not in df.columns or not np.issubdtype(df[target_col].dtype, np.number):
        raise ValueError(f"Target column {target_col} must be present and of numerical type.")

    # Step 2: Calculate percentile-based bin edges. nanpercentile ignores
    # missing values; plain percentile returns all-NaN edges if any is present.
    bin_edges = np.nanpercentile(df[target_col], percentiles)

    # Step 3: Discretize target column into bins
    bin_labels = list(range(len(bin_edges) - 1))
    df[f"{target_col}_percentile_bin"] = pd.cut(
        df[target_col], bins=bin_edges, labels=bin_labels, include_lowest=True
    )

    return df, bin_edges

def rank_within_groups(df, group_by_cols, rank_col):
    """Rank ``rank_col`` within each group and store the result in ``rank``.

    Ties are broken by order of appearance (``method='first'``). ``df`` is
    modified in place.

    Args:
        df: DataFrame to extend.
        group_by_cols: List of columns defining the groups.
        rank_col: Column whose values are ranked.

    Returns:
        The same ``df`` with the added ``rank`` column.

    Raises:
        ValueError: If any referenced column is missing.
    """
    required = group_by_cols + [rank_col]
    missing_columns = [name for name in required if name not in df.columns]
    if missing_columns:
        raise ValueError(f"These columns are not in the dataframe: {missing_columns}")

    values_by_group = df.groupby(group_by_cols)[rank_col]
    df['rank'] = values_by_group.rank(method='first')

    return df

def pivot_and_summarize(df, pivot_columns, value_column, agg_funcs):
    """Aggregate a value per group and merge the aggregates back onto rows.

    Column naming of the aggregates:
      * multi-level pivot columns (e.g. a list of functions) are flattened by
        joining the levels with ``_`` (``('mean', 'val')`` -> ``mean_val``);
      * with a single function given as a string, the column is named
        ``{agg_funcs}_{value}`` to match that scheme;
      * any other single-level column name is kept unchanged.

    Single-level names are never joined character by character.

    Args:
        df: Input DataFrame (not modified).
        pivot_columns: Column name or list of names to group by.
        value_column: Column(s) to aggregate.
        agg_funcs: Aggregation spec accepted by ``DataFrame.pivot_table``.

    Returns:
        ``df`` left-merged with the aggregate columns on ``pivot_columns``.
    """
    pivot_df = df.pivot_table(index=pivot_columns, values=value_column, aggfunc=agg_funcs)

    def flatten(column):
        if isinstance(column, tuple):
            return '_'.join(str(level) for level in column).strip()
        if isinstance(agg_funcs, str):
            return f'{agg_funcs}_{column}'
        return column

    pivot_df.columns = [flatten(column) for column in pivot_df.columns]
    pivot_df = pivot_df.reset_index()

    return pd.merge(df, pivot_df, on=pivot_columns, how='left')

def stratified_sampling(df, sampled_col, stratify_col, train_size):
    """Split ``df`` into train and test parts, stratified on a category.

    Args:
        df: Input DataFrame (not modified).
        sampled_col: Name of a numeric column that must be present.
        stratify_col: Name of a categorical-dtype column used for stratifying.
        train_size: Fraction of rows for the training part, strictly in (0, 1).

    Returns:
        Tuple ``(train_df, test_df)`` from ``train_test_split`` with a fixed
        random state of 42.

    Raises:
        ValueError: If a column is missing or has the wrong dtype, or if
            ``train_size`` is outside (0, 1).
    """
    if sampled_col not in df.columns or not pd.api.types.is_numeric_dtype(df[sampled_col]):
        raise ValueError(f"Column '{sampled_col}' not found or is not numeric in DataFrame")
    # isinstance on the dtype replaces the deprecated is_categorical_dtype().
    if stratify_col not in df.columns or not isinstance(df[stratify_col].dtype, pd.CategoricalDtype):
        raise ValueError(f"Column '{stratify_col}' not found or is not categorical in DataFrame")
    if not (0 < train_size < 1):
        raise ValueError("train_size must be between 0 and 1")

    df = df.copy()

    train_df, test_df = train_test_split(
        df, train_size=train_size, stratify=df[stratify_col], random_state=42
    )

    return train_df, test_df
```

File: time_series_lagging.py:

```Python
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

def merge_with_handling_duplicates(df, primary_key, join_column, lookup_df):
    """Left-join a lookup table, letting lookup values win on name clashes.

    Columns present in both frames (other than the join column) arrive from
    the lookup side with a ``_lookup_dup`` suffix. For each such column the
    original is dropped and the lookup version takes over its name.

    Args:
        df: Left DataFrame (not modified).
        primary_key: Column that must exist in ``df`` (validated only).
        join_column: Column to join on; must exist in both frames.
        lookup_df: Right DataFrame providing the lookup values.

    Returns:
        The merged DataFrame.

    Raises:
        ValueError: If the primary key or join column is missing.
    """
    if primary_key not in df.columns:
        raise ValueError(f"Primary key '{primary_key}' not found in DataFrame")
    if join_column not in df.columns:
        raise ValueError(f"Join column '{join_column}' not found in DataFrame")
    if join_column not in lookup_df.columns:
        raise ValueError(f"Join column '{join_column}' not found in lookup DataFrame")

    marker = '_lookup_dup'
    merged = df.copy().merge(lookup_df, on=join_column, how='left', suffixes=('', marker))

    # Walk a snapshot of the names; the frame is rebuilt as clashes resolve.
    for name in list(merged.columns):
        if marker not in name:
            continue
        base_name = name.replace(marker, '')
        if base_name in merged.columns:
            merged = merged.drop(columns=[base_name]).rename(columns={name: base_name})

    return merged

def flag_high_rolling_sums(df, col, threshold):
    """Attach a rolling sum over the above-threshold rows and flag large sums.

    Rows with ``df[col] > threshold`` are selected, and a rolling sum whose
    window equals the number of selected rows is computed over them. Each sum
    is written back to the row it was computed for (``rolling_sum``); rows
    that were not selected get NaN. ``rolling_sum_flag`` is True where the
    stored sum exceeds ``threshold``. ``df`` is modified in place.

    Values are written back by position within the selection, so the result
    does not depend on the frame's index labels.

    Args:
        df: DataFrame to extend.
        col: Numeric column to examine.
        threshold: Cut-off for both the row selection and the flag.

    Returns:
        The same ``df`` with ``rolling_sum`` and ``rolling_sum_flag``.
    """
    above = df[col] > threshold
    selected = df.loc[above, col]

    df['rolling_sum'] = np.nan
    if len(selected):
        window_size = len(selected)
        rolling_sums = selected.rolling(window=window_size).sum()
        df.loc[above, 'rolling_sum'] = rolling_sums.to_numpy()

    # NaN compares False, so rows without a sum are never flagged.
    df['rolling_sum_flag'] = df['rolling_sum'] > threshold

    return df

def weekly_aggregation_and_rolling_mean(df, date_col, agg_dict):
    """Aggregate to weekly periods and add 4-week rolling means.

    The caller's frame is left untouched: indexing by date happens on a copy,
    so ``date_col`` remains a column of ``df`` and the function can be called
    repeatedly on the same frame.

    Args:
        df: Input DataFrame.
        date_col: Column parseable by ``pd.to_datetime``.
        agg_dict: Mapping of column name to aggregation for the weekly
            resample; a ``{col}_4wk_roll_mean`` column is added per key.

    Returns:
        Weekly DataFrame (indexed by week end) with the aggregates,
        forward-filled, plus the rolling-mean columns.
    """
    # Step 1: Convert date_col to datetime and set as index (on a copy)
    indexed = df.copy()
    indexed[date_col] = pd.to_datetime(indexed[date_col])
    indexed = indexed.set_index(date_col)

    # Step 2: Resample to weekly periods and aggregate
    weekly_df = indexed.resample('W').agg(agg_dict)

    # Step 3: Forward fill missing values
    weekly_df = weekly_df.ffill()

    # Step 4: Calculate rolling mean over a 4-week window
    for col in agg_dict:
        weekly_df[f'{col}_4wk_roll_mean'] = weekly_df[col].rolling(window=4).mean()

    return weekly_df

def standardize_and_encode(df, datetime_cols, category_cols):
    """Express datetime columns in UTC and one-hot encode text columns.

    Time-zone-aware columns are converted to UTC. Time-zone-naive columns
    cannot be converted (``tz_convert`` raises on them), so they are
    interpreted as already being UTC and simply localized.
    NOTE(review): confirm that naive timestamps in callers' data are UTC.

    The datetime columns of ``df`` are overwritten in place; the encoded
    result is a new frame.

    Args:
        df: Input DataFrame.
        datetime_cols: Names of datetime columns.
        category_cols: Names of string columns to one-hot encode.

    Returns:
        DataFrame with UTC datetimes and dummy columns.

    Raises:
        ValueError: If a column is missing or has the wrong dtype.
    """
    # Step 1: Column presence and type checks
    for col in datetime_cols:
        if col not in df.columns or not pd.api.types.is_datetime64_any_dtype(df[col]):
            raise ValueError(f"Column {col} is not present or not datetime")

    for col in category_cols:
        if col not in df.columns or not pd.api.types.is_string_dtype(df[col]):
            raise ValueError(f"Column {col} is not present or not categorical/text")

    # Step 2: Convert datetime columns to UTC
    for col in datetime_cols:
        if df[col].dt.tz is None:
            df[col] = df[col].dt.tz_localize('UTC')
        else:
            df[col] = df[col].dt.tz_convert('UTC')

    # Step 3: One-hot encode categorical columns
    df = pd.get_dummies(df, columns=category_cols)

    return df

def compute_composite_score(df, features, diff_window):
    """Score rows by the sum of standardised lagged differences of features.

    For each feature a ``{feature}_diff_{diff_window}`` column with the
    difference to the value ``diff_window`` rows earlier is added to ``df``
    in place. The differences are standardised column-wise and summed per row
    into ``composite_feature_score`` (missing terms are skipped by the sum).

    Args:
        df: DataFrame to extend.
        features: Names of the feature columns.
        diff_window: Number of rows to look back for the difference.

    Returns:
        The same ``df`` with the difference and score columns.

    Raises:
        ValueError: If any feature column is missing.
    """
    missing_columns = [name for name in features if name not in df.columns]
    if missing_columns:
        raise ValueError(f"The following columns are not in the DataFrame: {missing_columns}")

    diff_cols = [f"{feature}_diff_{diff_window}" for feature in features]
    for feature, diff_name in zip(features, diff_cols):
        df[diff_name] = df[feature].diff(periods=diff_window)

    # Standardize each difference column, then add them up row by row
    diffs = df[diff_cols]
    centered = diffs - diffs.mean()
    df['composite_feature_score'] = (centered / diffs.std()).sum(axis=1)

    return df

def summarize_by_intervals(df, datetime_column, n_splits):
    """Sum numeric columns over quantile-based time intervals.

    The datetime column is parsed in place on ``df``. Rows are then sorted
    and assigned to ``n_splits`` quantile intervals of the timestamp
    (duplicate edges dropped).

    Args:
        df: Input DataFrame.
        datetime_column: Column parseable by ``pd.to_datetime``.
        n_splits: Number of quantile intervals requested.

    Returns:
        DataFrame indexed by interval with the numeric column sums and an
        ``Interval_Size`` column holding the row count per interval.
    """
    df[datetime_column] = pd.to_datetime(df[datetime_column])
    ordered = df.sort_values(by=datetime_column)

    # Step 1: Assign each row to its quantile interval
    ordered['interval'] = pd.qcut(ordered[datetime_column], q=n_splits, duplicates='drop')

    # Step 2: Sums and row counts per interval
    per_interval = ordered.groupby('interval')
    totals = per_interval.sum(numeric_only=True)
    row_counts = per_interval.size()

    # Step 3: Combine both into the result frame
    interval_df = pd.DataFrame(totals)
    interval_df['Interval_Size'] = row_counts.values

    return interval_df

def sales_enrichment(sales_df, cust_df, prod_df):
    """Join customer and product details onto sales and total the sales.

    Args:
        sales_df: Sales rows with ``customer_id``, ``product_id`` and
            ``sales_amount`` columns (not modified).
        cust_df: Customer details keyed by ``customer_id``.
        prod_df: Product details keyed by ``product_id``.

    Returns:
        Tuple of (enriched sales, ``sales_amount`` totals per customer,
        ``sales_amount`` totals per product).

    Raises:
        ValueError: If ``sales_df`` lacks ``customer_id`` or ``product_id``.
    """
    for key in ('customer_id', 'product_id'):
        if key not in sales_df.columns:
            raise ValueError("sales_df must contain 'customer_id' and 'product_id' columns")

    enriched = (
        sales_df.copy()
        .merge(cust_df, on='customer_id', how='left')
        .merge(prod_df, on='product_id', how='left')
    )

    def total_sales_by(key):
        return enriched.groupby(key)['sales_amount'].sum().reset_index()

    return enriched, total_sales_by('customer_id'), total_sales_by('product_id')

def rename_and_clean_columns(df, col_map):
    """Rename columns, drop high-cardinality ones and impute missing values.

    ``df`` is modified in place. Columns with more than 50 distinct values
    are dropped. Remaining numeric columns are filled with their median and
    object columns with their most frequent value; an object column with no
    non-missing value is left as is.

    Args:
        df: DataFrame to clean.
        col_map: Mapping of old to new column names.

    Returns:
        Tuple ``(df, high_card_cols)`` with the cleaned frame and the names of
        the dropped columns.
    """
    # Step 1: Rename columns based on col_map
    df.rename(columns=col_map, inplace=True)

    # Step 2: Detect and drop high cardinality columns
    high_card_cols = [col for col in df.columns if df[col].nunique() > 50]
    df.drop(columns=high_card_cols, inplace=True)

    # Step 3: Impute missing values. The column is assigned back explicitly:
    # ``df[col].fillna(..., inplace=True)`` works on an intermediate object
    # and does not update ``df`` under pandas Copy-on-Write.
    for col in df.select_dtypes(include='number').columns:
        df[col] = df[col].fillna(df[col].median())

    for col in df.select_dtypes(include='object').columns:
        modes = df[col].mode()
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    return df, high_card_cols

def impute_and_scale(df, numerical_features):
    """Median-impute and standard-scale the given numeric columns.

    Args:
        df: Input DataFrame (not modified; a copy is returned).
        numerical_features: List of numeric column names.

    Returns:
        A copy of ``df`` whose listed columns have missing values replaced by
        the column median and are then transformed with ``StandardScaler``.

    Raises:
        ValueError: If a listed column is missing or not numeric.
    """
    # Step 1: Validate columns
    for col in numerical_features:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")
        if not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Column '{col}' must be numeric")

    df = df.copy()

    # Step 2: Impute missing values. Assign the result back: a chained
    # ``df[col].fillna(..., inplace=True)`` does not update ``df`` under
    # pandas Copy-on-Write, which would leave NaNs for the scaler.
    for col in numerical_features:
        df[col] = df[col].fillna(df[col].median())

    # Step 3: Scale features
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

    return df

def apply_operation_to_numeric(df, operation):
    """Apply a named numpy transformation to every numeric column in place.

    Supported operations are ``"log"``, ``"sqrt"`` and ``"square"``. A column
    whose transformation raises is reported on stdout and left unchanged.

    Args:
        df: DataFrame to transform.
        operation: Name of the transformation.

    Returns:
        The same ``df`` with its numeric columns transformed.

    Raises:
        ValueError: If ``operation`` is not supported.
    """
    supported = {
        "log": np.log,
        "sqrt": np.sqrt,
        "square": np.square,
    }

    if operation not in supported:
        raise ValueError(f"Unsupported operation '{operation}'. Supported operations are: {list(supported.keys())}")

    transform = supported[operation]
    numeric_names = df.select_dtypes(include='number').columns.tolist()

    for name in numeric_names:
        try:
            df[name] = df[name].apply(transform)
        except Exception as e:
            # Best effort: keep going with the remaining columns.
            print(f"Could not apply operation '{operation}' on column '{name}'. Reason: {e}")

    return df

def create_lagged_features(df, target_column, lag_periods):
    """Add lagged copies of a numeric column and drop the incomplete rows.

    ``{target_column}_lag_{k}`` columns for ``k = 1..lag_periods`` are added
    to ``df`` in place. Rows where any lag column is missing are then
    dropped. Missing values in unrelated columns do not cause a row to be
    removed.

    Args:
        df: DataFrame to extend.
        target_column: Numeric column to lag.
        lag_periods: Number of lags to create.

    Returns:
        A new, re-indexed DataFrame without the rows lacking lag values.

    Raises:
        ValueError: If the target column is missing or not numeric.
    """
    # Step 1: Validate the target column
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[target_column]):
        raise ValueError(f"Target column '{target_column}' must be numeric")

    # Step 2: Create lagged versions of target_column
    lag_columns = []
    for lag in range(1, lag_periods + 1):
        lag_name = f'{target_column}_lag_{lag}'
        df[lag_name] = df[target_column].shift(lag)
        lag_columns.append(lag_name)

    # Step 3: Drop rows with NaN values due to lagging. Restricting the check
    # to the lag columns keeps rows that merely have gaps elsewhere.
    if lag_columns:
        df = df.dropna(subset=lag_columns)

    return df.reset_index(drop=True)
```

File: timeseries_enhancement.py:

```Python
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

def handle_categorical_and_numerical(df, cat_cols, num_cols):
    """One-hot encode categorical columns and replace numeric outliers.

    A numeric value is an outlier when it lies more than three standard
    deviations from the column mean; outliers are replaced by the column
    median.

    Args:
        df: Input DataFrame (not modified; a copy is returned).
        cat_cols: Names of ``category``-dtype columns to encode.
        num_cols: Names of numeric columns to clean.

    Returns:
        A transformed copy of ``df``.

    Raises:
        ValueError: If a column is missing or has the wrong dtype.
    """
    result = df.copy()

    for name in cat_cols:
        if name not in result.columns or result[name].dtype.name != 'category':
            raise ValueError(f"Column '{name}' must be categorical and exist in DataFrame")

    for name in num_cols:
        if name not in result.columns or not pd.api.types.is_numeric_dtype(result[name]):
            raise ValueError(f"Column '{name}' must be numeric and exist in DataFrame")

    result = pd.get_dummies(result, columns=cat_cols)

    for name in num_cols:
        center = result[name].mean()
        limit = 3 * result[name].std()
        replacement = result[name].median()

        def tame(value, center=center, limit=limit, replacement=replacement):
            if abs(value - center) > limit:
                return replacement
            return value

        result[name] = result[name].apply(tame)

    return result

def row_wise_percent_change(df, target_cols, new_col, offset=1.0):
    """Average the offset percent changes across columns for each row.

    Percent changes are taken from one target column to the next within a
    row, ``offset`` is added, columns that are entirely missing are dropped,
    and the per-row mean is stored in ``new_col`` on ``df`` in place.

    Args:
        df: DataFrame to extend.
        target_cols: Ordered list of numeric column names.
        new_col: Name of the result column.
        offset: Constant added to every percent change.

    Returns:
        The same ``df`` with ``new_col`` added.

    Raises:
        ValueError: If a target column is missing or not numeric.
    """
    missing_columns = [name for name in target_cols if name not in df.columns]
    if missing_columns:
        raise ValueError(f"The following columns are not in the dataframe: {missing_columns}")

    for name in target_cols:
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError("All target columns must be numeric")

    shifted_changes = df[target_cols].pct_change(axis=1) + offset
    usable_changes = shifted_changes.dropna(axis=1, how='all')

    df[new_col] = usable_changes.mean(axis=1)

    return df

def standardize_handle_pca(df, cols, strategies):
    """Standardise columns, fill missing values and append PCA components.

    ``df`` is modified in place (scaled columns, filled values, index reset)
    and a new frame with additional ``pc1``, ``pc2``, ... columns is returned.

    Args:
        df: Input DataFrame.
        cols: List of numeric columns to standardise and feed to PCA.
        strategies: Mapping of column name to ``'mean'``, ``'median'`` or
            ``'mode'``; other strategy names are ignored.

    Returns:
        ``df`` concatenated column-wise with the principal components.
    """
    # Step 1: Standardize columns using z-score normalization
    scaler = StandardScaler()
    df[cols] = scaler.fit_transform(df[cols])

    # Step 2: Handle missing values according to specified strategies.
    # The filled column is assigned back: a chained
    # ``df[col].fillna(..., inplace=True)`` does not update ``df`` under
    # pandas Copy-on-Write, leaving NaNs for PCA.
    for col, strat in strategies.items():
        if strat == 'mean':
            fill_value = df[col].mean()
        elif strat == 'median':
            fill_value = df[col].median()
        elif strat == 'mode':
            modes = df[col].mode()
            if modes.empty:
                # No non-missing value exists, so there is nothing to fill with.
                continue
            fill_value = modes.iloc[0]
        else:
            continue
        df[col] = df[col].fillna(fill_value)

    # Step 3: Perform PCA transformation
    pca = PCA()
    principal_components = pca.fit_transform(df[cols])
    df_pca = pd.DataFrame(
        data=principal_components,
        columns=[f'pc{i+1}' for i in range(principal_components.shape[1])],
    )

    # Align both frames on a fresh 0..n-1 index before concatenating
    df.reset_index(drop=True, inplace=True)
    df_pca.reset_index(drop=True, inplace=True)
    df = pd.concat([df, df_pca], axis=1)

    return df

def temporal_resampling_and_rolling_stats(df, time_col, freq):
    """Resample to a regular frequency and add 7-period rolling statistics.

    Rolling statistics are computed for the numeric columns only.

    * With exactly one numeric column the results are stored as
      ``rolling_mean`` and ``rolling_std``.
    * With several numeric columns one pair per column is stored as
      ``{col}_rolling_mean`` / ``{col}_rolling_std``, because a multi-column
      rolling result cannot be assigned to a single column.

    Args:
        df: Input DataFrame (not modified; a copy is processed).
        time_col: Column parseable by ``pd.to_datetime``.
        freq: Resampling frequency passed to ``DataFrame.resample``.

    Returns:
        Resampled (forward-filled) DataFrame with ``time_col`` restored as a
        column and the rolling statistics added.

    Raises:
        ValueError: If ``time_col`` is missing.
    """
    if time_col not in df.columns:
        raise ValueError(f"Column {time_col} is not in the dataframe.")

    df = df.copy()
    df[time_col] = pd.to_datetime(df[time_col])

    df.set_index(time_col, inplace=True)
    df = df.resample(freq).ffill()

    numeric_cols = list(df.select_dtypes(include='number').columns)
    window = df[numeric_cols].rolling(window=7)
    rolling_mean = window.mean()
    rolling_std = window.std()

    if len(numeric_cols) == 1:
        df['rolling_mean'] = rolling_mean.iloc[:, 0]
        df['rolling_std'] = rolling_std.iloc[:, 0]
    else:
        for col in numeric_cols:
            df[f'{col}_rolling_mean'] = rolling_mean[col]
            df[f'{col}_rolling_std'] = rolling_std[col]

    df.reset_index(inplace=True)

    return df

def remove_duplicates_and_count(df, target_col):
    """Record how often duplicated values occur, then keep first occurrences.

    A ``duplicate_count`` column is added to ``df`` in place: rows whose
    ``target_col`` value occurs more than once get the number of occurrences,
    all other rows get 0. The returned frame keeps only the first row per
    value.

    Args:
        df: DataFrame to process.
        target_col: Column checked for duplicates.

    Returns:
        De-duplicated DataFrame including ``duplicate_count``.
    """
    is_repeated = df.duplicated(subset=target_col, keep=False)
    df['duplicate_count'] = 0

    # Occurrence counts are taken over the repeated values only
    repeated_values = df.loc[is_repeated, target_col]
    occurrences = repeated_values.value_counts()
    df.loc[is_repeated, 'duplicate_count'] = repeated_values.map(occurrences)

    return df.drop_duplicates(subset=target_col, keep='first')

def transaction_features(transactions, id_col, timestamp_col, amount_col):
    """Add calendar features and per-id running sums to transactions.

    The input frame is modified in place for the timestamp parsing and the
    ``day_of_week``, ``month`` and ``hour`` columns. The returned frame is
    sorted by id and timestamp and also carries ``cumulative_sum`` and a
    5-row ``rolling_sum`` of the amount within each id.

    Args:
        transactions: Transaction rows.
        id_col: Column identifying the account/entity.
        timestamp_col: Column parseable by ``pd.to_datetime``.
        amount_col: Numeric amount column.

    Returns:
        Sorted DataFrame with the added feature columns.

    Raises:
        ValueError: If ``amount_col`` is not numeric.
    """
    # Parse timestamps first, then make sure the amounts are usable
    transactions[timestamp_col] = pd.to_datetime(transactions[timestamp_col])

    if not pd.api.types.is_numeric_dtype(transactions[amount_col]):
        raise ValueError(f"Column '{amount_col}' must be numeric")

    # Calendar features
    stamps = transactions[timestamp_col].dt
    transactions['day_of_week'] = stamps.dayofweek
    transactions['month'] = stamps.month
    transactions['hour'] = stamps.hour

    # Per-id running totals, computed in chronological order
    ordered = transactions.sort_values(by=[id_col, timestamp_col])
    ordered['cumulative_sum'] = ordered.groupby(id_col)[amount_col].cumsum()
    windowed = ordered.groupby(id_col)[amount_col].rolling(window=5).sum()
    # Drop the id level so the result aligns on the original row index
    ordered['rolling_sum'] = windowed.reset_index(level=0, drop=True)

    return ordered

def user_action_time_analysis(df, user_col, action_col, time_col):
    """Compute per-user gaps between actions and their running total.

    Rows are sorted by user and time. ``time_diff_seconds`` is the gap to the
    user's previous row (0 for the first row), and
    ``cumulative_time_seconds`` is the running sum of those gaps per user.

    Args:
        df: Input DataFrame (not modified; sorting returns a new frame).
        user_col: String column identifying the user.
        action_col: String column naming the action (validated only).
        time_col: Datetime column with the action time.

    Returns:
        Sorted DataFrame with the two added columns.

    Raises:
        ValueError: If a column is missing or has the wrong dtype.
    """
    type_checks = (
        (user_col, pd.api.types.is_string_dtype, 'string'),
        (action_col, pd.api.types.is_string_dtype, 'string'),
        (time_col, pd.api.types.is_datetime64_any_dtype, 'datetime'),
    )
    for name, has_expected_type, label in type_checks:
        if name not in df.columns or not has_expected_type(df[name]):
            raise ValueError(f"Column '{name}' is not a {label} column in DataFrame")

    ordered = df.sort_values(by=[user_col, time_col])

    gaps = ordered.groupby(user_col)[time_col].diff()
    ordered['time_diff_seconds'] = gaps.dt.total_seconds().fillna(0)
    ordered['cumulative_time_seconds'] = ordered.groupby(user_col)['time_diff_seconds'].cumsum()

    return ordered

def cluster_data(df, target_columns, n_clusters):
    """Min-max scale the target columns and assign K-means cluster labels.

    ``df`` is modified in place: the target columns are replaced by their
    ``MinMaxScaler`` output and a ``cluster`` column with the K-means label
    (``random_state=0``) is added.

    Args:
        df: DataFrame to cluster.
        target_columns: List of numeric columns used as clustering features.
        n_clusters: Number of clusters.

    Returns:
        The same ``df`` with scaled features and the ``cluster`` column.

    Raises:
        ValueError: If a target column is missing or not numeric.
    """
    for name in target_columns:
        if name not in df.columns:
            raise ValueError(f"Target column '{name}' is not found in the DataFrame")
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Target column '{name}' must be numeric")

    # Bring all features onto a common scale before measuring distances
    df[target_columns] = MinMaxScaler().fit_transform(df[target_columns])

    model = KMeans(n_clusters=n_clusters, random_state=0)
    df['cluster'] = model.fit_predict(df[target_columns])

    return df

def validate_foreign_keys(df, primary_key, fk_dictionary):
    """Report and remove rows whose foreign keys have no match.

    For each foreign-key column, values must occur in the ``primary_key``
    column of the associated table. A message is printed per violated column
    and rows violating any constraint are removed.

    Args:
        df: Input DataFrame (not modified).
        primary_key: Column that must exist in ``df``; also the name of the
            key column looked up in every referenced table.
        fk_dictionary: Mapping of foreign-key column name to referenced table.

    Returns:
        DataFrame restricted to rows satisfying every constraint.

    Raises:
        ValueError: If the primary key or a foreign-key column is missing.
    """
    if primary_key not in df.columns:
        raise ValueError(f"Primary key column {primary_key} does not exist in DataFrame")

    for fk_col in fk_dictionary:
        if fk_col not in df.columns:
            raise ValueError(f"Foreign key column {fk_col} does not exist in DataFrame")

    # Evaluate every constraint once against the unfiltered frame
    membership = {
        fk_col: df[fk_col].isin(fk_table[primary_key])
        for fk_col, fk_table in fk_dictionary.items()
    }

    for fk_col, is_valid in membership.items():
        if not is_valid.all():
            print(f"Foreign key constraint violated on column {fk_col}")

    if not membership:
        return df

    # A row survives only if it satisfies all constraints
    keep = pd.Series(True, index=df.index)
    for is_valid in membership.values():
        keep = keep & is_valid

    return df[keep]

def time_series_enhancement(df, date_col, numeric_col):
    """Regularise a daily series and add autocorrelation and seasonal features.

    The caller's frame is left untouched: parsing and indexing happen on a
    copy, so ``date_col`` remains a column of ``df`` and the function can be
    called again on the same frame.

    Steps: index by date, resample to daily frequency, fill gaps in
    ``numeric_col`` by order-2 spline interpolation, add the series'
    autocorrelation at lags 1-7 as constant columns
    (``{numeric_col}_autocorr_lag{k}``), and add sine/cosine encodings of the
    day of year (period 365).

    Args:
        df: Input DataFrame.
        date_col: Column parseable by ``pd.to_datetime``.
        numeric_col: Numeric column to interpolate and analyse.

    Returns:
        New daily-indexed DataFrame with the added features.
    """
    # Step 1: Parse date_col into datetime and set as index (on a copy)
    series_df = df.copy()
    series_df[date_col] = pd.to_datetime(series_df[date_col])
    series_df = series_df.set_index(date_col)

    # Step 2: Resample to daily frequency and interpolate missing values using spline interpolation
    series_df = series_df.resample('D').asfreq()
    series_df[numeric_col] = series_df[numeric_col].interpolate(method='spline', order=2)

    # Step 3: Calculate autocorrelation of numeric_col over lags 1 to 7 days
    for lag in range(1, 8):
        series_df[f'{numeric_col}_autocorr_lag{lag}'] = series_df[numeric_col].autocorr(lag)

    # Step 4: Create a cyclical time feature for day of the year
    day_of_year = series_df.index.dayofyear
    series_df['day_of_year_sin'] = np.sin(2 * np.pi * day_of_year / 365)
    series_df['day_of_year_cos'] = np.cos(2 * np.pi * day_of_year / 365)

    return series_df

def zscore_threshold_summary(df, value_col, threshold):
    """Summarise how many rows exceed a z-score threshold.

    ``z_score`` (sample standard deviation) and a 0/1 ``exceeds_threshold``
    column are added to ``df`` in place.

    Args:
        df: DataFrame to analyse.
        value_col: Numeric column to score.
        threshold: Absolute z-score above which a row counts as exceeding.

    Returns:
        DataFrame with one row per ``exceeds_threshold`` value present and the
        columns ``count`` and ``percentage`` (of all rows).

    Raises:
        ValueError: If ``value_col`` is not numeric.
    """
    values = df[value_col]
    if not pd.api.types.is_numeric_dtype(values):
        raise ValueError(f"Column '{value_col}' must be numeric")

    # Z-score of every row relative to the whole column
    deviation = values - values.mean()
    df['z_score'] = deviation / values.std()

    # 1 where the magnitude is beyond the threshold, else 0
    beyond = df['z_score'].abs() > threshold
    df['exceeds_threshold'] = beyond.astype(int)

    summary = df.groupby('exceeds_threshold').size().reset_index(name='count')
    summary['percentage'] = (summary['count'] / len(df)) * 100

    return summary
```

File: time_series.py:

```Python
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

def adjust_target_with_bias(df, target_col, bias_factor):
    """Add a copy of the target shifted by a multiple of its std deviation.

    The new column ``adjusted_{target_col}`` equals the target plus
    ``bias_factor`` times the column's standard deviation. ``df`` is modified
    in place.

    Args:
        df: DataFrame to extend.
        target_col: Numeric column to adjust.
        bias_factor: Multiplier applied to the standard deviation.

    Returns:
        The same ``df`` with the adjusted column.

    Raises:
        ValueError: If the column is missing or not numeric.
    """
    if target_col not in df.columns:
        raise ValueError(f"Column '{target_col}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column '{target_col}' must be numeric")

    target = df[target_col]
    shift = target.std() * bias_factor
    df[f'adjusted_{target_col}'] = target + shift

    return df

def bin_dataframes_columns(df, bin_edges):
    """Bin several columns using caller-supplied edges and labels.

    For every column named in ``bin_edges`` a ``{col}_bin`` column is added
    to ``df`` in place using ``pd.cut`` with the lowest edge included.

    Args:
        df: DataFrame to extend.
        bin_edges: Mapping of column name to a ``(bins, labels)`` pair.

    Returns:
        The same ``df`` with the bin columns.

    Raises:
        ValueError: If a column named in ``bin_edges`` is missing.
    """
    for name, spec in bin_edges.items():
        if name not in df.columns:
            raise ValueError(f"The column '{name}' specified in bin edges is not present in the dataframe")

        edges, names = spec
        df[f'{name}_bin'] = pd.cut(df[name], bins=edges, labels=names, include_lowest=True)

    return df

def bin_features_and_replace_labels(df, feature_col, label_col):
    """Quartile-bin a feature and replace labels with their bin mean.

    Args:
        df: Input DataFrame (not modified; a copy is returned).
        feature_col: Column to split into four quantile bins.
        label_col: Column whose values are replaced by the mean of the
            label within the row's feature bin.

    Returns:
        Tuple ``(df, bin_means)``: the transformed copy with an added
        ``{feature_col}_bin`` column, and a dict mapping bin number to mean.

    Raises:
        ValueError: If either column is missing.
    """
    if any(name not in df.columns for name in (feature_col, label_col)):
        raise ValueError(f"Columns '{feature_col}' and/or '{label_col}' not found in DataFrame")

    result = df.copy()
    bin_col = f"{feature_col}_bin"

    # Quartile number (0-3) of each row's feature value
    result[bin_col] = pd.qcut(result[feature_col], 4, labels=False)

    # Mean label per quartile, then broadcast back onto the rows
    bin_means = result.groupby(bin_col)[label_col].mean().to_dict()
    result[label_col] = result[bin_col].map(bin_means)

    return result, bin_means

def time_series_interpolation(df, time_col, freq):
    """Regularise a time series, interpolate gaps and add rolling stats.

    The time column of ``df`` is parsed in place. The series is then
    re-indexed to a complete ``freq`` grid, numeric columns are linearly
    interpolated, and 5-period ``{col}_rolling_mean`` / ``{col}_rolling_std``
    columns are added for each numeric column.

    Args:
        df: Input DataFrame.
        time_col: Column parseable by ``pd.to_datetime``.
        freq: Target frequency passed to ``DataFrame.asfreq``.

    Returns:
        New DataFrame with ``time_col`` as a regular column again.
    """
    df[time_col] = pd.to_datetime(df[time_col])
    regular = df.set_index(time_col).asfreq(freq)

    # Fill the gaps opened up by the regular grid
    numeric_cols = regular.select_dtypes(include='number').columns
    regular[numeric_cols] = regular[numeric_cols].interpolate(method='linear')

    for name in numeric_cols:
        window = regular[name].rolling(window=5)
        regular[f"{name}_rolling_mean"] = window.mean()
        regular[f"{name}_rolling_std"] = window.std()

    return regular.reset_index()

def group_proportion_sums(df, group_col, operation_col):
    """Return each category's share of the total of a numeric column.

    Args:
        df: Input DataFrame (not modified).
        group_col: Name of a categorical-dtype column.
        operation_col: Name of the numeric column to sum.

    Returns:
        Dict mapping every category (including ones without rows) to its sum
        divided by the overall sum.

    Raises:
        ValueError: If a column is missing or has the wrong dtype.
    """
    # Step 1: Validate columns (isinstance on the dtype replaces the
    # deprecated is_categorical_dtype helper)
    if group_col not in df.columns or not isinstance(df[group_col].dtype, pd.CategoricalDtype):
        raise ValueError(f"Column {group_col} is not present or not of categorical type")
    if operation_col not in df.columns or not pd.api.types.is_numeric_dtype(df[operation_col]):
        raise ValueError(f"Column {operation_col} is not present or not of numeric type")

    # Step 2: Group by, Sum. observed=False is stated explicitly: it keeps
    # the existing result (unobserved categories sum to 0) and avoids the
    # warning pandas emits when the argument is omitted on a categorical key.
    group_sums = df.groupby(group_col, observed=False)[operation_col].sum()

    # Step 3: Normalize sums to proportions
    total_sum = group_sums.sum()
    group_proportions = group_sums / total_sum

    # Step 4: Return result
    return group_proportions.to_dict()

def hierarchical_aggregation_and_scaling(df, cat_col, agg_dict):
    """Aggregate per group and standard-scale the aggregated columns.

    Multi-level aggregate column names are flattened by joining the levels
    with ``_``. Only the aggregate columns are scaled; the group keys stay in
    the index. This holds for a single grouping column as well as for a list
    of columns, where the result carries the hierarchical (multi-level) group
    index.

    Args:
        df: Input DataFrame (not modified).
        cat_col: Column name or list of names to group by.
        agg_dict: Aggregation spec passed to ``GroupBy.agg``.

    Returns:
        DataFrame of standardised aggregates indexed by group.
    """
    agg_df = df.groupby(cat_col).agg(agg_dict)

    # Flatten before scaling; the keys remain in the index, so no positional
    # slicing is needed to separate them from the values.
    agg_df.columns = [
        '_'.join(col).strip() if isinstance(col, tuple) else col
        for col in agg_df.columns.values
    ]

    scaler = StandardScaler()
    agg_df_scaled = scaler.fit_transform(agg_df)

    standardized_df = pd.DataFrame(agg_df_scaled, columns=agg_df.columns, index=agg_df.index)

    return standardized_df

def preprocess_categorical_and_continuous(df, categorical_cols, continuous_cols):
    """One-hot encode categorical columns and min-max scale continuous ones.

    Args:
        df: Input DataFrame. It is left unmodified.
        categorical_cols: Names of columns to one-hot encode. Each must have
            categorical or object dtype.
        continuous_cols: Names of numeric columns to scale with
            ``MinMaxScaler``. May be empty, in which case no scaling is done.

    Returns:
        A new DataFrame in which ``categorical_cols`` are replaced by their
        ``pd.get_dummies`` indicator columns and ``continuous_cols`` hold the
        scaled values.

    Raises:
        ValueError: If a listed column is missing or has the wrong dtype.
    """
    for col in categorical_cols:
        if col not in df.columns:
            raise ValueError(f"Categorical column '{col}' not found in DataFrame")
        # isinstance(..., pd.CategoricalDtype) replaces the deprecated
        # pd.api.types.is_categorical_dtype.
        is_categorical = isinstance(df[col].dtype, pd.CategoricalDtype)
        if not is_categorical and not pd.api.types.is_object_dtype(df[col]):
            raise ValueError(f"Column '{col}' must be categorical")

    for col in continuous_cols:
        if col not in df.columns:
            raise ValueError(f"Continuous column '{col}' not found in DataFrame")
        if not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Column '{col}' must be numeric")

    df = df.copy()

    # Convert categorical columns to one-hot encoding
    df = pd.get_dummies(df, columns=categorical_cols)

    # Scale continuous columns. With nothing to scale the scaler is skipped:
    # fitting it on a frame with zero columns is an error, not a no-op.
    if len(continuous_cols) > 0:
        scaler = MinMaxScaler()
        df[continuous_cols] = scaler.fit_transform(df[continuous_cols])

    return df

def generate_n_grams(df, text_col, n_grams, min_freq):
    """Count n-grams of length 1..``n_grams`` across a text column.

    Args:
        df: Input DataFrame.
        text_col: Name of the column holding the documents.
        n_grams: Upper bound of the n-gram range (the lower bound is 1).
        min_freq: Value passed to ``CountVectorizer`` as ``min_df``.

    Returns:
        A DataFrame with one row per retained n-gram: column ``n_gram`` holds
        the term and ``frequency`` its count summed over all rows.
    """
    vectorizer = CountVectorizer(min_df=min_freq, ngram_range=(1, n_grams))
    counts = vectorizer.fit_transform(df[text_col])

    # Column sums of the document-term matrix, flattened to 1-D.
    totals = counts.sum(axis=0).A1
    terms = vectorizer.get_feature_names_out()

    return pd.DataFrame({'n_gram': terms, 'frequency': totals})

def replace_with_frequency(df, cat_cols):
    """Add a ``<col>_freq`` column holding each value's occurrence count.

    Despite the name, the original columns are kept; the frequency is written
    to a new column next to them.

    Args:
        df: Input DataFrame. It is left unmodified.
        cat_cols: Names of the columns to frequency-encode.

    Returns:
        A copy of ``df`` with one extra ``<col>_freq`` column per entry of
        ``cat_cols``.
    """
    result = df.copy()

    for name in cat_cols:
        # value -> number of rows carrying that value
        occurrences = result[name].value_counts().to_dict()
        result[name + '_freq'] = result[name].map(occurrences)

    return result

def add_top_n_word_counts(df, text_col, n):
    """Append per-row counts for the vocabulary of a ``CountVectorizer``.

    Args:
        df: Input DataFrame.
        text_col: Name of a string-dtype column holding the documents.
        n: Value passed to ``CountVectorizer`` as ``max_features``.

    Returns:
        A new DataFrame (index reset to 0..len-1) consisting of the original
        columns followed by one count column per vocabulary term.

    Raises:
        ValueError: If ``text_col`` is missing or is not of string dtype.
    """
    if text_col not in df.columns:
        raise ValueError(f"Column {text_col} not found in DataFrame")
    text_values = df[text_col]
    if not pd.api.types.is_string_dtype(text_values):
        raise ValueError(f"Column {text_col} must be of string type")

    vectorizer = CountVectorizer(max_features=n)
    count_matrix = vectorizer.fit_transform(df[text_col])
    vocabulary = vectorizer.get_feature_names_out()

    counts_frame = pd.DataFrame(count_matrix.toarray(), columns=vocabulary)

    # Both sides get a fresh positional index so rows line up one-to-one.
    pieces = [df.reset_index(drop=True), counts_frame.reset_index(drop=True)]
    return pd.concat(pieces, axis=1)

def compute_interactions(df, interaction_columns, method):
    """Add a pairwise interaction column for every pair of listed columns.

    Args:
        df: Input DataFrame. The new columns are written to it in place.
        interaction_columns: Sequence of numeric column names.
        method: ``'sum'`` or ``'multiply'``.

    Returns:
        The same ``df`` object, with a ``<a>_<b>_<method>`` column for each
        pair ``(a, b)`` where ``a`` precedes ``b`` in ``interaction_columns``.

    Raises:
        ValueError: If a column is missing or non-numeric, or if ``method``
            is not supported. Columns are checked before the method.
    """
    # Every column must exist and be numeric before anything is written.
    for name in interaction_columns:
        if name not in df.columns or not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Column '{name}' not found or is not numeric in DataFrame")

    if method not in ['sum', 'multiply']:
        raise ValueError(f"Method '{method}' not supported. Use 'sum' or 'multiply'.")

    # Pair each column with every column that follows it in the list.
    for position, left in enumerate(interaction_columns):
        for right in interaction_columns[position + 1:]:
            if method == 'sum':
                df[f'{left}_{right}_sum'] = df[left] + df[right]
            else:
                df[f'{left}_{right}_multiply'] = df[left] * df[right]

    return df
```

File: category_aggregator.py:

```Python
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

def merge_on_columns(df, aux_df, primary_col, foreign_col):
    """Left-join ``aux_df`` onto ``df`` and label unmatched keys as 'Unknown'.

    Args:
        df: Primary DataFrame (every row is kept).
        aux_df: Auxiliary DataFrame to join in.
        primary_col: Join key column in ``df``.
        foreign_col: Join key column in ``aux_df``.

    Returns:
        A new DataFrame: the left merge of ``df`` and ``aux_df``. Rows of
        ``df`` without a match have ``foreign_col`` set to ``'Unknown'``;
        other auxiliary columns keep their nulls.

    Raises:
        ValueError: If ``primary_col`` is not in ``df`` or ``foreign_col`` is
            not in ``aux_df``.
    """
    if primary_col not in df.columns:
        raise ValueError(f"Column {primary_col} not found in DataFrame")
    if foreign_col not in aux_df.columns:
        raise ValueError(f"Column {foreign_col} not found in auxiliary DataFrame")

    merged_df = df.merge(aux_df, left_on=primary_col, right_on=foreign_col, how='left')

    # Handle null values post-merge. The fill is keyed on the column named by
    # the foreign_col argument (not the literal string 'foreign_col'); a key
    # that is absent from the merged frame is ignored by fillna.
    merged_df = merged_df.fillna(value={foreign_col: 'Unknown'})

    return merged_df

def create_frequency_table(df, categorical_columns):
    """Build a stacked frequency/proportion table for categorical columns.

    Args:
        df: Input DataFrame.
        categorical_columns: Names of categorical-dtype columns to tabulate.

    Returns:
        A DataFrame with the per-column tables stacked vertically. Each
        column's block has its values under a column of the same name, plus
        ``frequency`` (count) and ``proportion`` (count / ``len(df)``).
        An empty DataFrame is returned when ``categorical_columns`` is empty.

    Raises:
        ValueError: If any listed column is missing or not categorical.
    """
    missing_cols = [col for col in categorical_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Columns {missing_cols} not found in DataFrame")
    # isinstance(..., pd.CategoricalDtype) replaces the deprecated
    # pd.api.types.is_categorical_dtype.
    non_categorical_cols = [
        col for col in categorical_columns
        if not isinstance(df[col].dtype, pd.CategoricalDtype)
    ]
    if non_categorical_cols:
        raise ValueError(f"Columns {non_categorical_cols} must be of categorical type")

    n_rows = len(df)
    tables = []
    for col in categorical_columns:
        freq = df[col].value_counts().reset_index()
        # Renaming positionally is independent of the labels that
        # value_counts().reset_index() happens to produce.
        freq.columns = [col, 'frequency']
        freq['proportion'] = freq['frequency'] / n_rows
        tables.append(freq)

    # pd.concat rejects an empty list, so the no-column case is handled here.
    if not tables:
        return pd.DataFrame()

    # One concat at the end instead of re-copying the table on every iteration.
    return pd.concat(tables, axis=0).reset_index(drop=True)

def compute_year_over_year_growth(df, date_col, group_col, value_col):
    """Attach year-over-year growth of monthly group totals to each row.

    For every (year, month, group) the sum of ``value_col`` is compared with
    the sum for the same month and group one year earlier:
    ``(current - previous) / previous * 100``.

    Args:
        df: Input DataFrame. It is left unmodified.
        date_col: Name of the date column (parsed with ``pd.to_datetime``).
        group_col: Name of the grouping column.
        value_col: Name of the numeric column to total per month.

    Returns:
        A new DataFrame with ``date_col`` converted to datetime and the added
        columns ``year``, ``month`` and ``year_over_year_growth``. Growth is
        NaN for rows whose month/group has no data in the preceding year.
    """
    # Work on a copy so the caller's frame does not gain columns as a side effect.
    df = df.copy()

    # Step 1: Extract month and year from date column
    df[date_col] = pd.to_datetime(df[date_col])
    df['year'] = df[date_col].dt.year
    df['month'] = df[date_col].dt.month

    # Step 2: Compute monthly sum of value_col
    keys = ['year', 'month', group_col]
    monthly_sum = df.groupby(keys)[value_col].sum().reset_index()

    # Step 3: Calculate year-over-year growth.
    # Re-label each monthly total with the year it serves as the baseline for
    # (its own year + 1), so that an equality join on year/month/group pairs
    # every total with the total from twelve months earlier.
    previous_col = value_col + '_previous'
    previous = monthly_sum.rename(columns={value_col: previous_col})
    previous['year'] = previous['year'] + 1

    growth = monthly_sum.merge(previous, on=keys)
    growth['year_over_year_growth'] = (
        (growth[value_col] - growth[previous_col]) / growth[previous_col]
    ) * 100

    # Step 4: Clean up result
    growth = growth[keys + ['year_over_year_growth']]
    return df.merge(growth, on=keys, how='left')

def filter_and_aggregate_categories(df, cat_col, agg_col, threshold):
    """Write per-category totals of ``agg_col`` for sufficiently frequent categories.

    Args:
        df: Input DataFrame. The result column is written to it in place.
        cat_col: Name of the category column.
        agg_col: Name of the numeric column to total.
        threshold: A category qualifies when its row count is strictly
            greater than this value.

    Returns:
        The same ``df`` object. Rows of qualifying categories carry their
        category's total in ``<agg_col>_sum``; other rows are not assigned.
        If no category qualifies the column is not created at all.

    Raises:
        ValueError: If either column is missing or ``agg_col`` is not numeric.
    """
    for required in (cat_col, agg_col):
        if required not in df.columns:
            raise ValueError(f"Column '{required}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[agg_col]):
        raise ValueError(f"Column '{agg_col}' must be numeric")

    # Categories occurring more than `threshold` times.
    occurrences = df[cat_col].value_counts()
    frequent = occurrences[occurrences > threshold].index

    in_frequent = df[cat_col].isin(frequent)
    totals = df[in_frequent].groupby(cat_col)[agg_col].sum().reset_index()

    # Broadcast each category total back onto that category's rows.
    sum_col = f'{agg_col}_sum'
    for _, entry in totals.iterrows():
        df.loc[df[cat_col] == entry[cat_col], sum_col] = entry[agg_col]

    return df

def cumulative_sum_post_sort(df, sort_column):
    """Sort by a column and add running totals for every numeric column.

    Args:
        df: Input DataFrame. It is left unmodified.
        sort_column: Column (or columns) passed to ``sort_values(by=...)``.

    Returns:
        A new DataFrame sorted by ``sort_column`` with a fresh 0..n-1 index
        and a ``cumsum_<col>`` column for each numeric column of the input.
    """
    ordered = df.sort_values(by=sort_column).reset_index(drop=True)
    result = ordered.copy()

    # The numeric columns are taken from the sorted input, so the running
    # totals added below are never themselves accumulated.
    for name in ordered.select_dtypes(include='number').columns:
        result[f'cumsum_{name}'] = ordered[name].cumsum()

    return result

def calculate_ewma_ewmsd(df, window_size, numeric_cols):
    """Add exponentially weighted mean and standard deviation columns.

    Args:
        df: Input DataFrame. It is left unmodified.
        window_size: Value used as the ``span`` of the exponential window.
        numeric_cols: Names of numeric columns to process.

    Returns:
        A copy of ``df`` with ``<col>_ewma`` and ``<col>_ewmsd`` added for
        each listed column (both computed with ``adjust=False``).

    Raises:
        ValueError: If a listed column is missing or not numeric.
    """
    # Validate everything up front so no partial result is produced.
    for name in numeric_cols:
        if name not in df.columns or not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Column '{name}' not found or is not numeric in DataFrame")

    result = df.copy()

    for name in numeric_cols:
        weighted = result[name].ewm(span=window_size, adjust=False)
        result[f"{name}_ewma"] = weighted.mean()
        result[f"{name}_ewmsd"] = weighted.std()

    return result

def conditional_encoding(df, category_col, target_col):
    """Frequency-encode a category, standardize the target per code, add dummies.

    Args:
        df: Input DataFrame. It is left unmodified.
        category_col: Name of the category column to encode.
        target_col: Name of the column to standardize within each group.

    Returns:
        A copy of ``df`` with ``<category_col>_encoded`` (occurrence count of
        the row's category), ``target_col`` overwritten by its
        ``StandardScaler`` output computed per encoded value, and one
        indicator column per encoded value (prefixed with ``category_col``).
        Categories that share the same count fall into the same group.
    """
    df = df.copy()
    encoded_col = f'{category_col}_encoded'

    # Encode category column by frequency
    occurrences = df[category_col].value_counts().to_dict()
    df[encoded_col] = df[category_col].map(occurrences)

    # Scale target column within each encoded group; the scaler is re-fitted
    # on every group.
    scaler = StandardScaler()

    def _standardize(group_values):
        column = group_values.values.reshape(-1, 1)
        return scaler.fit_transform(column).flatten()

    df[target_col] = df.groupby(encoded_col)[target_col].transform(_standardize)

    # Create dummy variables for the encoded category column
    indicator_cols = pd.get_dummies(df[encoded_col], prefix=category_col)
    return pd.concat([df, indicator_cols], axis=1)

def equal_width_binning(df, col_to_bin, n_bins):
    """Bin a numeric column into ``n_bins`` equal-width, integer-labelled bins.

    Args:
        df: Input DataFrame. It is left unmodified.
        col_to_bin: Name of the column to bin.
        n_bins: Number of bins passed to ``pd.cut``.

    Returns:
        A copy of ``df`` with ``<col_to_bin>_binned`` holding labels
        ``0 .. n_bins - 1``.
    """
    result = df.copy()

    binned_name = col_to_bin + '_binned'
    labels = range(n_bins)
    result[binned_name] = pd.cut(result[col_to_bin], bins=n_bins, labels=labels)

    return result

def compute_pairwise_correlations(df, col_pairs):
    """Compute the correlation of each requested column pair.

    Args:
        df: Input DataFrame.
        col_pairs: Iterable of ``(col1, col2)`` name pairs. It is traversed
            twice (validation, then computation).

    Returns:
        A single-row DataFrame with one ``<col1>_<col2>_correlation`` column
        per pair, taken from ``DataFrame.corr`` on the two columns.

    Raises:
        ValueError: If a column of any pair is missing or not numeric.
    """
    # All pairs are validated before any correlation is computed.
    for first, second in col_pairs:
        if first not in df.columns or second not in df.columns:
            raise ValueError(f"Columns '{first}' or '{second}' not found in DataFrame")
        both_numeric = (
            pd.api.types.is_numeric_dtype(df[first])
            and pd.api.types.is_numeric_dtype(df[second])
        )
        if not both_numeric:
            raise ValueError(f"Columns '{first}' and '{second}' must be numeric")

    correlations = {}
    for first, second in col_pairs:
        # Off-diagonal entry of the 2x2 correlation matrix.
        pair_matrix = df[[first, second]].corr()
        correlations[f"{first}_{second}_correlation"] = pair_matrix.iloc[0, 1]

    return pd.DataFrame([correlations])

def weighted_sum_of_scaled_cols(df, numerical_cols, new_col_name):
    """Add a variance-weighted sum of min-max scaled columns.

    Each column is scaled to [0, 1]; the scaled columns are then combined
    with weights proportional to the variance of the unscaled columns.

    Args:
        df: Input DataFrame. The result column is written to it in place.
        numerical_cols: List of numeric column names to combine.
        new_col_name: Name of the column receiving the weighted sum.

    Returns:
        The same ``df`` object with ``new_col_name`` added. If the variances
        sum to zero (e.g. every column is constant) the weights are undefined
        and the result is NaN.

    Raises:
        ValueError: If a listed column is missing or not numeric.
    """
    # Step 1: Validate existence and type of numerical columns
    for col in numerical_cols:
        if col not in df.columns or not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Column {col} is not present or not of numeric type")

    # Step 2: Min-max scale each numerical column to range [0, 1]
    values = df[numerical_cols]
    col_min = values.min()
    col_range = values.max() - col_min
    # A constant column has zero range. Dividing by 1 instead maps it to 0
    # rather than 0/0 = NaN, which would otherwise turn the whole weighted sum
    # into NaN even though that column's weight (its variance) is 0.
    safe_range = col_range.where(col_range != 0, 1)
    scaled_cols = (values - col_min) / safe_range

    # Step 3: Compute a weighted sum of scaled values based on variances as weights
    variances = values.var()
    weights = variances / variances.sum()
    weighted_sum = scaled_cols.dot(weights)

    # Step 4: Return the dataframe with the new weighted sum column
    df[new_col_name] = weighted_sum
    return df

def flag_time_gaps(df, datetime_col, interval):
    """Flag rows whose gap to the preceding row exceeds an interval.

    Rows are compared in their existing order; no sorting is performed.

    Args:
        df: Input DataFrame. It is modified in place.
        datetime_col: Name of the timestamp column (parsed with
            ``pd.to_datetime`` and written back).
        interval: Anything ``pd.Timedelta`` accepts (e.g. ``'60s'``).

    Returns:
        The same ``df`` object with ``time_diff`` (seconds since the previous
        row, NaN for the first row) and ``gap_flag`` (True where ``time_diff``
        is strictly greater than the interval).

    Raises:
        ValueError: If ``datetime_col`` is not a column of ``df``.
    """
    if datetime_col not in df.columns:
        raise ValueError(f"Column {datetime_col} is not present in DataFrame")

    # Normalize the column to datetime and keep the converted values.
    df[datetime_col] = pd.to_datetime(df[datetime_col])

    # Seconds elapsed since the previous row.
    row_deltas = df[datetime_col].diff()
    df['time_diff'] = row_deltas.dt.total_seconds()

    # A NaN difference (first row) compares as False.
    limit_seconds = pd.Timedelta(interval).total_seconds()
    df['gap_flag'] = df['time_diff'] > limit_seconds

    return df
```

--------------------------------------------------------------------------------------------------

Problem Statement: Parameters:
- df: pandas.DataFrame # The input DataFrame containing various data for processing.
- window_size: int # The size of the rolling window for calculations.
- target_col: str # The name of the target column for normalization and lagging.
- threshold: float # The threshold value used in multiple operations.
- cat_col: str # The name of the category column for aggregation based on frequency.
- agg_col: str # The name of the column to be aggregated.

Objectives:
- Validate the presence and type of `target_col` to ensure it is numeric, raising an error if not.
- Normalize `target_col` using its minimum and range, creating a new normalized column named `target_col_normalized`.
- Create a binary column, `target_col_binary`, that indicates whether the normalized value exceeds the specified threshold.
- Call a function to compute rolling window features on `target_col` using the specified `window_size`, storing these values as new columns.
- Identify unique categories in `cat_col` that appear more frequently than the threshold, storing them in a list called `valid_categories`.
- If there are any valid categories, aggregate `agg_col` for categories present in `valid_categories`, appending the aggregated values back into the original DataFrame as new columns with the suffix `_sum`.
- If no valid categories are found, default the aggregated sum to 0 for non-valid categories, ensuring all operations are reflected in the modified DataFrame.

Return Values:
- df: pandas.DataFrame # The modified DataFrame containing the added columns from normalization, binary creation, rolling features, and aggregation.

The name of the function you create should be complex_data_processing

--------------------------------------------------------------------------------------------------

Answer Code:
def complex_data_processing(df, window_size, target_col, threshold, cat_col, agg_col):
    """Normalize/threshold a target column, add rolling features, aggregate by category.

    Args:
        df: Input DataFrame.
        window_size: Window size forwarded to ``rolling_window_features``.
        target_col: Name of the numeric target column.
        threshold: Value forwarded to ``normalize_and_threshold_binary`` and
            also used as the minimum (exclusive) row count for a category to
            be considered valid.
        cat_col: Name of the category column.
        agg_col: Name of the column to aggregate.

    Returns:
        The processed DataFrame. When at least one category occurs more than
        ``threshold`` times the result of ``filter_and_aggregate_categories``
        is returned; otherwise ``<agg_col>_sum`` is set to 0 for every row.

    Raises:
        ValueError: If ``target_col`` is missing or not numeric.
    """
    # Validate target_col presence and type
    if target_col not in df.columns:
        raise ValueError(f"Column '{target_col}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column '{target_col}' must be numeric")

    # Normalization + binary flag, then rolling window features
    df = normalize_and_threshold_binary(df, target_col, threshold)
    df = rolling_window_features(df, window_size, target_col)

    # Categories that occur more often than the threshold
    occurrences = df[cat_col].value_counts()
    valid_categories = occurrences[occurrences > threshold].index.tolist()

    # No valid category: default the aggregated sum to 0
    if not valid_categories:
        df[f'{agg_col}_sum'] = 0
        return df

    # Otherwise aggregate agg_col for the valid categories
    return filter_and_aggregate_categories(df, cat_col, agg_col, threshold)

--------------------------------------------------------------------------------------------------

Test Code:
# Import necessary libraries
import pandas as pd
import numpy as np

# Import function from file
from feature_enhancement import complex_data_processing

# Sample data for DataFrame
data = {
    'A': [1, 1, 2, 2, 3, 3, 4, 4],  # Category column: every category occurs exactly twice
    'B': [10, 20, 30, 40, 50, 60, 70, 80],  # Numeric column to be aggregated
    'C': [1, 2, 3, 4, 5, 6, 7, 8]  # Numeric target column (normalized, thresholded, rolled)
}

df = pd.DataFrame(data)

# Initialize input parameters
window_size = 3
target_col = 'C'
threshold = 2  # Used both for the binary flag and for the category-frequency filter
cat_col = 'A'
agg_col = 'B'

# Call function with input parameters
# NOTE(review): the same `df` object is reused below to rebuild the expected result. If
# complex_data_processing (or a helper it calls) mutates its input in place, the steps
# below operate on already-modified data -- confirm against the helper implementations.
return_df = complex_data_processing(df, window_size, target_col, threshold, cat_col, agg_col)

# Step-by-step run-through of function to obtain intermediate outputs:

# Step 1
# Explanation: Validate target_col presence and type
if target_col not in df.columns:
    raise ValueError(f"Column '{target_col}' not found in DataFrame")
if not pd.api.types.is_numeric_dtype(df[target_col]):
    raise ValueError(f"Column '{target_col}' must be numeric")

# Step 2
# Explanation: Min-max normalize target_col, then flag normalized values above the threshold.
# Normalized values lie in [0, 1], so with threshold = 2 the binary column is False everywhere.
min_val = df[target_col].min()
range_val = df[target_col].max() - df[target_col].min()
df[target_col + '_normalized'] = (df[target_col] - min_val) / range_val
df[target_col + '_binary'] = df[target_col + '_normalized'] > threshold

# Step 3
# Explanation: Create rolling window features (mean, std, sum) over target_col.
# min_periods=1 yields values from the first row on; the std of a single observation is
# NaN and is replaced by 0.
# Presumably this mirrors rolling_window_features, which is not shown here -- verify.
rolling_window = df[target_col].rolling(window=window_size, min_periods=1)
df[f'{target_col}_rolling_mean'] = rolling_window.mean()
df[f'{target_col}_rolling_std'] = rolling_window.std().fillna(0)
df[f'{target_col}_rolling_sum'] = rolling_window.sum()
# Overwrite target_col with its value from window_size rows earlier; the first
# window_size rows become NaN.
df[target_col] = df[target_col].shift(window_size)

# Step 4
# Explanation: Identify categories that occur strictly more than `threshold` times.
# With the sample data each category occurs twice and threshold is 2, so the list is empty.
category_counts = df[cat_col].value_counts()
valid_categories = category_counts[category_counts > threshold].index.tolist()

# Step 5
# Explanation: Aggregate agg_col for valid categories and write each total back onto
# the rows of its category
if valid_categories:
    agg_values = df[df[cat_col].isin(valid_categories)].groupby(cat_col)[agg_col].sum().reset_index()
    for _, row in agg_values.iterrows():
        df.loc[df[cat_col] == row[cat_col], f'{agg_col}_sum'] = row[agg_col]
else:
    # Step 6: No valid categories -- default the aggregated sum to 0 for every row
    df[f'{agg_col}_sum'] = 0

# Final Expected Output
correct_df = df

# Assert statements (compulsory) to check if the function returns the correct values:
# DataFrame.equals compares values and dtypes, so column dtypes must match as well.
assert return_df.equals(correct_df)

print('All-Pass')

--------------------------------------------------------------------------------------------------

