Question No: 8
Context Size: 18225

Codebase:

File: outlier_handler.py:

```Python
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import f_classif
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MinMaxScaler

def resample_and_aggregate(df, timestamp_column, freq):
    """Resample ``df`` on a datetime column and aggregate each time bucket.

    Args:
        df: Input DataFrame.
        timestamp_column: Name of a datetime-typed column used as the time axis.
        freq: Frequency passed straight to ``DataFrame.resample``.

    Returns:
        DataFrame indexed by the resampled timestamps whose columns are a
        two-level index of (original column, 'sum' | 'mean' | 'max').

    Raises:
        ValueError: If ``timestamp_column`` is missing or not datetime-typed.
    """
    column_is_usable = (
        timestamp_column in df.columns
        and pd.api.types.is_datetime64_any_dtype(df[timestamp_column])
    )
    if not column_is_usable:
        raise ValueError("'timestamp_column' must be present and in datetime format")

    aggregations = ['sum', 'mean', 'max']
    return df.set_index(timestamp_column).resample(freq).agg(aggregations)

def shift_columns(df, cols_to_shift, periods, fill_value):
    """Shift the given columns by ``periods`` rows and optionally fill the gaps.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to modify.
        cols_to_shift: Iterable of column names to shift.
        periods: Number of rows to shift by (passed to ``Series.shift``).
        fill_value: Replacement for every missing value left in a shifted
            column (including ones that were already missing); ``None``
            leaves them as NaN.

    Returns:
        The same DataFrame object, with the requested columns shifted.

    Raises:
        ValueError: If any requested column is absent. Nothing is modified
            in that case because validation happens before any shift.
    """
    for col in cols_to_shift:
        if col not in df.columns:
            raise ValueError(f"Column {col} not found in DataFrame")

    for col in cols_to_shift:
        shifted = df[col].shift(periods)
        if fill_value is not None:
            # Assign the filled result back: an in-place fillna on `df[col]`
            # acts on a temporary and is silently lost under Copy-on-Write.
            shifted = shifted.fillna(fill_value)
        df[col] = shifted

    return df

def anova_f_score(df, categorical_col, target_col):
    """Compute an ``f_classif`` score for every level of a string column.

    The categorical column is one-hot encoded and the indicator columns are
    passed to ``f_classif`` together with ``target_col``.

    Args:
        df: Input DataFrame.
        categorical_col: Name of a string-typed column to encode.
        target_col: Name of a numeric column passed as the ``f_classif`` target.

    Returns:
        DataFrame with columns 'Variable' (indicator column name) and
        'F-Score'.

    Raises:
        ValueError: If either column is missing, ``categorical_col`` is not
            string-typed, or ``target_col`` is not numeric.
    """
    for required in (categorical_col, target_col):
        if required not in df.columns:
            raise ValueError(f"Column {required} not found in DataFrame")

    string_check = pd.api.types.is_string_dtype
    numeric_check = pd.api.types.is_numeric_dtype
    if not string_check(df[categorical_col]):
        raise ValueError(f"Column {categorical_col} must be of string type")
    if not numeric_check(df[target_col]):
        raise ValueError(f"Column {target_col} must be of numeric type")

    encoder = OneHotEncoder()
    dense_indicators = encoder.fit_transform(df[[categorical_col]]).toarray()
    indicator_names = encoder.get_feature_names_out([categorical_col])
    indicators = pd.DataFrame(dense_indicators, columns=indicator_names)

    # f_classif returns (F statistics, p-values); only the former is reported.
    scores = f_classif(indicators, df[target_col])[0]

    return pd.DataFrame({'Variable': indicators.columns, 'F-Score': scores})

def normalize_skewed_columns(df, columns):
    """Report skewness/kurtosis and Box-Cox transform highly skewed columns.

    A column is transformed in place when the absolute value of its skewness
    exceeds 1. Missing values stay missing and keep their row positions.

    Args:
        df: DataFrame to inspect; transformed columns are overwritten.
        columns: Iterable of column names to process.

    Returns:
        Tuple ``(df, skewness_vals, kurtosis_vals)`` where the two lists hold
        ``(column, statistic)`` pairs measured before any transformation.

    Raises:
        ValueError: If a requested column is absent from ``df``.
    """
    skewness_vals = []
    kurtosis_vals = []

    for col in columns:
        if col not in df.columns:
            raise ValueError(f"Column {col} is not in the dataframe")

        skewness = df[col].skew()
        kurtosis = df[col].kurtosis()

        skewness_vals.append((col, skewness))
        kurtosis_vals.append((col, kurtosis))

        if abs(skewness) > 1:
            as_float = df[col].astype(float).to_numpy()
            present = ~np.isnan(as_float)
            observed = as_float[present]

            # Box-Cox needs strictly positive input. Keep the historical "+1"
            # shift whenever it is sufficient, otherwise shift so that the
            # smallest observation maps to 1.
            smallest = observed.min()
            shift = 1.0 if smallest > -1.0 else 1.0 - smallest

            boxcox_values, _ = stats.boxcox(observed + shift)

            # Write results back by position so rows with missing values stay
            # NaN instead of causing a length mismatch on assignment.
            transformed = np.full(len(as_float), np.nan)
            transformed[present] = boxcox_values
            df[col] = transformed

    return df, skewness_vals, kurtosis_vals

def normalize_column_with_null_handling(df, column, new_range, null_strategy):
    """Handle missing values in ``column`` and rescale it to ``new_range``.

    The scaling bounds are the column's min and max measured before any
    null handling. A ``{column}_missing`` indicator column (1 where the value
    was missing) is added. For 'mean' and 'median' the passed frame is
    modified in place; for 'drop' a new frame without the missing rows is
    returned and the input is left untouched.

    Args:
        df: Input DataFrame.
        column: Name of the numeric column to rescale.
        new_range: ``(new_min, new_max)`` target interval.
        null_strategy: 'mean', 'median' or 'drop'.

    Returns:
        DataFrame with the rescaled column and the indicator column.

    Raises:
        ValueError: If ``null_strategy`` is not one of the supported values.
    """
    min_val, max_val = df[column].min(), df[column].max()
    new_min, new_max = new_range

    if null_strategy == "mean":
        fill_value = df[column].mean()
    elif null_strategy == "median":
        fill_value = df[column].median()
    elif null_strategy == "drop":
        # Copy so the column assignments below write to an independent frame
        # instead of to a slice of the caller's data.
        df = df.dropna(subset=[column]).copy()
        fill_value = None
    else:
        raise ValueError("Invalid null_strategy. Use 'mean', 'median', or 'drop'.")

    df[f'{column}_missing'] = df[column].isna().astype(int)

    values = df[column]
    if fill_value is not None:
        # Reassign instead of a chained in-place fillna, which acts on a
        # temporary and is lost under Copy-on-Write.
        values = values.fillna(fill_value)

    value_span = max_val - min_val
    if value_span == 0:
        # Constant column: map every observed value to the lower bound
        # rather than producing 0/0 = NaN.
        unit_scaled = (values - min_val) * 0.0
    else:
        unit_scaled = (values - min_val) / value_span

    df[column] = unit_scaled * (new_max - new_min) + new_min

    return df

def identify_high_variance_categorical_columns(df, categorical_columns, target_column, variance_threshold):
    """List categorical columns with at least one high-variance level.

    For each categorical column the variance of ``target_column`` is computed
    per level; the column is selected when the largest of those variances
    exceeds ``variance_threshold``.

    Args:
        df: Input DataFrame.
        categorical_columns: List of grouping column names.
        target_column: Name of the column whose variance is measured.
        variance_threshold: Strict lower bound for selection.

    Returns:
        List of selected column names, in the order given.

    Raises:
        ValueError: If any named column is absent from ``df``.
    """
    required = categorical_columns + [target_column]
    if any(name not in df.columns for name in required):
        raise ValueError("One or more specified columns are not present in the dataframe")

    return [
        name
        for name in categorical_columns
        if df.groupby(name)[target_column].var().max() > variance_threshold
    ]

def merge_with_handling_duplicates(df, primary_key, join_column, lookup_df):
    """Left-join ``lookup_df`` onto ``df``, letting lookup values win on clashes.

    Columns present in both frames (other than the join column) are taken
    from ``lookup_df``: the clashing lookup column is merged under a
    temporary ``_lookup_dup`` suffix, the original column is dropped, and the
    suffixed column is renamed back.

    Args:
        df: Left DataFrame.
        primary_key: Column that must exist in ``df`` (only validated).
        join_column: Column present in both frames to join on.
        lookup_df: Right DataFrame supplying lookup values.

    Returns:
        New merged DataFrame; the inputs are not modified.

    Raises:
        ValueError: If ``primary_key`` or ``join_column`` is missing from
            ``df``, or ``join_column`` is missing from ``lookup_df``.
    """
    if primary_key not in df.columns:
        raise ValueError(f"Primary key '{primary_key}' not found in DataFrame")
    if join_column not in df.columns:
        raise ValueError(f"Join column '{join_column}' not found in DataFrame")
    if join_column not in lookup_df.columns:
        raise ValueError(f"Join column '{join_column}' not found in lookup DataFrame")

    marker = '_lookup_dup'
    merged = df.copy().merge(lookup_df, on=join_column, how='left', suffixes=('', marker))

    # Snapshot the suffixed names first; the frame is rebuilt on every swap.
    suffixed_names = [name for name in merged.columns if marker in name]
    for suffixed in suffixed_names:
        base_name = suffixed.replace(marker, '')
        if base_name in merged.columns:
            merged = merged.drop(columns=[base_name]).rename(columns={suffixed: base_name})

    return merged

def outlier_detection_and_correction(df, value_col):
    """Replace IQR outliers in ``value_col`` with the column median.

    Values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` are treated
    as outliers. The median used for replacement is computed from the
    original column, outliers included. The input frame is not modified.

    Args:
        df: Input DataFrame.
        value_col: Name of the column to clean.

    Returns:
        Tuple ``(cleaned_df, outlier_count)``.

    Raises:
        ValueError: If ``value_col`` is absent from ``df``.
    """
    if value_col not in df.columns:
        raise ValueError(f"Column '{value_col}' not found in the DataFrame")

    values = df[value_col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Keep the flag as a standalone Series: storing it as a temporary
    # 'is_outlier' column used to leave that column (and the overwritten
    # values) behind on the caller's frame.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    median_value = values.median()

    cleaned = df.copy()
    cleaned[value_col] = values.mask(is_outlier, median_value)

    return cleaned, is_outlier.sum()

def scale_columns(df, columns_to_scale, scale_type):
    """Return a copy of ``df`` with the given columns rescaled.

    Args:
        df: Input DataFrame (left unmodified).
        columns_to_scale: Column names to scale.
        scale_type: 'minmax' for ``MinMaxScaler`` or 'standard' for
            ``StandardScaler``.

    Returns:
        Copy of ``df`` in which ``columns_to_scale`` hold the scaler output.

    Raises:
        ValueError: If ``scale_type`` is neither 'minmax' nor 'standard'.
    """
    scaled_frame = df.copy()

    if scale_type not in ('minmax', 'standard'):
        raise ValueError("Scale type must be either 'minmax' or 'standard'")

    scaler = MinMaxScaler() if scale_type == 'minmax' else StandardScaler()
    scaled_frame[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

    return scaled_frame

def compute_top_value_frequencies(df, col_threshold):
    """Summarise the five most common values of high-cardinality columns.

    Only columns with more than ``col_threshold`` distinct values are
    reported. Shares are normalised by the total count of the reported top
    values, so they sum to 1 within each column.

    Args:
        df: Input DataFrame.
        col_threshold: Minimum (exclusive) number of distinct values.

    Returns:
        Dict mapping column name to a ``{value: share}`` dict.
    """
    def _top_five_shares(series):
        # Counts of the (up to) five most frequent values, rescaled to sum to 1.
        leading_counts = series.value_counts().head(5)
        return (leading_counts / leading_counts.sum()).to_dict()

    return {
        name: _top_five_shares(df[name])
        for name in df.columns
        if df[name].nunique() > col_threshold
    }

def categorical_interaction_means(df, categorical_cols, numerical_col):
    """Mean of a numeric column per combination of categorical columns.

    Args:
        df: Input DataFrame.
        categorical_cols: Names of columns with pandas ``category`` dtype.
        numerical_col: Name of the numeric column to average.

    Returns:
        DataFrame with one row per category combination, holding the grouping
        columns, ``{numerical_col}_mean`` and its Min-Max scaled version
        ``{numerical_col}_mean_normalized``.

    Raises:
        ValueError: If a categorical column is missing or not of ``category``
            dtype, or ``numerical_col`` is missing or not numeric.
    """
    for col in categorical_cols:
        # isinstance on the dtype replaces the deprecated
        # pd.api.types.is_categorical_dtype.
        if col not in df.columns or not isinstance(df[col].dtype, pd.CategoricalDtype):
            raise ValueError(f"Column {col} is not present or not categorical")

    if numerical_col not in df.columns or not pd.api.types.is_numeric_dtype(df[numerical_col]):
        raise ValueError(f"Column {numerical_col} is not present or not numerical")

    mean_col = f"{numerical_col}_mean"
    # observed=False pins the behaviour this function has had so far (all
    # category combinations are listed) instead of relying on the library
    # default for categorical groupers.
    grouped_means = (
        df.groupby(categorical_cols, observed=False)[numerical_col]
        .mean()
        .reset_index()
        .rename(columns={numerical_col: mean_col})
    )

    scaler = MinMaxScaler()
    grouped_means[f"{mean_col}_normalized"] = scaler.fit_transform(grouped_means[[mean_col]])

    return grouped_means
```

File: missing_values.py:

```Python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
from collections import Counter
from sklearn.preprocessing import StandardScaler

def encode_and_merge_categorical(df, categorical_threshold):
    """One-hot encode object columns whose distinct-value ratio is high.

    An object-dtype column is encoded when
    ``nunique / len(df) > categorical_threshold``. Encoded columns are
    removed and their dummy columns appended on the right.

    Args:
        df: Input DataFrame (left unmodified).
        categorical_threshold: Strict lower bound on the distinct-value ratio.

    Returns:
        New DataFrame with the selected columns replaced by dummy columns.
    """
    row_count = len(df)
    object_columns = df.select_dtypes(include='object').columns
    selected = [
        name
        for name in object_columns
        if df[name].nunique() / row_count > categorical_threshold
    ]

    dummies = pd.get_dummies(df[selected])
    remainder = df.drop(columns=selected)

    return pd.concat([remainder, dummies], axis=1)

def bin_dataframes_columns(df, bin_edges):
    """Add a ``{col}_bin`` column for every entry of ``bin_edges``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to extend.
        bin_edges: Mapping of column name to a ``(bins, labels)`` pair passed
            to ``pd.cut`` (with ``include_lowest=True``).

    Returns:
        The same DataFrame object with the new binned columns.

    Raises:
        ValueError: If a key of ``bin_edges`` is not a column of ``df``.
    """
    for name, spec in bin_edges.items():
        if name not in df.columns:
            raise ValueError(f"The column '{name}' specified in bin edges is not present in the dataframe")

        edges, names = spec
        df[f'{name}_bin'] = pd.cut(df[name], bins=edges, labels=names, include_lowest=True)

    return df

def detect_rolling_anomalies(df, date_col, value_col, window=7, n_std=3):
    """Flag values that deviate strongly from the preceding rolling window.

    Rows are ordered by ``date_col``. For each row, ``rolling_mean`` and
    ``rolling_std`` describe the ``window`` observations immediately before
    it, and ``anomaly`` is True when the value lies more than ``n_std``
    standard deviations from that mean. Rows without a full preceding window
    are never flagged.

    The tested point is deliberately excluded from its own baseline: inside
    a window of n points that contains it, a single value can be at most
    (n - 1) / sqrt(n) sample standard deviations from the window mean (about
    2.27 for n = 7), so a 3-sigma rule on such a window can never fire.

    Args:
        df: Input DataFrame (left unmodified).
        date_col: Column parseable by ``pd.to_datetime``.
        value_col: Numeric column to monitor.
        window: Number of preceding observations forming the baseline.
        n_std: Number of standard deviations that defines an anomaly.

    Returns:
        New DataFrame sorted by date with ``rolling_mean``, ``rolling_std``
        and boolean ``anomaly`` columns added.

    Raises:
        ValueError: If ``date_col`` or ``value_col`` is absent from ``df``.
    """
    missing_columns = [col for col in [date_col, value_col] if col not in df.columns]
    if missing_columns:
        raise ValueError(f"The following columns are not in the DataFrame: {missing_columns}")

    # Work on a copy so the caller's date column is not rewritten.
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(by=date_col)

    # shift(1) moves the window back by one row so it ends just before the
    # observation being tested.
    baseline = df[value_col].shift(1).rolling(window=window)
    df['rolling_mean'] = baseline.mean()
    df['rolling_std'] = baseline.std()

    # NaN statistics compare as False, so warm-up rows are not flagged.
    deviation = (df[value_col] - df['rolling_mean']).abs()
    df['anomaly'] = deviation > n_std * df['rolling_std']

    return df

def stratified_sampling(df, sampled_col, stratify_col, train_size):
    """Split ``df`` into train/test parts stratified on a categorical column.

    Args:
        df: Input DataFrame (left unmodified).
        sampled_col: Name of a numeric column that must be present.
        stratify_col: Name of a ``category``-dtype column to stratify on.
        train_size: Fraction of rows for the training part, strictly between
            0 and 1.

    Returns:
        Tuple ``(train_df, test_df)`` produced by ``train_test_split`` with
        ``random_state=42``.

    Raises:
        ValueError: If a column is missing or has the wrong dtype, or
            ``train_size`` is outside (0, 1).
    """
    if sampled_col not in df.columns or not pd.api.types.is_numeric_dtype(df[sampled_col]):
        raise ValueError(f"Column '{sampled_col}' not found or is not numeric in DataFrame")
    # isinstance on the dtype replaces the deprecated
    # pd.api.types.is_categorical_dtype.
    if stratify_col not in df.columns or not isinstance(df[stratify_col].dtype, pd.CategoricalDtype):
        raise ValueError(f"Column '{stratify_col}' not found or is not categorical in DataFrame")
    if not (0 < train_size < 1):
        raise ValueError("train_size must be between 0 and 1")

    df = df.copy()

    train_df, test_df = train_test_split(df, train_size=train_size, stratify=df[stratify_col], random_state=42)

    return train_df, test_df

def feature_selection_and_model_training(df, feature_cols, target_col):
    """Select features with RFECV and fit a linear regression on them.

    Args:
        df: Input DataFrame.
        feature_cols: List of numeric candidate feature columns.
        target_col: Name of the numeric target column.

    Returns:
        Tuple ``(model, selected_features)``: the ``LinearRegression`` fitted
        on the selected features, and the names of those features.

    Raises:
        ValueError: If any feature or the target is missing or not numeric.
    """
    def _is_numeric_column(name):
        # Presence is checked first so a missing name never reaches df[name].
        return name in df.columns and pd.api.types.is_numeric_dtype(df[name])

    for name in feature_cols + [target_col]:
        if not _is_numeric_column(name):
            raise ValueError(f"Column {name} is not present or not numerical")

    features, target = df[feature_cols], df[target_col]

    regressor = LinearRegression()
    selector = RFECV(estimator=regressor, step=1, cv=5, scoring='neg_mean_squared_error')
    selector.fit(features, target)

    chosen = [name for name, keep in zip(feature_cols, selector.support_) if keep]

    # Final fit on the surviving features only.
    regressor.fit(features[chosen], target)

    return regressor, chosen

def filter_outliers_iqr(df, threshold):
    """Blank IQR outliers in numeric columns and drop rows that got too sparse.

    In every numeric column, values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``
    are replaced by NaN. Rows whose fraction of missing cells then exceeds
    ``threshold`` are removed. The input frame is not modified.

    Args:
        df: Input DataFrame.
        threshold: Maximum allowed fraction of missing cells per row.

    Returns:
        New DataFrame with outliers blanked and sparse rows removed.
    """
    # Copy first: assigning the masked columns onto the caller's frame would
    # silently blank its outliers while returning a different, filtered frame.
    df = df.copy()
    numeric_columns = df.select_dtypes(include=[np.number]).columns

    for col in numeric_columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        out_of_range = (df[col] < lower_bound) | (df[col] > upper_bound)
        df[col] = df[col].mask(out_of_range, np.nan)

    return df[df.isnull().mean(axis=1) <= threshold]

def rolling_avg_multi_patterns(df, col_patterns):
    """Add 3-, 5- and 10-row rolling means for columns matching any pattern.

    A column matches when any entry of ``col_patterns`` is a substring of its
    name. The frame is modified in place and also returned.

    Args:
        df: DataFrame to extend.
        col_patterns: Iterable of substrings to look for in column names.

    Returns:
        The same DataFrame object with ``{col}_rolling_{window}`` columns.

    Raises:
        ValueError: If no column matches any pattern.
    """
    selected = [
        name
        for name in df.columns
        if any(pattern in name for pattern in col_patterns)
    ]
    if not selected:
        raise ValueError("No matching columns found")

    # `selected` was fixed above, so freshly added columns are never re-matched.
    for name in selected:
        source = df[name]
        for span in (3, 5, 10):
            df[f'{name}_rolling_{span}'] = source.rolling(window=span).mean()

    return df

def drop_and_impute_missing_values(df, missing_threshold):
    """Drop sparse columns, then impute the remaining missing values.

    Columns whose fraction of missing values exceeds ``missing_threshold``
    are removed. Remaining float64/int64 columns are filled with their
    median and object columns with their most frequent value. The input
    frame is not modified.

    Args:
        df: Input DataFrame.
        missing_threshold: Maximum allowed fraction of missing values per
            column.

    Returns:
        New DataFrame without the sparse columns and with gaps imputed.
    """
    missing_share = df.isnull().mean()
    sparse_columns = missing_share[missing_share > missing_threshold].index
    df = df.drop(columns=sparse_columns)

    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    cat_cols = df.select_dtypes(include=['object']).columns

    # Columns are reassigned rather than filled through a chained in-place
    # call, which acts on a temporary and is lost under Copy-on-Write.
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())

    for col in cat_cols:
        modes = df[col].mode()
        # An entirely missing column has no mode; leave it untouched instead
        # of failing on the lookup of the first mode.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    return df

def filter_aggregate_impute(df, min_date, max_date, date_col):
    """Average values per calendar month inside a date range.

    Rows with ``min_date <= date_col <= max_date`` are kept, grouped by the
    year and month of ``date_col`` and averaged. Only numeric and datetime
    columns are averaged; other columns are left out of the result. Missing
    group means are then filled with the column's mean across groups. The
    input frame is not modified.

    Args:
        df: Input DataFrame.
        min_date: Inclusive lower bound of the date range.
        max_date: Inclusive upper bound of the date range.
        date_col: Column parseable by ``pd.to_datetime``.

    Returns:
        DataFrame with 'year', 'month' and one mean column per aggregated
        input column.
    """
    # Parse before filtering so the range comparison is made on datetimes
    # even when the column holds strings.
    dates = pd.to_datetime(df[date_col])
    in_range = (dates >= min_date) & (dates <= max_date)

    # Explicit copy: the assignments below must not write to a slice of the
    # caller's frame.
    df_filtered = df.loc[in_range].copy()
    df_filtered[date_col] = dates[in_range]
    df_filtered['year'] = df_filtered[date_col].dt.year
    df_filtered['month'] = df_filtered[date_col].dt.month

    group_keys = ['year', 'month']
    # Restrict to columns a mean is defined for; averaging text columns raises.
    value_cols = [
        col
        for col in df_filtered.columns
        if col not in group_keys
        and (
            pd.api.types.is_numeric_dtype(df_filtered[col])
            or pd.api.types.is_datetime64_any_dtype(df_filtered[col])
        )
    ]

    grouped = df_filtered.groupby(group_keys)[value_cols].mean()

    grouped_imputed = grouped.apply(lambda x: x.fillna(x.mean()))

    return grouped_imputed.reset_index()

def calculate_token_frequencies(df, text_column):
    """Attach per-row token counts (stop words removed) for a text column.

    Each text is lower-cased and split into ``\\w+`` word tokens; tokens in
    ``ENGLISH_STOP_WORDS`` are discarded and the rest are counted.

    Args:
        df: Input DataFrame (left unmodified).
        text_column: Name of a string-typed column.

    Returns:
        Copy of ``df`` with a ``{text_column}_token_freq`` column holding one
        ``Counter`` per row.

    Raises:
        ValueError: If ``text_column`` is missing or not string-typed.
    """
    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in DataFrame")
    if not pd.api.types.is_string_dtype(df[text_column]):
        raise ValueError(f"Column '{text_column}' must be of string type")

    result = df.copy()
    word_pattern = r'\b\w+\b'

    result[f'{text_column}_token_freq'] = [
        Counter(
            word
            for word in re.findall(word_pattern, text.lower())
            if word not in ENGLISH_STOP_WORDS
        )
        for text in result[text_column]
    ]

    return result

def transform_and_standardize(df, factor):
    """Log-transform skewed numeric columns, then standardise all numerics.

    For every numeric column whose absolute skewness exceeds 1 a
    ``{col}_log`` column holding ``log1p`` of the values is added. All
    numeric columns (new ones included) are then standardised with
    ``StandardScaler``. The frame is modified in place and also returned.

    Args:
        df: DataFrame to transform.
        factor: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(df, skewness_dict)`` where ``skewness_dict`` maps every
        numeric column to its skewness after standardisation.
    """
    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    skewed_cols = [col for col in numeric_cols if abs(df[col].skew()) > 1]

    for col in skewed_cols:
        df[col + '_log'] = np.log1p(df[col])

    # Re-select so the freshly added *_log columns are standardised too.
    all_cols = df.select_dtypes(include='number').columns
    scaler = StandardScaler()
    df[all_cols] = scaler.fit_transform(df[all_cols])

    # Skewness is only defined for numeric data; iterating over every column
    # raised as soon as the frame also carried a text column.
    skewness_dict = {col: df[col].skew() for col in all_cols}

    return df, skewness_dict
```

File: data_splitting.py:

```Python
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

def group_proportion_sums(df, group_col, operation_col):
    """Return each category's share of the total of a numeric column.

    Args:
        df: Input DataFrame.
        group_col: Name of a ``category``-dtype column.
        operation_col: Name of the numeric column to sum.

    Returns:
        Dict mapping every category of ``group_col`` (unused categories
        included, with a sum of 0) to its group sum divided by the overall
        sum.

    Raises:
        ValueError: If ``group_col`` is missing or not categorical, or
            ``operation_col`` is missing or not numeric.
    """
    # isinstance on the dtype replaces the deprecated
    # pd.api.types.is_categorical_dtype.
    if group_col not in df.columns or not isinstance(df[group_col].dtype, pd.CategoricalDtype):
        raise ValueError(f"Column {group_col} is not present or not of categorical type")
    if operation_col not in df.columns or not pd.api.types.is_numeric_dtype(df[operation_col]):
        raise ValueError(f"Column {operation_col} is not present or not of numeric type")

    # observed=False pins the behaviour this function has had so far instead
    # of relying on the library default for categorical groupers.
    group_sums = df.groupby(group_col, observed=False)[operation_col].sum()

    group_proportions = group_sums / group_sums.sum()

    return group_proportions.to_dict()

def winsorize_column(df, column, lower_limit, upper_limit):
    """Clamp ``column`` to ``[lower_limit, upper_limit]``, keeping NaNs.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to modify.
        column: Name of the column to clamp.
        lower_limit: Values below this are raised to it.
        upper_limit: Values above this are lowered to it.

    Returns:
        The same DataFrame object with the clamped column.

    Raises:
        ValueError: If ``column`` is absent from ``df``.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' is not found in the DataFrame")

    was_missing = df[column].isna()

    # First raise the low tail, then lower the high tail.
    floored = np.where(df[column] < lower_limit, lower_limit, df[column])
    df[column] = floored
    capped = np.where(df[column] > upper_limit, upper_limit, df[column])
    df[column] = capped

    # Restore the entries that were missing to begin with.
    still_present = ~was_missing
    df[column] = df[column].where(still_present, np.nan)

    return df

def string_categorization_and_target_scaling(df, string_cols, categories, target_col):
    """Turn string columns into normalised substring flags and scale a target.

    For every string column and every category a ``{col}_{category}`` column
    is created that is 1 where the category occurs as a substring of the
    value's string form and 0 otherwise; the source column is dropped. Each
    flag column is then divided by its own sum, and ``{target_col}_scaled``
    receives the Min-Max scaled target. The frame is modified in place and
    also returned.

    Args:
        df: DataFrame to modify.
        string_cols: Names of the columns to categorise.
        categories: Substrings to look for.
        target_col: Name of the numeric column to scale.

    Returns:
        The same DataFrame object after the transformation.

    Raises:
        ValueError: If a string column is absent. Columns processed before
            the missing one have already been replaced at that point.
    """
    for col in string_cols:
        if col not in df.columns:
            raise ValueError(f"Column {col} is not in the dataframe.")

        for category in categories:
            df[f"{col}_{category}"] = df[col].map(lambda cell: int(category in str(cell)))

        df.drop(columns=[col], inplace=True)

    flag_names = [f"{col}_{category}" for col in string_cols for category in categories]
    for flag_name in flag_names:
        df[flag_name] = df[flag_name].div(df[flag_name].sum())

    target = df[target_col]
    lowest, highest = target.min(), target.max()
    df[f"{target_col}_scaled"] = (target - lowest) / (highest - lowest)

    return df

def discretize_and_correlate(df, target_col, feature_cols, bins):
    """Correlate features with one-hot indicators of a binned target.

    The target is cut into ``bins`` with ``pd.cut`` (integer codes), stored
    on ``df`` as 'binned_target', and one-hot encoded with the prefix
    'target_bin'. Every feature is then correlated (Pearson, via
    ``np.corrcoef``) with every indicator.

    Args:
        df: Input DataFrame; gains a 'binned_target' column.
        target_col: Name of the column to discretise.
        feature_cols: Names of the feature columns.
        bins: ``bins`` argument for ``pd.cut``.

    Returns:
        DataFrame with columns 'feature' (``{feature}_vs_{indicator}``) and
        'correlation'.
    """
    df['binned_target'] = pd.cut(df[target_col], bins=bins, labels=False)
    indicators = pd.get_dummies(df['binned_target'], prefix='target_bin')

    pairs = [(feature, indicator) for feature in feature_cols for indicator in indicators.columns]

    pair_names = [f'{feature}_vs_{indicator}' for feature, indicator in pairs]
    pair_correlations = [
        np.corrcoef(df[feature].values, indicators[indicator].values)[0, 1]
        for feature, indicator in pairs
    ]

    return pd.DataFrame({'feature': pair_names, 'correlation': pair_correlations})

def extract_datetime_components(df, datetime_col):
    """Add year/month/day/hour/minute columns derived from a datetime column.

    Args:
        df: Input DataFrame (left unmodified).
        datetime_col: Name of a datetime-typed column.

    Returns:
        Copy of ``df`` with 'year', 'month', 'day', 'hour' and 'minute'
        columns.

    Raises:
        ValueError: If ``datetime_col`` is missing or not datetime-typed.
        RuntimeError: If any derived column contains NaN.
    """
    column_is_usable = (
        datetime_col in df.columns
        and pd.api.types.is_datetime64_any_dtype(df[datetime_col])
    )
    if not column_is_usable:
        raise ValueError(f"Column '{datetime_col}' not found or is not a datetime column in DataFrame")

    components = ['year', 'month', 'day', 'hour', 'minute']
    result = df.copy()

    for part in components:
        result[part] = getattr(result[datetime_col].dt, part)

    for part in components:
        if result[part].isna().any():
            raise RuntimeError(f"Generated column '{part}' contains NaN values which is unexpected")

    return result

def encode_and_normalize(df, label_col):
    """One-hot encode object columns, Min-Max scale features, correlate to label.

    Args:
        df: Input DataFrame (left unmodified).
        label_col: Name of the numeric label column; it is excluded from
            scaling.

    Returns:
        Tuple ``(encoded_df, corr_matrix)`` where ``corr_matrix`` is the
        single-column frame of correlations between every other column and
        ``label_col``.

    Raises:
        ValueError: If ``label_col`` is not among the numeric columns of the
            encoded frame (raised by ``list.remove``).
    """
    object_columns = list(df.select_dtypes(include='object').columns)

    # drop_first=True omits the first level of each encoded column.
    encoded = pd.get_dummies(df, columns=object_columns, drop_first=True)

    feature_columns = list(encoded.select_dtypes(include='number').columns)
    feature_columns.remove(label_col)
    encoded[feature_columns] = MinMaxScaler().fit_transform(encoded[feature_columns])

    full_corr = encoded.corr()
    label_corr = full_corr[[label_col]].drop(label_col)

    return encoded, label_corr

def groupby_aggregate_and_diff(df, groupby_cols, agg_cols, diff_cols):
    """Aggregate per group and add differences between consecutive groups.

    After aggregation every key combination occupies exactly one row, so a
    difference taken within the full key set is always NaN. The difference
    is therefore taken between consecutive aggregated rows: across the whole
    result for a single grouping key, or within each combination of the
    leading keys (i.e. along the last key) when several keys are given.

    Args:
        df: Input DataFrame.
        groupby_cols: Column name or list of column names to group by.
        agg_cols: Aggregation spec passed to ``GroupBy.agg``.
        diff_cols: Aggregated columns for which a ``{col}_diff`` column is
            added.

    Returns:
        Aggregated DataFrame with the grouping keys as columns and the
        requested ``*_diff`` columns.
    """
    grouped_df = df.groupby(groupby_cols).agg(agg_cols).reset_index()

    if pd.api.types.is_list_like(groupby_cols):
        keys = list(groupby_cols)
    else:
        keys = [groupby_cols]
    leading_keys = keys[:-1]

    for col in diff_cols:
        if leading_keys:
            grouped_df[f'{col}_diff'] = grouped_df.groupby(leading_keys)[col].diff()
        else:
            grouped_df[f'{col}_diff'] = grouped_df[col].diff()

    return grouped_df

def resample_and_interpolate(df, datetime_col, freq):
    """Resample to a regular frequency and fill gaps by time interpolation.

    Each bucket holds the mean of its rows; empty buckets are filled with
    ``interpolate(method='time')``.

    Args:
        df: Input DataFrame.
        datetime_col: Name of a datetime-typed column used as the time axis.
        freq: Frequency passed to ``DataFrame.resample``.

    Returns:
        DataFrame with ``datetime_col`` restored as a regular column.

    Raises:
        ValueError: If ``datetime_col`` is missing or not datetime-typed.
    """
    if datetime_col not in df.columns:
        raise ValueError(f"Datetime column '{datetime_col}' is not in the dataframe.")
    if not pd.api.types.is_datetime64_any_dtype(df[datetime_col]):
        raise ValueError(f"Column '{datetime_col}' must be of datetime type.")

    return (
        df.set_index(datetime_col)
        .resample(freq)
        .mean()
        .interpolate(method='time')
        .reset_index()
    )

def category_based_scaling(df, category_col, num_cols, scaler_type):
    """Scale numeric columns separately within each category.

    Args:
        df: Input DataFrame (left unmodified).
        category_col: Name of the column defining the groups.
        num_cols: Names of the numeric columns to scale.
        scaler_type: 'standard' for ``StandardScaler`` or 'minmax' for
            ``MinMaxScaler``; a fresh scaler is fitted per category.

    Returns:
        Tuple ``(scaled_df, metadata_df)`` where ``metadata_df`` has one row
        per category with the columns 'category', 'scaler' and
        'scaled_columns'.

    Raises:
        ValueError: If ``category_col`` is missing, a numeric column is
            missing or not numeric, or ``scaler_type`` is not supported.
    """
    if category_col not in df.columns:
        raise ValueError("Category column not found in DataFrame")

    for col in num_cols:
        if col not in df.columns or not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Numeric column {col} not found or not numeric in DataFrame")

    if scaler_type not in ["standard", "minmax"]:
        raise ValueError("Invalid scaler type. Use 'standard' or 'minmax'")

    df = df.copy()
    if len(num_cols):
        # Scaler output is floating point; cast up front so the per-category
        # .loc writes below never put floats into integer columns.
        df[num_cols] = df[num_cols].astype(float)

    metadata = []

    for category in df[category_col].unique():
        category_mask = df[category_col] == category
        if scaler_type == "standard":
            scaler = StandardScaler()
        else:
            scaler = MinMaxScaler()

        scaled_values = scaler.fit_transform(df.loc[category_mask, num_cols])
        df.loc[category_mask, num_cols] = scaled_values

        metadata.append({
            "category": category,
            "scaler": scaler_type,
            "scaled_columns": num_cols
        })

    return df, pd.DataFrame(metadata)

def merge_and_concatenate(df, cols_to_merge, id_col):
    """Collapse rows sharing an id by space-joining their column values.

    The frame is melted to long form and pivoted back with an aggregation
    that joins the string forms of all non-missing values with a single
    space.

    Args:
        df: Input DataFrame.
        cols_to_merge: List of column names whose values are concatenated.
        id_col: Name of the identifier column.

    Returns:
        DataFrame with ``id_col`` and one ``{col}_merged`` column per merged
        column.

    Raises:
        ValueError: If ``id_col`` or any column to merge is absent.
    """
    absent = [name for name in [id_col] + cols_to_merge if name not in df.columns]
    if absent:
        raise ValueError(f"The following columns are not in the DataFrame: {absent}")

    def _join_present(values):
        # Skip missing entries and join the rest in their original order.
        return ' '.join(str(item) for item in values if pd.notna(item))

    long_form = pd.melt(
        df,
        id_vars=[id_col],
        value_vars=cols_to_merge,
        var_name='original_col',
        value_name='value',
    )

    wide_form = long_form.pivot_table(
        index=id_col,
        columns='original_col',
        values='value',
        aggfunc=_join_present,
    )
    wide_form.columns = [f"{name}_merged" for name in wide_form.columns]

    return wide_form.reset_index()

def split_and_balance_dataset(df, target_col, ratio):
    """Split ``df`` into train/test parts stratified on a numeric target.

    Args:
        df: Input DataFrame.
        target_col: Name of the numeric column to stratify on.
        ratio: ``test_size`` passed to ``train_test_split``.

    Returns:
        Tuple ``(train_df, test_df)``.

    Raises:
        ValueError: If ``target_col`` is missing or not numeric.
    """
    if target_col not in df.columns:
        raise ValueError(f"Column '{target_col}' not found in DataFrame")

    target = df[target_col]
    if not pd.api.types.is_numeric_dtype(target):
        raise ValueError(f"Column '{target_col}' must be numeric")

    train_part, test_part = train_test_split(df, test_size=ratio, stratify=target)

    return train_part, test_part
```

File: feature_generation.py:

```Python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler

def scale_within_categories(df, cat_col, num_col, scale_type):
    """Scale a numeric column separately inside each category.

    Args:
        df: Input DataFrame (left unmodified).
        cat_col: Name of the grouping column.
        num_col: Name of the numeric column to scale.
        scale_type: 'min-max' for ``MinMaxScaler`` or 'standard' for
            ``StandardScaler``; the scaler is refitted for every group.

    Returns:
        Copy of ``df`` with a ``{num_col}_{scale_type}`` column.

    Raises:
        ValueError: If a column is missing, ``num_col`` is not numeric, or
            ``scale_type`` is unknown.
    """
    both_present = cat_col in df.columns and num_col in df.columns
    if not both_present:
        raise ValueError(f"Columns '{cat_col}' or '{num_col}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[num_col]):
        raise ValueError(f"Column '{num_col}' must be numeric")

    result = df.copy()
    output_name = f"{num_col}_{scale_type}"

    if scale_type == 'min-max':
        scaler = MinMaxScaler()
    elif scale_type == 'standard':
        scaler = StandardScaler()
    else:
        raise ValueError(f"Unknown scaling type: {scale_type}")

    def _scale_group(group_values):
        # Scalers expect a 2-D array; flatten back to match the group shape.
        as_column = group_values.values.reshape(-1, 1)
        return scaler.fit_transform(as_column).flatten()

    result[output_name] = result.groupby(cat_col)[num_col].transform(_scale_group)

    return result

def aggregate_by_category(df, id_column, category_column):
    """Summarise all remaining columns per (id, category) pair.

    Every non-key column is aggregated with count, sum, mean and std; the
    resulting two-level column index is flattened to ``{column}_{stat}``.

    Args:
        df: Input DataFrame.
        id_column: Name of the first grouping column.
        category_column: Name of the second grouping column.

    Returns:
        DataFrame with the two keys as columns followed by the flattened
        statistic columns.
    """
    keys = [id_column, category_column]
    summary = df.groupby(keys).agg(['count', 'sum', 'mean', 'std'])

    summary.columns = ['_'.join(parts).strip() for parts in summary.columns.to_flat_index()]

    return summary.reset_index()

def quantile_labels(df, target_col, quantiles):
    """Label every numeric feature as below/above each requested quantile.

    For each numeric column other than ``target_col`` and each quantile, a
    string column ``{col}_quantile_{q:.2f}`` is produced that is
    'BelowQuantile' for values up to and including the quantile value and
    'AboveQuantile' otherwise. Quantiles are computed on non-missing values.

    Args:
        df: Input DataFrame.
        target_col: Numeric column to exclude from labelling.
        quantiles: Sequence of quantile levels in [0, 1].

    Returns:
        New DataFrame holding only the derived label columns.
    """
    feature_columns = df.select_dtypes(include='number').columns.drop(target_col)
    side_labels = ['BelowQuantile', 'AboveQuantile']
    labelled = pd.DataFrame()

    for name in feature_columns:
        series = df[name]
        cut_points = np.quantile(series.dropna(), quantiles)

        for level, cut_point in zip(quantiles, cut_points):
            sides = pd.cut(
                series,
                bins=[-np.inf, cut_point, np.inf],
                labels=side_labels,
                include_lowest=True,
            )
            labelled[f'{name}_quantile_{level:.2f}'] = sides.astype(str)

    return labelled

def apply_column_transformations(df, column_specs):
    """Apply a per-column transformation and verify the result is finite.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to transform.
        column_specs: Mapping of column name to 'scale' (``StandardScaler``),
            'normalize' (``MinMaxScaler``) or a callable applied element-wise.

    Returns:
        The same DataFrame object after all transformations.

    Raises:
        ValueError: If a column is missing, a transformation is not
            recognised, or the frame contains NaN (any column) or infinite
            values (numeric columns) afterwards.
    """
    for col in column_specs:
        if col not in df.columns:
            raise ValueError(f"Column {col} specified in column_specs is not present in the DataFrame")

    for col, transform in column_specs.items():
        if transform == 'scale':
            scaler = StandardScaler()
            df[col] = scaler.fit_transform(df[[col]])
        elif transform == 'normalize':
            scaler = MinMaxScaler()
            df[col] = scaler.fit_transform(df[[col]])
        elif callable(transform):
            df[col] = df[col].apply(transform)
        else:
            raise ValueError(f"Invalid transformation specified for column {col}")

    # np.isinf is undefined for text/object data, so the infinity check is
    # limited to numeric columns; running it on the whole frame raised a
    # TypeError as soon as any non-numeric column was present.
    numeric_values = df.select_dtypes(include='number').to_numpy(dtype=float, na_value=np.nan)
    if df.isnull().any().any() or np.isinf(numeric_values).any():
        raise ValueError("Transformation resulted in NaN or inf values")

    return df

def group_based_z_score(df, group_col, target_col):
    """Replace a numeric column by its within-group z-score.

    The original values are preserved in ``{target_col}_backup``. Z-scores
    use the sample standard deviation (``ddof=1``). The frame is modified in
    place and also returned.

    Args:
        df: DataFrame to modify.
        group_col: Name of the grouping column.
        target_col: Name of the numeric column to standardise.

    Returns:
        The same DataFrame object with the backup and standardised columns.

    Raises:
        ValueError: If a column is missing or ``target_col`` is not numeric.
    """
    both_present = group_col in df.columns and target_col in df.columns
    if not both_present:
        raise ValueError(f"Either '{group_col}' or '{target_col}' column not found in the DataFrame")

    if not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"The column '{target_col}' must be numeric")

    def _sample_zscore(group_values):
        return zscore(group_values, ddof=1)

    backup_name = target_col + '_backup'
    df[backup_name] = df[target_col]
    df[target_col] = df.groupby(group_col)[target_col].transform(_sample_zscore)

    return df

def bin_and_encode_numeric_features(df, numeric_threshold):
    """Quantile-bin every numeric column and append one-hot bin indicators.

    Bin edges are the ``numeric_threshold + 1`` evenly spaced quantiles of
    the column, computed on non-missing values. Repeated edges (common with
    heavily tied data) are collapsed, so such a column yields fewer bins. A
    column with fewer than two distinct edges (constant or entirely missing)
    gets no indicator columns.

    Args:
        df: Input DataFrame (left unmodified).
        numeric_threshold: Requested number of quantile bins per column.

    Returns:
        New DataFrame: the original columns followed by the
        ``{col}_binned_{interval}`` indicator columns.
    """
    numeric_cols = df.select_dtypes(include='number').columns
    new_df = df.copy()
    probabilities = np.linspace(0, 1, numeric_threshold + 1)

    parts = [new_df]
    for col in numeric_cols:
        values = new_df[col].to_numpy(dtype=float, na_value=np.nan)
        if np.isnan(values).all():
            continue

        # nanquantile ignores missing values; unique() removes repeated
        # edges, which pd.cut rejects.
        edges = np.unique(np.nanquantile(values, probabilities))
        if len(edges) < 2:
            continue

        binned = pd.cut(new_df[col], bins=edges, include_lowest=True)
        parts.append(pd.get_dummies(binned, prefix=col + '_binned'))

    return pd.concat(parts, axis=1)

def generate_time_series_features(df, id_col, target_col):
    """Add lag, cumulative and calendar features for a time-indexed frame.

    The frame is sorted by its index and modified in place, and also
    returned.

    Added columns:
        * ``{target_col}_lag_1..3``: the target shifted by 1-3 rows over the
          whole frame (not per ``id_col``).
        * ``{target_col}_cum_sum`` / ``{target_col}_cum_mean``: running sum
          and running mean of the target within each ``id_col`` group.
        * 'day_of_week', 'week_of_year', 'month_of_year': only when the index
          is a ``DatetimeIndex``.

    Args:
        df: DataFrame to extend.
        id_col: Name of the entity identifier column.
        target_col: Name of the numeric target column.

    Returns:
        The same DataFrame object with the new feature columns.
    """
    df.sort_index(inplace=True)

    for lag in range(1, 4):
        df[f'{target_col}_lag_{lag}'] = df[target_col].shift(lag)

    df[f'{target_col}_cum_sum'] = df.groupby(id_col)[target_col].cumsum()
    # transform() returns values aligned to the original rows. The earlier
    # groupby().expanding() form had to be re-aligned by index label, which
    # fails when the same label (e.g. a timestamp) appears for several ids.
    df[f'{target_col}_cum_mean'] = df.groupby(id_col)[target_col].transform(
        lambda series: series.expanding().mean()
    )

    if isinstance(df.index, pd.DatetimeIndex):
        df['day_of_week'] = df.index.day_of_week
        df['week_of_year'] = df.index.isocalendar().week
        df['month_of_year'] = df.index.month

    return df

def normalize_grouped_statistics(df, grouping_col, value_col):
    """Compute per-group sum/mean/std and Min-Max scale each statistic.

    Args:
        df: Input DataFrame.
        grouping_col: Name of the grouping column.
        value_col: Name of the column to summarise.

    Returns:
        DataFrame with ``grouping_col`` and the scaled 'sum', 'mean' and
        'std' columns, one row per group.

    Raises:
        ValueError: If either column is absent from ``df``.
    """
    if any(name not in df.columns for name in (grouping_col, value_col)):
        raise ValueError(f"Columns '{grouping_col}' or '{value_col}' not found in DataFrame")

    stat_names = ['sum', 'mean', 'std']
    group_stats = df.groupby(grouping_col)[value_col].agg(stat_names).reset_index()

    # Each statistic column is scaled independently across groups.
    group_stats[stat_names] = MinMaxScaler().fit_transform(group_stats[stat_names])

    return group_stats

def rolling_mean_normalize(df, num_col):
    """Add a Min-Max scaled 5-row rolling mean of ``num_col``.

    The leading rows that lack a full window are back-filled from the first
    available rolling mean before scaling. The frame is modified in place and
    also returned.

    Args:
        df: DataFrame to extend.
        num_col: Name of the numeric source column.

    Returns:
        The same DataFrame object with a ``{num_col}_rolling_mean`` column
        scaled to [0, 1]. If the rolling mean is constant the column is all
        zeros; with fewer than five rows it is all NaN.

    Raises:
        ValueError: If ``num_col`` is missing or not numeric.
    """
    if num_col not in df.columns:
        raise ValueError(f"Column '{num_col}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[num_col]):
        raise ValueError(f"Column '{num_col}' must be numeric")

    rolling_mean_col = f"{num_col}_rolling_mean"
    # .bfill() replaces the deprecated fillna(method='bfill').
    smoothed = df[num_col].rolling(window=5).mean().bfill()

    min_val = smoothed.min()
    max_val = smoothed.max()
    value_span = max_val - min_val

    if value_span == 0:
        # Constant rolling mean: report the lower bound instead of 0/0 = NaN.
        df[rolling_mean_col] = (smoothed - min_val) * 0.0
    else:
        df[rolling_mean_col] = (smoothed - min_val) / value_span

    return df

def top_n_within_category(df, cat_col, num_col, n):
    """Return the ``n`` rows with the largest ``num_col`` in each category.

    Ties are broken by order of appearance (``rank(method='first')``). The
    input frame is not modified.

    Args:
        df: Input DataFrame.
        cat_col: Name of the grouping column.
        num_col: Name of the numeric column to rank by.
        n: Number of rows to keep per category.

    Returns:
        New DataFrame sorted by category and descending value, with a fresh
        integer index. It never contains a 'rank' column.

    Raises:
        ValueError: If ``cat_col`` is missing, or ``num_col`` is missing or
            not numeric.
    """
    if cat_col not in df.columns:
        raise ValueError(f"Column '{cat_col}' is not found in the DataFrame")
    if num_col not in df.columns or not np.issubdtype(df[num_col].dtype, np.number):
        raise ValueError(f"Column '{num_col}' is not found in the DataFrame or is not numeric")

    ranks = df.groupby(cat_col)[num_col].rank(method='first', ascending=False)

    # assign() attaches the helper column to a copy; writing it to `df`
    # itself used to leave a stray 'rank' column on the caller's frame.
    ranked = df.assign(rank=ranks)
    top_n_df = ranked[ranked['rank'] <= n].sort_values([cat_col, 'rank']).reset_index(drop=True)

    return top_n_df.drop(columns=['rank'])

def encode_top_values(df, target_column, k, method):
    """Add 0/1 indicator columns for the ``k`` most frequent values.

    The frame is modified in place and also returned. Both supported methods
    currently produce the same 0/1 indicator.

    Args:
        df: DataFrame to extend.
        target_column: Name of the column whose values are encoded.
        k: Number of most frequent values to encode.
        method: 'binary' or 'count'.

    Returns:
        The same DataFrame object with one ``{target_column}_{value}`` column
        per encoded value.

    Raises:
        ValueError: If ``method`` is neither 'binary' nor 'count'.
    """
    top_values = df[target_column].value_counts().nlargest(k).index

    # Validate once, before the loop: checking inside the loop let an invalid
    # method pass silently whenever there were no top values to iterate over.
    if method not in ('binary', 'count'):
        raise ValueError("Method should be either 'binary' or 'count'")

    source = df[target_column]
    for value in top_values:
        df[f'{target_column}_{value}'] = (source == value).astype(int)

    return df
```

File: category_processing.py:

```Python
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import entropy
from sklearn.preprocessing import StandardScaler

def handle_outliers(df, columns, strategy):
    """Remove or cap IQR outliers in the given numeric columns.

    Bounds are ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR``. Columns are processed
    one after another; with 'remove', the quartiles of later columns are
    computed on the rows that survived earlier columns. The input frame is
    not modified for either strategy.

    Args:
        df: Input DataFrame.
        columns: Names of the numeric columns to process.
        strategy: 'remove' to drop rows outside the bounds, 'cap' to clip
            values to the bounds.

    Returns:
        New DataFrame with outliers removed or capped.

    Raises:
        ValueError: If ``strategy`` is invalid, or a column is missing or
            not numeric.
    """
    if strategy not in ['remove', 'cap']:
        raise ValueError("Strategy must be 'remove' or 'cap'")

    for col in columns:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")

        if not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Column '{col}' must be numeric type")

    if strategy == 'cap':
        # 'remove' already rebinds to a filtered frame; copy here so capping
        # does not overwrite the caller's data either.
        df = df.copy()

    for col in columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        if strategy == 'remove':
            df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
        else:
            df[col] = df[col].clip(lower_bound, upper_bound)

    return df

def create_pivot_table(df, cols_to_pivot, agg_func):
    """Pivot ``df`` on categorical columns with margins.

    The first entry of ``cols_to_pivot`` becomes the row index and the
    remaining entries become the column keys.

    Args:
        df: Input DataFrame.
        cols_to_pivot: Names of ``category``-dtype columns.
        agg_func: Aggregation passed to ``DataFrame.pivot_table``.

    Returns:
        The pivot table (including margins) with its index reset.

    Raises:
        ValueError: If a pivot column is missing or not of ``category``
            dtype.
    """
    for col in cols_to_pivot:
        # isinstance on the dtype replaces the deprecated
        # pd.api.types.is_categorical_dtype.
        if col not in df.columns or not isinstance(df[col].dtype, pd.CategoricalDtype):
            raise ValueError(f"Column '{col}' not found or is not categorical in DataFrame")

    pivot_df = df.pivot_table(index=cols_to_pivot[0], columns=cols_to_pivot[1:], aggfunc=agg_func, margins=True)

    return pivot_df.reset_index()

def sensor_data_anomaly_detection(sensor_data, id_col, timestamp_col, measurement_col):
    """Interpolate sensor readings and flag deviations from a moving average.

    Rows are ordered by sensor and time. Per sensor, gaps in the measurement
    are filled by linear interpolation, the rate of change per second is
    derived, and a reading is flagged when it differs from the 5-row moving
    average by more than twice the sensor's standard deviation.

    Args:
        sensor_data: Input DataFrame.
        id_col: Name of the sensor identifier column.
        timestamp_col: Name of a datetime-typed timestamp column.
        measurement_col: Name of the numeric measurement column.

    Returns:
        Sorted DataFrame with the interpolated measurement plus
        'rate_of_change' and integer 'anomaly_flag' columns.
    """
    ordered = sensor_data.sort_values(by=[id_col, timestamp_col])

    def _fill_gaps(series):
        return series.interpolate(method='linear')

    def _five_row_mean(series):
        return series.rolling(window=5).mean()

    ordered[measurement_col] = ordered.groupby(id_col)[measurement_col].transform(_fill_gaps)

    value_step = ordered.groupby(id_col)[measurement_col].diff()
    seconds_step = ordered.groupby(id_col)[timestamp_col].diff().dt.total_seconds()
    ordered['rate_of_change'] = value_step / seconds_step

    ordered['moving_avg'] = ordered.groupby(id_col)[measurement_col].transform(_five_row_mean)
    deviation = (ordered[measurement_col] - ordered['moving_avg']).abs()
    per_sensor_std = ordered.groupby(id_col)[measurement_col].transform('std')
    # Comparisons against a NaN moving average are False, i.e. not flagged.
    ordered['anomaly_flag'] = (deviation > 2 * per_sensor_std).astype(int)

    return ordered.drop(columns=['moving_avg'])

def numerical_clustering(df, numerical_cols, categories):
    """Cluster each numeric column on its own with K-Means.

    For every column in ``numerical_cols`` a separate one-dimensional K-Means
    model with ``categories`` clusters (``random_state=42``) is fitted, and
    its labels are stored in ``<col>_cluster``.

    Args:
        df: Input DataFrame (not modified; a copy is returned).
        numerical_cols: Names of numeric columns to cluster.
        categories: Number of clusters per column.

    Returns:
        A copy of ``df`` with one ``<col>_cluster`` column per input column.

    Raises:
        ValueError: If a column is missing or not numeric.
    """
    for col in numerical_cols:
        if col not in df.columns or not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Column '{col}' must exist and be of numeric type")

    # KMeans is not among this module's top-level imports, so it is imported
    # here to keep the function from failing with a NameError.
    from sklearn.cluster import KMeans

    df = df.copy()
    for col in numerical_cols:
        kmeans = KMeans(n_clusters=categories, random_state=42)
        df[f'{col}_cluster'] = kmeans.fit_predict(df[[col]])

    return df

def group_aggregate_and_merge(df, group_columns, agg_dict):
    """Aggregate per group and merge the results back onto every row.

    Args:
        df: Input DataFrame.
        group_columns: Column name(s) to group by.
        agg_dict: Mapping passed to ``GroupBy.agg``.

    Returns:
        ``df`` left-merged with the per-group aggregates plus ``group_size``
        (number of rows in the group) and ``group_percent`` (that size as a
        percentage of all rows). Aggregated columns whose names clash with
        original columns get pandas' default ``_x``/``_y`` suffixes.
    """
    grouped_df = df.groupby(group_columns).agg(agg_dict).reset_index()

    # Sizes must be counted on the original rows; counting on the aggregated
    # frame (one row per group) would always yield 1.
    group_size = df.groupby(group_columns).size().reset_index(name='group_size')
    total_size = df.shape[0]
    group_size['group_percent'] = group_size['group_size'] / total_size * 100

    grouped_df = pd.merge(grouped_df, group_size, on=group_columns)
    return pd.merge(df, grouped_df, on=group_columns, how='left')

def compute_tf_idf(df, text_col, word_list):
    """Append TF-IDF scores for a fixed vocabulary to a copy of ``df``.

    Args:
        df: Input DataFrame (not modified).
        text_col: Name of the column holding the text documents.
        word_list: Vocabulary handed to ``TfidfVectorizer``.

    Returns:
        A copy of ``df`` with one TF-IDF column per vocabulary term.
    """
    # TfidfVectorizer is not among this module's top-level imports, so it is
    # imported here to keep the function from failing with a NameError.
    from sklearn.feature_extraction.text import TfidfVectorizer

    df = df.copy()

    vectorizer = TfidfVectorizer(vocabulary=word_list)
    tf_idf_matrix = vectorizer.fit_transform(df[text_col])

    # Reuse df's index so concat lines rows up correctly even when df does not
    # carry a default 0..n-1 index.
    tf_idf_df = pd.DataFrame(
        tf_idf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index,
    )

    return pd.concat([df, tf_idf_df], axis=1)

def identify_low_entropy_columns(df, categorical_cols, target_col, threshold=0.5):
    """Return the categorical columns whose target entropy is below a threshold.

    For each column a contingency table against ``target_col`` is built; the
    base-2 entropy of every row (one row per category) is computed and the
    unweighted mean of those row entropies is compared with ``threshold``.

    Args:
        df: Input DataFrame.
        categorical_cols: Names of categorical-dtype columns to evaluate.
        target_col: Name of the categorical-dtype target column.
        threshold: Columns with a mean entropy strictly below this are returned.

    Returns:
        List of qualifying column names, in input order.

    Raises:
        ValueError: If a column is missing or not of categorical dtype.
    """
    def _is_categorical(name):
        # isinstance on the dtype replaces the deprecated is_categorical_dtype.
        return name in df.columns and isinstance(df[name].dtype, pd.CategoricalDtype)

    for col in categorical_cols:
        if not _is_categorical(col):
            raise ValueError(f"Column {col} is not present or not categorical")

    if not _is_categorical(target_col):
        raise ValueError(f"Target column {target_col} is not present or not categorical")

    low_entropy_columns = []
    for col in categorical_cols:
        freq_table = pd.crosstab(df[col], df[target_col])
        # The module imports `stats` from scipy, not a bare `entropy` name.
        row_entropy = stats.entropy(freq_table.to_numpy(), axis=1, base=2)
        if row_entropy.mean() < threshold:
            low_entropy_columns.append(col)

    return low_entropy_columns

def equal_width_binning(df, col_to_bin, n_bins):
    """Bin a column into ``n_bins`` equal-width intervals labelled 0..n_bins-1.

    Returns:
        A copy of ``df`` with the extra column ``<col_to_bin>_binned``.
    """
    binned = df.copy()
    target_name = col_to_bin + '_binned'
    binned[target_name] = pd.cut(
        binned[col_to_bin],
        bins=n_bins,
        labels=range(n_bins),
    )
    return binned

def hierarchical_aggregation_and_scaling(df, cat_col, agg_dict):
    """Aggregate per group and standardise the aggregated columns.

    Args:
        df: Input DataFrame.
        cat_col: Column name, or list of names, to group by.
        agg_dict: Mapping passed to ``GroupBy.agg``.

    Returns:
        DataFrame indexed by the group keys whose columns are the aggregated
        values after ``StandardScaler``. Multi-level column names produced by
        ``agg`` are flattened by joining their parts with ``_``.
    """
    agg_df = df.groupby(cat_col).agg(agg_dict)

    # Flatten the columns while the group keys still live in the index. This
    # way only aggregated values reach the scaler, however many grouping
    # columns were given.
    agg_df.columns = [
        '_'.join(col).strip() if isinstance(col, tuple) else col
        for col in agg_df.columns.values
    ]

    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(agg_df)

    return pd.DataFrame(scaled_values, columns=agg_df.columns, index=agg_df.index)

def filter_and_summarize_datetime_columns(df, datetime_cols, operation_type, threshold):
    """Filter rows on datetime columns and report the surviving row counts.

    Args:
        df: Input DataFrame.
        datetime_cols: Names of datetime columns, processed in order; each
            filter is applied to the output of the previous one.
        operation_type: ``"count"`` keeps rows whose value occurs at least
            ``threshold[col]`` times; ``"range"`` keeps rows whose value lies
            within the inclusive ``(start, end)`` pair in ``threshold[col]``.
        threshold: Mapping from column name to the per-column threshold.

    Returns:
        Tuple ``(filtered_df, summary_df)``. ``summary_df`` has one row per
        column with the keys ``column``, ``operation`` and ``remaining_rows``.

    Raises:
        ValueError: If a column is missing or not datetime, or if
            ``operation_type`` is unsupported.
    """
    for col in datetime_cols:
        if col not in df.columns or not pd.api.types.is_datetime64_any_dtype(df[col]):
            raise ValueError(f"Column {col} is not present or not of datetime type")

    if operation_type not in ["count", "range"]:
        raise ValueError("Operation type must be either 'count' or 'range'")

    summary_report = []

    if operation_type == "count":
        for col in datetime_cols:
            # Map every value to its occurrence count. Missing timestamps map
            # to NaN and are filtered out, instead of raising KeyError as a
            # per-row lookup in value_counts() would.
            occurrences = df[col].map(df[col].value_counts())
            df = df[occurrences >= threshold[col]]
            summary_report.append({"column": col, "operation": "count", "remaining_rows": len(df)})

    elif operation_type == "range":
        for col in datetime_cols:
            start, end = threshold[col]
            df = df[(df[col] >= start) & (df[col] <= end)]
            summary_report.append({"column": col, "operation": "range", "remaining_rows": len(df)})

    return df, pd.DataFrame(summary_report)

def categorical_transform(df, categories, target):
    """Add frequency, proportion and target-mean encodings for each category column.

    For every column ``c`` in ``categories`` three columns are written into
    ``df`` itself: ``c_freq`` (occurrence count of the row's value),
    ``c_prop`` (that count divided by the number of non-null values) and
    ``c_target_enc`` (mean of ``target`` over rows sharing the value).

    Returns:
        The same ``df`` object, modified in place.
    """
    for cat in categories:
        counts = df[cat].value_counts()
        n_counted = counts.sum()

        freq_name = f"{cat}_freq"
        df[freq_name] = df[cat].map(counts)
        df[f"{cat}_prop"] = df[freq_name] / n_counted

        mean_by_value = df.groupby(cat)[target].mean()
        df[f"{cat}_target_enc"] = df[cat].map(mean_by_value)

    return df
```

File: categorical_handling.py:

```Python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
from itertools import combinations

def add_normalized_interaction_term(df, col1, col2, new_col_name):
    """Add the min-max-scaled product of two numeric columns to ``df``.

    The pairwise correlation of the two columns is printed for information
    only. ``df`` is modified in place and also returned.

    Raises:
        ValueError: If either column is missing or not numeric.
    """
    for name in (col1, col2):
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Column '{name}' must be numeric")

    # Informational output; the value is not used further below.
    pair_corr = df[[col1, col2]].corr().iloc[0, 1]
    print(f"Correlation between {col1} and {col2}: {pair_corr}")

    # Raw product first, then overwrite it with its min-max-scaled version.
    df[new_col_name] = df[col1] * df[col2]
    df[new_col_name] = MinMaxScaler().fit_transform(df[[new_col_name]])

    return df

def rolling_window_diff(df, sequence_col, window_size):
    """Add rolling mean/std/min/max and a first difference for one column.

    Args:
        df: Input DataFrame (not modified; a copy is returned).
        sequence_col: Name of the column to analyse.
        window_size: Rolling window length.

    Returns:
        A copy of ``df`` with ``<col>_rolling_mean``, ``<col>_rolling_std``,
        ``<col>_rolling_min``, ``<col>_rolling_max`` and ``<col>_diff``.
        Missing rolling values are imputed with the mean of that statistic;
        the missing first difference is set to 0.
    """
    df = df.copy()

    window = df[sequence_col].rolling(window=window_size)
    for stat_name in ('mean', 'std', 'min', 'max'):
        values = getattr(window, stat_name)()
        # Assign the filled result back. A chained
        # `df[col].fillna(..., inplace=True)` only touches a temporary under
        # copy-on-write and would leave the NaNs in place.
        df[f'{sequence_col}_rolling_{stat_name}'] = values.fillna(values.mean())

    # Differencing to remove trend
    df[f'{sequence_col}_diff'] = df[sequence_col].diff().fillna(0)

    return df

def add_hierarchical_cumsum_rank(df, hierarchy_cols, value_col):
    """Add the within-group cumulative sum and rank of ``value_col``.

    Groups are defined by ``hierarchy_cols``. The columns
    ``<value_col>_cumsum`` and ``<value_col>_rank`` are written into ``df``
    itself, which is also returned.

    Raises:
        ValueError: If a column is missing or ``value_col`` is not numeric.
    """
    for name in list(hierarchy_cols) + [value_col]:
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[value_col]):
        raise ValueError(f"Column '{value_col}' must be numeric")

    values_by_group = df.groupby(hierarchy_cols)[value_col]

    df[f'{value_col}_cumsum'] = values_by_group.cumsum()
    df[f'{value_col}_rank'] = values_by_group.rank()

    return df

def stratified_train_test_split(df, train_size, target_col):
    """Split ``df`` into train and test frames, stratified on ``target_col``.

    Args:
        df: Input DataFrame.
        train_size: Fraction of rows for the training set; must satisfy
            ``0 < train_size < 1``.
        target_col: Column whose class proportions are preserved.

    Returns:
        Tuple ``(train_df, test_df)``; both keep ``target_col``.

    Raises:
        ValueError: If the target column is missing or ``train_size`` is out
            of range.
    """
    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not present in DataFrame")

    size_in_range = 0 < train_size < 1
    if not size_in_range:
        raise ValueError("Train size must be a float between 0 and 1")

    labels = df[target_col]
    parts = train_test_split(df, labels, train_size=train_size, stratify=labels)
    train_rows, test_rows, train_labels, test_labels = parts

    train_df = train_rows.copy()
    test_df = test_rows.copy()
    train_df[target_col] = train_labels
    test_df[target_col] = test_labels

    return train_df, test_df

def augment_stock_features(stock_data, feature_list, window):
    """Add min-max-normalised rolling mean and std columns for each feature.

    For every feature ``f`` the columns ``f_rolling_mean`` and
    ``f_rolling_std`` are written into ``stock_data`` itself. Each holds the
    rolling statistic rescaled with ``(x - min) / (max - min)``.

    NOTE(review): a rolling statistic that is constant makes ``max - min``
    zero, so the normalised column is then not finite.

    Returns:
        The same ``stock_data`` object, modified in place.
    """
    def _min_max(series):
        return (series - series.min()) / (series.max() - series.min())

    for feature in feature_list:
        roller = stock_data[feature].rolling(window=window)
        mean_values = roller.mean()
        std_values = roller.std()

        stock_data[f"{feature}_rolling_mean"] = _min_max(mean_values)
        stock_data[f"{feature}_rolling_std"] = _min_max(std_values)

    return stock_data

def pairwise_multiplication(df, target_col):
    """Build products of every pair of numeric columns, floored at zero.

    Args:
        df: Input DataFrame.
        target_col: Numeric column kept as-is and excluded from the pairs.

    Returns:
        New DataFrame holding ``target_col`` plus one ``<a>_x_<b>`` column per
        pair of the remaining numeric columns. Products that are not strictly
        positive, including missing ones, are replaced with 0.
    """
    numerical_cols = df.select_dtypes(include='number').columns.drop(target_col)
    new_df = df[[target_col]].copy()

    for col1, col2 in combinations(numerical_cols, 2):
        product = df[col1] * df[col2]
        # Vectorised replacement for a per-element lambda: keep strictly
        # positive products, zero everything else (NaN compares False, so it
        # is zeroed too).
        new_df[f'{col1}_x_{col2}'] = product.where(product > 0, 0)

    return new_df

def mode_of_sparse_columns(df, id_col, new_col):
    """Store the per-id mode of the object column with the fewest distinct values.

    Among all object-dtype columns the one with the smallest ``nunique()`` is
    selected (the first such column wins ties). Its most frequent value within
    each ``id_col`` group is written to ``new_col`` in ``df`` itself.

    Args:
        df: Input DataFrame; modified in place.
        id_col: Column identifying the groups.
        new_col: Name of the column receiving the group modes.

    Returns:
        The same ``df`` object.

    Raises:
        ValueError: If ``id_col`` is missing or ``df`` has no object-dtype
            column.
    """
    if id_col not in df.columns:
        raise ValueError(f"Column '{id_col}' not found in DataFrame")

    unique_counts = {col: df[col].nunique() for col in df.columns if df[col].dtype == 'O'}
    if not unique_counts:
        # min() on an empty mapping would raise an uninformative ValueError.
        raise ValueError("DataFrame has no object-dtype columns to compute a mode from")

    sparse_column = min(unique_counts, key=unique_counts.get)

    def _group_mode(values):
        modes = values.mode()  # computed once per group
        return modes.iloc[0] if not modes.empty else None

    df[new_col] = df.groupby(id_col)[sparse_column].transform(_group_mode)

    return df

def apply_rolling_statistics(df, window_size, min_periods):
    """Add rolling means for numeric columns and rolling counts for datetime columns.

    Returns:
        A copy of ``df`` with ``<col>_rolling_mean_<window_size>`` for each
        numeric column and ``<col>_rolling_count_<window_size>`` for each
        datetime column.

    Raises:
        ValueError: If ``df`` is empty or has neither numeric nor datetime
            columns.
    """
    if df.empty:
        raise ValueError("Input dataframe is empty")

    numeric_columns = df.select_dtypes(include=['number']).columns.tolist()
    datetime_columns = df.select_dtypes(include=['datetime']).columns.tolist()
    if not (numeric_columns or datetime_columns):
        raise ValueError("Dataframe must have at least one numeric or datetime column")

    result = df.copy()

    def _window(name):
        return result[name].rolling(window=window_size, min_periods=min_periods)

    for name in numeric_columns:
        result[f'{name}_rolling_mean_{window_size}'] = _window(name).mean()

    # Rolling count of non-null values for datetime columns.
    for name in datetime_columns:
        result[f'{name}_rolling_count_{window_size}'] = _window(name).count()

    return result

def mark_rare_categories(df, categorical_columns, rare_threshold):
    """Replace infrequent category values with the label ``'Rare'``.

    A value is rare when its relative frequency within the column is strictly
    below ``rare_threshold``. ``df`` is modified in place and returned.

    Raises:
        ValueError: If any listed column is absent from ``df``.
    """
    absent = [name for name in categorical_columns if name not in df.columns]
    if absent:
        raise ValueError("One or more categorical columns not found in DataFrame")

    for col in categorical_columns:
        shares = df[col].value_counts(normalize=True)
        rare_values = shares[shares < rare_threshold].index

        def _relabel(value, rare=rare_values):
            return 'Rare' if value in rare else value

        df[col] = df[col].apply(_relabel)

    return df

def create_aggregate_window_features(df, id_col, date_col, feature_cols):
    """Add 3-row trailing and forward sums per id for each feature column.

    Rows are sorted by ``id_col`` and ``date_col``. ``<col>_trailing_sum`` is
    the rolling sum over the current and up to two previous rows of the same
    id. ``<col>_forward_sum`` is that same rolling sum shifted back by two
    rows, so the last two rows of every id are NaN.

    Returns:
        A sorted DataFrame with the new columns.

    Raises:
        ValueError: If a column is missing or ``date_col`` is not datetime.
    """
    required = [id_col, date_col] + feature_cols
    for name in required:
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")
    if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        raise ValueError(f"Column '{date_col}' must be of datetime type")

    ordered = df.sort_values(by=[id_col, date_col])

    def _sum_of_three(series):
        return series.rolling(window=3, min_periods=1).sum()

    def _sum_of_three_ahead(series):
        return _sum_of_three(series).shift(-2)

    for col in feature_cols:
        ordered[f'{col}_trailing_sum'] = ordered.groupby(id_col)[col].transform(_sum_of_three)
        ordered[f'{col}_forward_sum'] = ordered.groupby(id_col)[col].transform(_sum_of_three_ahead)

    return ordered

def weighted_encode(df, target_col, encoded_cols):
    """Re-encode the column most correlated with the target.

    The column in ``encoded_cols`` with the largest absolute correlation to
    ``target_col`` is selected. Each of its values is replaced by that
    value's relative frequency multiplied by the correlation. ``df`` is
    modified in place and returned.
    """
    corr_by_col = {name: df[name].corr(df[target_col]) for name in encoded_cols}

    def _strength(name):
        return abs(corr_by_col[name])

    strongest = max(corr_by_col, key=_strength)
    weight = corr_by_col[strongest]

    # Relative frequency of each value, scaled by the signed correlation.
    shares = df[strongest].value_counts(normalize=True)
    replacement = {}
    for value, share in shares.items():
        replacement[value] = share * weight

    df[strongest] = df[strongest].map(replacement)

    return df
```

File: log_processor.py:

```Python
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.linear_model import LinearRegression
import re

def cramers_v(x, y):
    """Compute Cramér's V between two categorical sequences.

    The chi-square statistic of the ``x``/``y`` contingency table is divided
    by the sample size and by ``min(rows - 1, cols - 1)``; the square root of
    that ratio is returned.

    NOTE(review): when either input has a single distinct value the divisor
    is zero.
    """
    table = pd.crosstab(x, y)
    chi2_stat = chi2_contingency(table)[0]
    sample_size = table.sum().sum()

    n_rows, n_cols = table.shape
    smaller_dim = min((n_cols - 1), (n_rows - 1))

    phi_squared = chi2_stat / sample_size
    return np.sqrt(phi_squared / smaller_dim)

def flag_high_rolling_sums(df, col, threshold):
    """Flag rows whose rolling sum over the above-threshold rows exceeds ``threshold``.

    Only rows with ``df[col] > threshold`` take part. The rolling window
    equals the number of such rows, so only the last qualifying row receives a
    sum (the total of all qualifying values); every other row is NaN.

    Args:
        df: Input DataFrame; modified in place.
        col: Numeric column to examine.
        threshold: Cut-off used both to select rows and to set the flag.

    Returns:
        The same ``df`` with ``rolling_sum`` and ``rolling_sum_flag`` columns.
    """
    rows = df[df[col] > threshold]
    window_size = len(rows)

    # Keep the original row labels on the rolling result so that assignment
    # aligns each sum with the row it was computed for. Resetting the index
    # first would re-align on positions and attach sums to the wrong rows.
    df['rolling_sum'] = rows[col].rolling(window=window_size).sum()
    df['rolling_sum_flag'] = df['rolling_sum'] > threshold

    return df

def compute_rolling_window_avg(df, id_col, timestamp_col, target_col, window_size):
    """Add a per-id rolling mean of ``target_col`` ordered by timestamp.

    Works on a copy of ``df``: the timestamp column is parsed with
    ``pd.to_datetime``, rows are sorted by id and timestamp, and the rolling
    mean within each id is stored in ``rolling_avg_<target_col>``.

    Raises:
        ValueError: If any of the three columns is missing.
    """
    required = (id_col, timestamp_col, target_col)
    if not all(name in df.columns for name in required):
        raise ValueError(f"Columns {id_col}, {timestamp_col}, and {target_col} must be present in DataFrame")

    result = df.copy()
    result[timestamp_col] = pd.to_datetime(result[timestamp_col])
    result.sort_values(by=[id_col, timestamp_col], inplace=True)

    per_id_mean = result.groupby(id_col)[target_col].rolling(window=window_size).mean()
    # Drop the id level so the values align with the row index again.
    result[f"rolling_avg_{target_col}"] = per_id_mean.reset_index(level=0, drop=True)

    return result

def summarize_by_intervals(df, datetime_column, n_splits):
    """Sum numeric columns within quantile-based time intervals.

    The datetime column of the passed ``df`` is converted with
    ``pd.to_datetime`` in place. A sorted copy is then cut into ``n_splits``
    quantile intervals (duplicate edges dropped).

    Returns:
        DataFrame indexed by interval, holding the per-interval sums of the
        numeric columns plus ``Interval_Size`` (row count per interval).
    """
    df[datetime_column] = pd.to_datetime(df[datetime_column])
    ordered = df.sort_values(by=datetime_column)

    ordered['interval'] = pd.qcut(ordered[datetime_column], q=n_splits, duplicates='drop')

    sums_per_interval = ordered.groupby('interval').sum(numeric_only=True)
    rows_per_interval = ordered.groupby('interval').size()

    interval_df = pd.DataFrame(sums_per_interval)
    interval_df['Interval_Size'] = rows_per_interval.values

    return interval_df

def normalize_and_threshold_groups(df, category_col, value_col, threshold):
    """Z-score values within each group and summarise groups above a threshold.

    ``normalized_value`` (group-wise ``(x - mean) / std``) is written into
    ``df`` itself. Groups whose mean normalised value exceeds ``threshold``
    are then described with ``DataFrame.describe``.

    NOTE(review): the mean of a group's z-scores is zero up to rounding, so
    the selection effectively depends on the sign of ``threshold``; confirm
    the intended statistic.

    Returns:
        Tuple ``(df, summary_stats)``.
    """
    def _zscore(values):
        return (values - values.mean()) / values.std()

    df['normalized_value'] = df.groupby(category_col)[value_col].transform(_zscore)

    per_group_mean = df.groupby(category_col)['normalized_value'].mean()
    group_means = per_group_mean.reset_index(name='mean_normalized_value')

    is_high = group_means['mean_normalized_value'] > threshold
    summary_stats = group_means[is_high].describe()

    return df, summary_stats

def linear_regression_prediction(df, col1, col2):
    """Fit ``col2 ~ col1`` with ordinary least squares and store the predictions.

    ``predicted_<col2>`` is written into ``df`` itself.

    Returns:
        Tuple ``(df, correlation)`` where ``correlation`` is the pairwise
        correlation of the two columns.

    Raises:
        ValueError: If a column is missing or not numeric.
    """
    for name in (col1, col2):
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")
    for name in (col1, col2):
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Column '{name}' must be numeric")

    correlation = df[[col1, col2]].corr().iloc[0, 1]

    # Single feature as a column vector, as expected by the estimator.
    features = df[[col1]].values.reshape(-1, 1)
    response = df[col2].values

    model = LinearRegression()
    model.fit(features, response)
    df[f'predicted_{col2}'] = model.predict(features)

    return df, correlation

def bin_numerical_columns(df, numerical_cols, bin_edges):
    """Bin numeric columns using caller-supplied edges.

    For each column ``c`` the edges ``bin_edges[c]`` (a list) are applied with
    ``pd.cut``. The result, labelled ``bin_0``, ``bin_1``, ..., is stored in
    ``binned_<c>`` on ``df`` itself.

    Raises:
        ValueError: If a column is missing or not numeric, or if it has no
            list of edges in ``bin_edges``.
    """
    # All columns are validated before anything is written to df.
    for name in numerical_cols:
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Column '{name}' must be numeric")
        has_edge_list = name in bin_edges and isinstance(bin_edges[name], list)
        if not has_edge_list:
            raise ValueError(f"Invalid bin edges for column '{name}'")

    for name in numerical_cols:
        edges = bin_edges[name]
        bin_names = [f'bin_{position}' for position in range(len(edges) - 1)]
        df[f'binned_{name}'] = pd.cut(df[name], bins=edges, labels=bin_names)

    return df

def apply_operations_from_log(df, operation_log):
    """Replay ``fillna``/``dropna`` operations described in a log string.

    The log is a ``;``-separated list of ``action:params`` entries:

    * ``fillna:<col>,<mean|median>`` fills missing values of ``<col>``.
    * ``dropna:<col>`` or ``dropna:<col>,<axis>`` drops rows (or the given
      axis) with missing values in ``<col>``.

    Args:
        df: DataFrame to modify in place.
        operation_log: The log string; blank entries are skipped.

    Returns:
        The same ``df`` object.

    Raises:
        ValueError: For an unknown action or fill method.
    """
    for op in operation_log.split(';'):
        if not op.strip():
            # Tolerate a trailing ';' or an empty log.
            continue

        action, params = op.split(':')

        if action == 'fillna':
            col, method = params.split(',')
            if method == 'mean':
                fill_value = df[col].mean()
            elif method == 'median':
                fill_value = df[col].median()
            else:
                raise ValueError(f"Unknown fillna method: {method}")
            # Assign the result back. A chained
            # `df[col].fillna(..., inplace=True)` only updates a temporary
            # under copy-on-write.
            df[col] = df[col].fillna(fill_value)
        elif action == 'dropna':
            if ',' in params:
                col, axis = params.split(',')
                df.dropna(subset=[col], axis=int(axis), inplace=True)
            else:
                df.dropna(subset=[params], inplace=True)
        else:
            raise ValueError(f"Unknown operation: {action}")

    return df

def grouped_cumulative_features(df, group_by_column, target_column, k=3):
    """Add cumulative and expanding statistics per group plus a top-k marker.

    Rows are sorted by group and target value, and the index is reset. Added
    columns: ``<target>_cumsum``, ``<target>_expanding_mean``,
    ``<target>_expanding_std`` and ``<target>_top_k`` (1 for the ``k`` rows
    with the largest cumulative sum within each group, else 0).

    Returns:
        The sorted, re-indexed DataFrame with the new columns.
    """
    df = df.sort_values(by=[group_by_column, target_column]).reset_index(drop=True)

    cumsum_name = f"{target_column}_cumsum"
    top_k_name = f"{target_column}_top_k"
    values_by_group = df.groupby(group_by_column)[target_column]

    df[cumsum_name] = values_by_group.cumsum()

    # expanding() prepends the group key as an index level; drop it so the
    # values align with the row index.
    expanding_mean = values_by_group.expanding().mean()
    df[f"{target_column}_expanding_mean"] = expanding_mean.reset_index(level=0, drop=True)
    expanding_std = values_by_group.expanding().std()
    df[f"{target_column}_expanding_std"] = expanding_std.reset_index(level=0, drop=True)

    largest = df.groupby(group_by_column)[cumsum_name].nlargest(k)
    top_rows = largest.index.get_level_values(1)
    df[top_k_name] = 0
    df.loc[top_rows, top_k_name] = 1

    return df

def calculate_ewma_ewmsd(df, window_size, numeric_cols):
    """Add exponentially weighted mean and std columns for numeric columns.

    Uses ``span=window_size`` with ``adjust=False``.

    Returns:
        A copy of ``df`` with ``<col>_ewma`` and ``<col>_ewmsd`` per column.

    Raises:
        ValueError: If a column is missing or not numeric.
    """
    for name in numeric_cols:
        is_usable = name in df.columns and pd.api.types.is_numeric_dtype(df[name])
        if not is_usable:
            raise ValueError(f"Column '{name}' not found or is not numeric in DataFrame")

    result = df.copy()

    for name in numeric_cols:
        weighted = result[name].ewm(span=window_size, adjust=False)
        result[f"{name}_ewma"] = weighted.mean()
        result[f"{name}_ewmsd"] = result[name].ewm(span=window_size, adjust=False).std()

    return result

def compute_moving_averages(df, col_name, window_size):
    """Add simple and exponentially weighted moving averages of one column.

    ``rolling_mean`` and ``ewma`` (``span=window_size``) are written into
    ``df`` itself, which is also returned.

    Raises:
        ValueError: If the column is missing or not numeric.
    """
    is_usable = col_name in df.columns and pd.api.types.is_numeric_dtype(df[col_name])
    if not is_usable:
        raise ValueError(f"Column {col_name} is not numeric or not in dataframe")

    source = df[col_name]
    df['rolling_mean'] = source.rolling(window=window_size).mean()
    df['ewma'] = source.ewm(span=window_size).mean()

    return df
```

File: composite_feature_scores.py:

```Python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def compute_composite_score(df, features, diff_window):
    """Sum standardised lagged differences of several features into one score.

    For each feature the difference over ``diff_window`` periods is stored in
    ``<feature>_diff_<diff_window>``. The differences are standardised
    column-wise and summed per row into ``composite_feature_score``. ``df``
    is modified in place and returned.

    Raises:
        ValueError: If any feature column is missing.
    """
    missing_columns = [name for name in features if name not in df.columns]
    if missing_columns:
        raise ValueError(f"The following columns are not in the DataFrame: {missing_columns}")

    diff_names = [f"{feature}_diff_{diff_window}" for feature in features]
    for feature, diff_name in zip(features, diff_names):
        df[diff_name] = df[feature].diff(periods=diff_window)

    diffs = df[diff_names]
    standardized = (diffs - diffs.mean()) / diffs.std()

    df['composite_feature_score'] = standardized.sum(axis=1)

    return df

def compute_column_ranges(df, range_cols):
    """Store the average (max - min) range of several numeric columns.

    The same average is written to every row of ``average_range`` in ``df``
    itself, which is also returned.

    Raises:
        ValueError: If a column is missing or not numeric.
    """
    for name in range_cols:
        is_usable = name in df.columns and pd.api.types.is_numeric_dtype(df[name])
        if not is_usable:
            raise ValueError(f"Column {name} is not present or not numerical")

    spans = {name: df[name].max() - df[name].min() for name in range_cols}

    df['average_range'] = sum(spans.values()) / len(spans)

    return df

def resample_and_count_events(df, time_col, event_id_col, count_col):
    """Count rows per event id and calendar day.

    Args:
        df: Input DataFrame (left unmodified).
        time_col: Name of the datetime column to resample on.
        event_id_col: Name of the column identifying the event.
        count_col: Name of the output column holding the daily counts.

    Returns:
        DataFrame with one row per ``(event id, day)`` and the count in
        ``count_col``.

    Raises:
        ValueError: If ``time_col`` is missing or not datetime, or if
            ``event_id_col`` is missing.
    """
    if time_col not in df.columns or not pd.api.types.is_datetime64_any_dtype(df[time_col]):
        raise ValueError(f"Column '{time_col}' must be present and of datetime type in DataFrame")
    if event_id_col not in df.columns:
        raise ValueError(f"Column '{event_id_col}' must be present in DataFrame")

    # Index a derived frame instead of using inplace=True, so the caller's
    # DataFrame keeps its time column and index.
    indexed = df.set_index(time_col)
    daily_counts = indexed.groupby(event_id_col).resample('D').size()

    return daily_counts.reset_index(name=count_col)

def filter_with_conditions(df, condition_list):
    """Filter rows with a sequence of ``column:op:value`` conditions.

    Supported ops: ``eq``/``neq`` compare against ``value`` as a string;
    ``gt``/``lt``/``ge``/``le`` compare against ``float(value)``. Conditions
    are applied cumulatively, in order.

    Args:
        df: Input DataFrame (not modified).
        condition_list: Iterable of condition strings.

    Returns:
        A filtered copy of ``df``.

    Raises:
        ValueError: If a column is missing, the op is unknown, the condition
            has fewer than three parts, or a numeric op gets a non-numeric
            value.
    """
    df = df.copy()

    for condition in condition_list:
        # Split on the first two ':' only, so the value itself may contain
        # ':' (for example a time of day).
        col, op, value = condition.split(':', 2)

        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")

        if op == 'eq':
            keep = df[col] == value
        elif op == 'neq':
            keep = df[col] != value
        elif op == 'gt':
            keep = df[col] > float(value)
        elif op == 'lt':
            keep = df[col] < float(value)
        elif op == 'ge':
            keep = df[col] >= float(value)
        elif op == 'le':
            keep = df[col] <= float(value)
        else:
            raise ValueError(f"Unknown operation: {op}")

        df = df[keep]

    return df

def impute_with_rolling_mean(df, fill_columns, rolling_window, min_periods):
    """Fill gaps with the rolling mean and expose that mean and its change.

    Missing values in ``fill_columns`` are replaced by the rolling mean at the
    same position. For each column ``c`` the columns ``c_rolling_mean`` and
    ``c_rate_of_change`` (first difference of the rolling mean) are added.
    ``df`` is modified in place and returned.
    """
    window_means = df[fill_columns].rolling(window=rolling_window, min_periods=min_periods).mean()
    df[fill_columns] = df[fill_columns].fillna(window_means)

    for name in fill_columns:
        column_means = window_means[name]
        df[f'{name}_rolling_mean'] = column_means
        df[f'{name}_rate_of_change'] = column_means.diff()

    return df

def vectorize_text_columns(df, text_columns, tfidf_max_features, ngrams):
    """Append TF-IDF features for each text column.

    Args:
        df: Input DataFrame.
        text_columns: Names of string columns to vectorise.
        tfidf_max_features: ``max_features`` for ``TfidfVectorizer``.
        ngrams: ``ngram_range`` tuple for ``TfidfVectorizer``.

    Returns:
        A new DataFrame with ``<col>_tfidf_<i>`` columns appended for every
        text column.

    Raises:
        ValueError: If a column is missing or not of string type.
    """
    for col in text_columns:
        if col not in df.columns or not pd.api.types.is_string_dtype(df[col]):
            raise ValueError(f"Column {col} is not present or not of string type")

    for col in text_columns:
        vectorizer = TfidfVectorizer(max_features=tfidf_max_features, ngram_range=ngrams)
        tfidf_matrix = vectorizer.fit_transform(df[col].values.astype('U'))

        # Carry df's index so concat matches rows correctly even when df does
        # not have a default 0..n-1 index.
        tfidf_df = pd.DataFrame(
            tfidf_matrix.toarray(),
            columns=[f'{col}_tfidf_{i}' for i in range(tfidf_matrix.shape[1])],
            index=df.index,
        )
        df = pd.concat([df, tfidf_df], axis=1)

    return df

def add_cluster_labels(df, num_clusters, clustering_columns):
    """Cluster rows with K-Means and store the labels in ``cluster_label``.

    ``df`` is modified in place and returned. No ``random_state`` is passed
    to ``KMeans``.

    Raises:
        ValueError: If ``num_clusters`` is not a positive integer.
    """
    is_positive_int = isinstance(num_clusters, int) and num_clusters > 0
    if not is_positive_int:
        raise ValueError("'num_clusters' must be a positive integer")

    model = KMeans(n_clusters=num_clusters)
    df['cluster_label'] = model.fit_predict(df[clustering_columns])

    return df

def replace_with_frequency(df, cat_cols):
    """Add ``<col>_freq`` columns holding each value's occurrence count.

    Returns:
        A copy of ``df``; the original columns are kept unchanged.
    """
    result = df.copy()

    for name in cat_cols:
        counts_by_value = result[name].value_counts().to_dict()
        result[name + '_freq'] = result[name].map(counts_by_value)

    return result

def compute_cov_and_corr(df, col1, col2):
    """Return the covariance and correlation of two numeric columns.

    Returns:
        Tuple ``(covariance, correlation)``.

    Raises:
        ValueError: If a column is missing or not numeric.
    """
    both_present = col1 in df.columns and col2 in df.columns
    if not both_present:
        raise ValueError("One or both columns do not exist in the DataFrame")

    first, second = df[col1], df[col2]
    both_numeric = pd.api.types.is_numeric_dtype(first) and pd.api.types.is_numeric_dtype(second)
    if not both_numeric:
        raise ValueError("Both columns must be of numeric type")

    return first.cov(second), first.corr(second)

def encode_target_column(df, target_col):
    """Integer-encode a string column by descending frequency.

    The most frequent value becomes 0, the next 1, and so on, following the
    order of ``value_counts()``. The codes go to ``<target_col>_encoded`` in a
    copy of ``df``.

    Raises:
        ValueError: If the column is missing or not a string column.
        RuntimeError: If any row could not be encoded (e.g. missing values).
    """
    is_usable = target_col in df.columns and pd.api.types.is_string_dtype(df[target_col])
    if not is_usable:
        raise ValueError(f"Column '{target_col}' not found or is not a string column in DataFrame")

    ordered_values = df[target_col].value_counts().index
    code_by_value = dict(zip(ordered_values, range(len(ordered_values))))

    encoded_name = f'{target_col}_encoded'
    result = df.copy()
    result[encoded_name] = result[target_col].map(code_by_value)

    if result[encoded_name].isna().any():
        raise RuntimeError(f"Failed to encode '{target_col}' properly; NaN values found in encoded column")

    return result

def calculate_yoy_growth(df, date_col, target_col):
    """Compute monthly means of a column and their year-over-year growth.

    Args:
        df: Input DataFrame (left unmodified).
        date_col: Name of the datetime column.
        target_col: Name of the numeric column to average.

    Returns:
        DataFrame with the month-end date, ``monthly_mean`` and ``yoy_growth``
        (fractional change against the value 12 months earlier).

    Raises:
        ValueError: If ``date_col`` is missing or not datetime, or
            ``target_col`` is missing or not numeric.
    """
    if date_col not in df.columns or not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        raise ValueError(f"Column {date_col} is not datetime or not in dataframe")

    if target_col not in df.columns or not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column {target_col} is not numeric or not in dataframe")

    # Work on a derived series. Setting the index in place and later resetting
    # it with drop=True would delete the caller's date column.
    values_by_date = df.set_index(date_col)[target_col]

    try:
        monthly_mean = values_by_date.resample('ME').mean()
    except ValueError:
        # Older pandas releases only know the month-end alias as 'M'.
        monthly_mean = values_by_date.resample('M').mean()

    yoy_growth = monthly_mean.pct_change(periods=12)

    return pd.DataFrame({
        'monthly_mean': monthly_mean,
        'yoy_growth': yoy_growth
    }).reset_index()
```

File: interaction_features.py:

```Python
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer

def filter_and_clean_cats(df, col_name, threshold, min_pct):
    """Keep rows above a numeric threshold, then drop rows with rare categories.

    Rows with ``df[col_name] > threshold`` are retained. For each object or
    category column, rows whose value has a relative frequency below
    ``min_pct`` in the current, progressively filtered data are removed.

    Returns:
        The filtered DataFrame.
    """
    kept = df[df[col_name] > threshold]

    categorical_names = kept.select_dtypes(include=['object', 'category']).columns

    for name in categorical_names:
        shares = kept[name].value_counts(normalize=True)
        rare_values = shares[shares < min_pct].index
        is_rare = kept[name].isin(rare_values)
        kept = kept[~is_rare]

    return kept

def bin_features_and_replace_labels(df, feature_col, label_col):
    """Quartile-bin a feature and replace the label with its per-bin mean.

    Works on a copy of ``df``: ``<feature_col>_bin`` receives the quartile
    index (0-3) and ``label_col`` is overwritten with the mean label of the
    row's bin.

    Returns:
        Tuple ``(df_copy, bin_means)`` where ``bin_means`` maps bin index to
        mean label.

    Raises:
        ValueError: If either column is missing.
    """
    both_present = feature_col in df.columns and label_col in df.columns
    if not both_present:
        raise ValueError(f"Columns '{feature_col}' and/or '{label_col}' not found in DataFrame")

    result = df.copy()
    bin_name = f"{feature_col}_bin"

    result[bin_name] = pd.qcut(result[feature_col], 4, labels=False)

    bin_means = result.groupby(bin_name)[label_col].mean().to_dict()
    result[label_col] = result[bin_name].map(bin_means)

    return result, bin_means

def rolling_mean_difference(df, time_col, value_col, rolling_window):
    """Flag values that deviate strongly from their rolling mean.

    After sorting by ``time_col`` three columns are added: ``rolling_mean``,
    ``mean_diff`` (value minus rolling mean) and ``exceeds_threshold`` (1 when
    ``|mean_diff|`` is greater than twice the overall standard deviation of
    ``value_col``, else 0).

    Returns:
        The sorted DataFrame with the new columns.
    """
    ordered = df.sort_values(by=[time_col])

    ordered['rolling_mean'] = ordered[value_col].rolling(rolling_window).mean()
    ordered['mean_diff'] = ordered[value_col] - ordered['rolling_mean']

    cutoff = 2 * ordered[value_col].std()
    is_outlier = ordered['mean_diff'].abs() > cutoff
    ordered['exceeds_threshold'] = is_outlier.astype(int)

    return ordered

def create_interaction_terms(df, col_subset):
    """Add standardised pairwise products of the given columns.

    For each pair ``(a, b)`` from ``col_subset`` the product is stored in
    ``a_b_interaction`` on a copy of ``df``. Every column whose name contains
    ``_interaction``, including any that already existed, is then rescaled
    with ``StandardScaler``.

    Returns:
        The augmented copy of ``df``.
    """
    result = df.copy()

    for left, right in combinations(col_subset, 2):
        result[f"{left}_{right}_interaction"] = df[left] * df[right]

    interaction_names = [name for name in result.columns if '_interaction' in name]
    scaled_values = StandardScaler().fit_transform(result[interaction_names])
    result[interaction_names] = scaled_values

    return result

def bin_and_analyze(df, target_col, n_bins, bin_col):
    """Bin a numeric column and attach per-bin mean and variance.

    ``bin_col`` (equal-width bin index) is written into the passed ``df``.
    The returned frame is a left merge of ``df`` with the per-bin statistics
    ``<target_col>_mean`` and ``<target_col>_variance``.

    Raises:
        ValueError: If ``target_col`` is missing or not numeric.
    """
    if target_col not in df.columns:
        raise ValueError(f"Column '{target_col}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column '{target_col}' must be numeric")

    # retbins=True also returns the bin edges; they are not used afterwards.
    bin_index, _edges = pd.cut(df[target_col], bins=n_bins, labels=False, retbins=True)
    df[bin_col] = bin_index

    per_bin = df.groupby(bin_col)[target_col].agg(['mean', 'var']).reset_index()
    per_bin.columns = [bin_col, f'{target_col}_mean', f'{target_col}_variance']

    return df.merge(per_bin, on=bin_col, how='left')

def expand_categorical(df, categorical_cols, prefix_sep):
    """One-hot encode categorical columns with a custom prefix separator.

    Args:
        df: Input DataFrame.
        categorical_cols: Names of categorical-dtype columns to expand.
        prefix_sep: Separator between column name and category in the new
            column names.

    Returns:
        New DataFrame with the listed columns replaced by dummy columns.

    Raises:
        ValueError: If a column is missing or not of categorical dtype.
    """
    for col in categorical_cols:
        # isinstance on the dtype replaces the deprecated
        # pd.api.types.is_categorical_dtype helper.
        if col not in df.columns or not isinstance(df[col].dtype, pd.CategoricalDtype):
            raise ValueError(f"Column '{col}' either not found or not categorical")

    return pd.get_dummies(df, columns=categorical_cols, prefix_sep=prefix_sep)

def discretize_target_column(df, target_col, bins, labels):
    """Cut a numeric column into labelled bins.

    Returns:
        A copy of ``df`` with ``<target_col>_discretized`` added.

    Raises:
        ValueError: If the column is missing or not numeric, or if ``bins``
            does not have exactly one more entry than ``labels``.
    """
    if target_col not in df.columns:
        raise ValueError(f"Column '{target_col}' not found in DataFrame")
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        raise ValueError(f"Column '{target_col}' must be numeric")
    if len(bins) != len(labels) + 1:
        raise ValueError("Length of bins must be one more than the length of labels")

    result = df.copy()
    discretized = pd.cut(result[target_col], bins=bins, labels=labels)
    result[target_col + '_discretized'] = discretized

    return result

def flag_time_gaps(df, datetime_col, interval):
    """Flag rows whose gap to the previous row exceeds ``interval``.

    ``datetime_col`` is converted with ``pd.to_datetime``. ``time_diff``
    (seconds since the previous row) and ``gap_flag`` (True when that
    difference is greater than ``interval``) are written into ``df`` itself.
    Rows are compared in their existing order; no sorting is performed.

    Raises:
        ValueError: If ``datetime_col`` is missing.
    """
    if datetime_col not in df.columns:
        raise ValueError(f"Column {datetime_col} is not present in DataFrame")

    df[datetime_col] = pd.to_datetime(df[datetime_col])

    elapsed_seconds = df[datetime_col].diff().dt.total_seconds()
    df['time_diff'] = elapsed_seconds

    df['gap_flag'] = df['time_diff'] > pd.Timedelta(interval).total_seconds()

    return df

def tfidf_with_trends(df, text_columns, trends_history):
    """Append trend-weighted TF-IDF features for the given text columns.

    Args:
        df: Input DataFrame (not modified).
        text_columns: Candidate text columns; names absent from ``df`` are
            skipped silently.
        trends_history: Weights looked up by term through its ``index``. Terms
            found there have their TF-IDF scores multiplied by the weight.

    Returns:
        A copy of ``df`` with ``<col>_tfidf_<term>`` columns appended.
    """
    df = df.copy()

    for col in text_columns:
        if col in df.columns:
            tfidf = TfidfVectorizer()
            tfidf_matrix = tfidf.fit_transform(df[col].fillna(''))
            # Carry df's index so concat lines rows up correctly even when df
            # does not have a default 0..n-1 index.
            tfidf_df = pd.DataFrame(
                tfidf_matrix.toarray(),
                columns=tfidf.get_feature_names_out(),
                index=df.index,
            )

            # Multiply TF-IDF scores by trends history weights
            for term in tfidf_df.columns:
                if term in trends_history.index:
                    tfidf_df[term] *= trends_history[term]

            tfidf_df = tfidf_df.add_prefix(f'{col}_tfidf_')
            df = pd.concat([df, tfidf_df], axis=1)

    return df

def compute_interactions(df, interaction_columns, method):
    """Add one pairwise interaction column per unordered pair of columns.

    For every pair ``(col1, col2)`` taken in list order, a column named
    ``<col1>_<col2>_sum`` or ``<col1>_<col2>_multiply`` is written into ``df``
    (the frame is modified in place).

    Args:
        df: DataFrame holding the columns to combine.
        interaction_columns: Sequence of numeric column names.
        method: ``'sum'`` or ``'multiply'``.

    Returns:
        The same DataFrame object with the interaction columns added.

    Raises:
        ValueError: If a listed column is missing or non-numeric, or if
            ``method`` is not one of the two supported values.
    """
    for col in interaction_columns:
        usable = col in df.columns and pd.api.types.is_numeric_dtype(df[col])
        if not usable:
            raise ValueError(f"Column '{col}' not found or is not numeric in DataFrame")

    if method not in ('sum', 'multiply'):
        raise ValueError(f"Method '{method}' not supported. Use 'sum' or 'multiply'.")

    # Enumerate each unordered pair exactly once, preserving list order.
    count = len(interaction_columns)
    pairs = [
        (interaction_columns[first], interaction_columns[second])
        for first in range(count)
        for second in range(first + 1, count)
    ]

    for col1, col2 in pairs:
        left, right = df[col1], df[col2]
        if method == 'sum':
            df[f'{col1}_{col2}_sum'] = left + right
        else:  # method was validated above, so this is 'multiply'
            df[f'{col1}_{col2}_multiply'] = left * right

    return df

def flag_high_variance(df, target_col, window_size):
    """Flag rows whose rolling mean exceeds twice the column's standard deviation.

    Despite the name, the comparison is between the rolling *mean* of
    ``target_col`` and ``2 * std`` of the whole column. The frame is modified
    in place: ``<target_col>_rolling_mean`` and ``high_variance`` are added.

    Args:
        df: DataFrame containing ``target_col``.
        target_col: Name of a numeric column.
        window_size: Window passed to ``Series.rolling``.

    Returns:
        The same DataFrame object with the two extra columns.

    Raises:
        ValueError: If ``target_col`` is missing or not numeric.
    """
    column_ok = target_col in df.columns and pd.api.types.is_numeric_dtype(df[target_col])
    if not column_ok:
        raise ValueError("Target column is not found or is not numeric")

    rolling_mean_col = f'{target_col}_rolling_mean'
    df[rolling_mean_col] = df[target_col].rolling(window=window_size).mean()

    # Threshold is derived from the full column, not from the window.
    threshold = 2 * df[target_col].std()
    df['high_variance'] = df[rolling_mean_col] > threshold

    return df
```

File: group_analysis.py:

```Python
import pandas as pd
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

def clean_text_data(df, text_col, stop_words):
    """Strip punctuation, tokenize on whitespace and drop stop words.

    ``df[text_col]`` is overwritten in place: each string becomes a list of
    words. Stop-word matching lower-cases each word before the lookup, while
    the surviving words keep their original casing.

    Args:
        df: DataFrame containing ``text_col``.
        text_col: Name of a string-typed column.
        stop_words: Iterable of words to remove.

    Returns:
        The same DataFrame object with ``text_col`` replaced by token lists.

    Raises:
        ValueError: If ``text_col`` is not a column of ``df``.
        TypeError: If ``text_col`` is not of string type.
    """
    # Validate the column before touching any data.
    if text_col not in df.columns:
        raise ValueError(f"Text column '{text_col}' is not found in the DataFrame")
    if not pd.api.types.is_string_dtype(df[text_col]):
        raise TypeError(f"Text column '{text_col}' must be of string type")

    # One translation table shared by every row: deletes ASCII punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def to_tokens(text):
        return text.translate(strip_punctuation).split()

    df[text_col] = df[text_col].apply(to_tokens)

    # Filter the token lists against the stop-word set.
    blocked = set(stop_words)

    def keep_meaningful(tokens):
        return [token for token in tokens if token.lower() not in blocked]

    df[text_col] = df[text_col].apply(keep_meaningful)

    return df

def resample_and_fill(df, time_col, period):
    """Resample to a regular frequency and fill the gaps forward, then backward.

    Args:
        df: DataFrame containing ``time_col``; it is left unmodified.
        time_col: Name of a datetime-typed column used as the resampling axis.
        period: Frequency string passed to ``DataFrame.resample`` (e.g. ``'D'``).

    Returns:
        A new DataFrame with ``time_col`` restored as a regular column, one
        row per period, and missing values forward-filled then back-filled.

    Raises:
        ValueError: If ``time_col`` is missing or not of datetime type.
    """
    if time_col not in df.columns or not pd.api.types.is_datetime64_any_dtype(df[time_col]):
        raise ValueError(f"Column '{time_col}' must be present and of datetime type in the DataFrame")

    # Non-inplace set_index: the in-place form removed time_col from the
    # caller's frame, so a second call on the same frame failed validation.
    indexed = df.set_index(time_col)
    resampled_df = indexed.resample(period).asfreq()

    # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    # Forward fill first; back fill only covers leading gaps.
    resampled_df = resampled_df.ffill().bfill()

    return resampled_df.reset_index()

def k_means_clustering(df, n_clusters):
    """Cluster rows with KMeans on the standardized numeric columns.

    Args:
        df: Input DataFrame; it is not modified.
        n_clusters: Number of clusters passed to ``KMeans``.

    Returns:
        A copy of ``df`` with an added ``cluster`` column holding the label
        assigned to each row.

    Raises:
        ValueError: If ``df`` has no numeric columns.
    """
    numeric_columns = df.select_dtypes(include='number').columns
    if numeric_columns.empty:
        raise ValueError("DataFrame does not contain any numeric columns for clustering")

    clustered = df.copy()

    # Standardize first so that no single column dominates the distances.
    standardized = StandardScaler().fit_transform(clustered[numeric_columns])
    clustered['cluster'] = KMeans(n_clusters=n_clusters).fit_predict(standardized)

    return clustered

def get_top_groups_by_mean(df, value_col, group_cols, max_num_groups):
    """Return the groups with the highest mean of ``value_col``.

    Args:
        df: Input DataFrame.
        value_col: Name of the numeric column to average.
        group_cols: Column names to group by.
        max_num_groups: Maximum number of groups to return.

    Returns:
        A DataFrame with the group columns and the mean of ``value_col``,
        sorted by that mean in descending order and truncated to
        ``max_num_groups`` rows.

    Raises:
        ValueError: If ``value_col`` or any group column is missing.
        TypeError: If ``value_col`` is not numeric.
    """
    # Step 1: Validate the value column (existence, then type)
    if value_col not in df.columns:
        raise ValueError(f"Value column '{value_col}' is not found in the DataFrame")
    # Checked once, up front. Inside the group-column loop it was repeated per
    # group column and skipped entirely when group_cols was empty.
    if not pd.api.types.is_numeric_dtype(df[value_col]):
        raise TypeError(f"Value column '{value_col}' must be of numeric type")

    # Step 2: Validate the group columns
    for col in group_cols:
        if col not in df.columns:
            raise ValueError(f"Group column '{col}' is not found in the DataFrame")

    # Step 3: Mean of the value column per group
    grouped_df = df.groupby(group_cols)[value_col].mean().reset_index()

    # Step 4: Highest means first, keep only the requested number of groups
    return grouped_df.sort_values(by=value_col, ascending=False).head(max_num_groups)

def resample_and_interpolate(df, time_col, new_freq):
    """Resample to ``new_freq`` and linearly interpolate the new rows.

    Args:
        df: DataFrame containing ``time_col``; it is left unmodified.
        time_col: Name of the timestamp column (converted with
            ``pd.to_datetime``).
        new_freq: Frequency string passed to ``DataFrame.resample``.

    Returns:
        A new DataFrame with ``time_col`` as a regular column, interpolated
        values, and a boolean ``was_missing`` column that is True for
        timestamps that were not present in the original data.
    """
    # Work on a copy: the in-place conversion plus set_index(inplace=True)
    # rewrote the caller's time column and then removed it from the columns.
    indexed = df.copy()
    indexed[time_col] = pd.to_datetime(indexed[time_col])
    indexed = indexed.set_index(time_col)

    resampled_df = indexed.resample(new_freq).interpolate(method='linear')

    # A timestamp is "missing" when resampling introduced it.
    resampled_df['was_missing'] = ~resampled_df.index.isin(indexed.index)

    return resampled_df.reset_index()

def validate_foreign_keys(df, primary_key, fk_dictionary):
    """Drop rows whose foreign-key values are absent from their reference table.

    Each reference table is looked up by ``primary_key``, i.e. the allowed
    values for a foreign-key column are ``fk_table[primary_key]``. A message
    is printed for every foreign-key column that has at least one violation
    in the unfiltered frame. The input frame is never mutated; filtering
    produces new objects.

    Args:
        df: DataFrame holding the primary-key and foreign-key columns.
        primary_key: Column name that must exist in ``df`` and that indexes
            each reference table.
        fk_dictionary: Mapping of foreign-key column name to reference table.

    Returns:
        ``df`` restricted to rows that satisfy every foreign-key constraint.

    Raises:
        ValueError: If ``primary_key`` or any foreign-key column is missing
            from ``df``.
    """
    constraints = list(fk_dictionary.items())

    # Existence checks for the primary key and every foreign-key column.
    if primary_key not in df.columns:
        raise ValueError(f"Primary key column {primary_key} does not exist in DataFrame")
    for fk_col, _ in constraints:
        if fk_col not in df.columns:
            raise ValueError(f"Foreign key column {fk_col} does not exist in DataFrame")

    # Report violations against the full frame before anything is removed.
    for fk_col, fk_table in constraints:
        has_violation = (~df[fk_col].isin(fk_table[primary_key])).any()
        if has_violation:
            print(f"Foreign key constraint violated on column {fk_col}")

    # Apply the constraints one after another, narrowing the frame each time.
    for fk_col, fk_table in constraints:
        df = df[df[fk_col].isin(fk_table[primary_key])]

    return df

def clean_and_reduce_features(df, feature_cols, target_col):
    """Clip outliers, drop weakly correlated features and scale the rest.

    The frame is modified in place (clipping, column drops, scaling) and is
    also returned.

    Args:
        df: DataFrame holding the feature and target columns.
        feature_cols: Names of the feature columns to process.
        target_col: Name of the column the features are correlated against.

    Returns:
        A tuple ``(df, dropped_features)`` where ``dropped_features`` lists
        the feature columns removed because the absolute value of their
        correlation with ``target_col`` was below 0.1.
    """
    dropped_features = []

    # Step 1: Remove outliers by bounding to 1st and 99th percentile
    for col in feature_cols:
        lower_bound = df[col].quantile(0.01)
        upper_bound = df[col].quantile(0.99)
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

    # Step 2: Compute correlation and drop features with low correlation.
    # A NaN correlation (e.g. a constant column) compares False and is kept.
    for col in feature_cols:
        correlation = df[col].corr(df[target_col])
        if abs(correlation) < 0.1:
            dropped_features.append(col)
            df.drop(columns=[col], inplace=True)

    # Step 3: Feature scaling to [0, 1] range.
    # Skipped when every feature was dropped: fitting the scaler on a frame
    # with zero columns raises instead of returning the reduced frame.
    remaining_cols = [col for col in feature_cols if col not in dropped_features]
    if remaining_cols:
        scaler = MinMaxScaler()
        df[remaining_cols] = scaler.fit_transform(df[remaining_cols])

    return df, dropped_features

def inner_join_dataframes(df1, df2, key):
    """Inner-join two frames on ``key`` and fill remaining gaps.

    A notice is printed when either input contains key values that did not
    survive the join. Missing values in the merged result are forward-filled
    and then back-filled.

    Args:
        df1: Left DataFrame.
        df2: Right DataFrame.
        key: Column name present in both frames.

    Returns:
        The merged DataFrame with missing values filled.

    Raises:
        ValueError: If ``key`` is absent from either frame.
    """
    if key not in df1.columns or key not in df2.columns:
        raise ValueError(f"Column {key} is not in both dataframes")

    merged_df = pd.merge(df1, df2, on=key, how='inner')

    # Any key value of an input that is absent from the merged result was
    # dropped by the inner join.
    matched_keys = merged_df[key]
    unmatched_in_df1 = (~df1[key].isin(matched_keys)).any()
    unmatched_in_df2 = (~df2[key].isin(matched_keys)).any()

    if unmatched_in_df1 or unmatched_in_df2:
        print("Missing data detected in the key column. Handling missing data...")

    # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    return merged_df.ffill().bfill()

def remove_duplicates_and_count(df, target_col):
    """Drop duplicate rows on ``target_col`` and record how often each value occurred.

    Args:
        df: Input DataFrame; it is left unmodified.
        target_col: Column whose repeated values define duplicates.

    Returns:
        A new DataFrame keeping the first occurrence of every ``target_col``
        value, with a ``duplicate_count`` column: the number of occurrences
        for values that appeared more than once, 0 for unique values.
    """
    # Work on a copy so the helper column is not written into the caller's
    # frame while a different object is returned.
    df = df.copy()

    # Identify every row that shares its value with at least one other row
    duplicates = df.duplicated(subset=target_col, keep=False)
    df['duplicate_count'] = 0

    # Count occurrences among the duplicated values and attach them
    duplicated_values = df.loc[duplicates, target_col]
    df.loc[duplicates, 'duplicate_count'] = duplicated_values.map(duplicated_values.value_counts())

    # Drop duplicate rows, keeping the first occurrence
    return df.drop_duplicates(subset=target_col, keep='first')

def temporal_averages(df, date_col, target_col):
    """Add calendar features and the mean of ``target_col`` per calendar bucket.

    Works on a copy of ``df``. First the columns ``year``, ``month``,
    ``week`` (ISO week), ``day_of_week`` and ``quarter`` are derived from
    ``date_col``; then, for each of them, a column
    ``<target_col>_avg_by_<feature>`` holds the group mean of ``target_col``
    broadcast back to every row.

    Args:
        df: Input DataFrame; it is not modified.
        date_col: Name of the date column (converted with ``pd.to_datetime``).
        target_col: Name of the column to average.

    Returns:
        A new DataFrame with the calendar and average columns appended.
    """
    enriched = df.copy()
    enriched[date_col] = pd.to_datetime(enriched[date_col])

    # Calendar feature name -> how to derive it from the datetime column.
    extractors = (
        ('year', lambda stamps: stamps.dt.year),
        ('month', lambda stamps: stamps.dt.month),
        ('week', lambda stamps: stamps.dt.isocalendar().week),
        ('day_of_week', lambda stamps: stamps.dt.dayofweek),
        ('quarter', lambda stamps: stamps.dt.quarter),
    )
    for feature_name, extract in extractors:
        enriched[feature_name] = extract(enriched[date_col])

    # transform('mean') keeps the original row count, one mean per row.
    for time_feature, _ in extractors:
        per_bucket_mean = enriched.groupby(time_feature)[target_col].transform('mean')
        enriched[f'{target_col}_avg_by_{time_feature}'] = per_bucket_mean

    return enriched

def filter_columns_by_std(df, threshold):
    """Keep the numeric columns whose standard deviation exceeds ``threshold``.

    Args:
        df: Input DataFrame; it is not modified.
        threshold: Columns are kept only when their standard deviation is
            strictly greater than this value.

    Returns:
        A new DataFrame with the retained columns plus ``row_sum``, the
        per-row sum across those columns.
    """
    # Only numeric columns have a meaningful standard deviation
    numeric_cols = df.select_dtypes(include='number').columns

    # Keep (not discard) the columns whose std deviation is above the threshold
    filtered_cols = [col for col in numeric_cols if df[col].std() > threshold]

    # Explicit copy: adding a column to a bare column selection triggers
    # pandas' SettingWithCopyWarning.
    new_df = df[filtered_cols].copy()

    # Compute sum across all retained columns for each row
    new_df['row_sum'] = new_df.sum(axis=1)

    return new_df
```

--------------------------------------------------------------------------------------------------

Problem Statement: Parameters:
- df: pandas.DataFrame # The input DataFrame containing the data to be processed
- value_col: str # The target column for various operations, particularly for outlier detection and grouping
- missing_threshold: float # The threshold for dropping columns with excessive missing values
- target_col: str # The target column to be used for balancing and splitting datasets
- interaction_columns: list # The list of columns for which interactions must be calculated
- categories: list of str # The list of categorical columns to be transformed and encoded

Objectives:
- Validate the presence of the `value_col` in the DataFrame and ensure it is numeric. This is crucial for subsequent operations.
- Identify columns in the DataFrame whose proportion of missing values exceeds the `missing_threshold` parameter and drop these columns. This ensures only quality data is used further in the function.
- Detect and replace outliers in the `value_col`, utilizing the IQR method, and calculate the total number of outliers detected. This clean-up step enhances the accuracy of the data.
- Compute interactions between numeric columns specified in the `interaction_columns` list using the 'multiply' method, as these interactions often represent complex relationships in the data.
- Perform target encoding on the categorical variables provided in the `categories` list using the `target_col`, which highlights the relation between categorical data and the target variable, thereby allowing for better predictive performance.
- Aggregate the results of the operations, including the cleaned DataFrame after imputation and outlier handling, interaction columns, and target-encoded columns, into a final DataFrame that effectively represents the processed state of the data.

Return Values:
- cleaned_df: pandas.DataFrame # The final DataFrame after processing which includes cleaned data, interactions, and encoded categorical features
- outlier_count: int # The total number of outliers detected in the value_col during processing

The name of the function you create should be complex_data_processing

--------------------------------------------------------------------------------------------------

Answer Code:
def complex_data_processing(df, value_col, missing_threshold, target_col, interaction_columns, categories):
    """Run the cleaning pipeline: impute, fix outliers, add interactions, encode.

    The work is delegated, in order, to ``drop_and_impute_missing_values``,
    ``outlier_detection_and_correction``, ``compute_interactions`` (with
    ``method='multiply'``) and ``categorical_transform``; each stage receives
    the previous stage's output.

    Args:
        df: Input DataFrame.
        value_col: Numeric column checked up front and passed to the outlier stage.
        missing_threshold: Threshold forwarded to the missing-value stage.
        target_col: Column forwarded to the categorical stage as ``target``.
        interaction_columns: Columns forwarded to ``compute_interactions``.
        categories: Categorical columns forwarded to ``categorical_transform``.

    Returns:
        A tuple ``(cleaned_df, outlier_count)``: the frame produced by the last
        stage and the count reported by the outlier stage.

    Raises:
        ValueError: If ``value_col`` is missing from ``df`` or is not numeric.
    """
    # Guard: the outlier stage needs an existing numeric column.
    value_col_ok = value_col in df.columns and pd.api.types.is_numeric_dtype(df[value_col])
    if not value_col_ok:
        raise ValueError(f"Column '{value_col}' not found or not numeric in DataFrame")

    # Missing values: drop sparse columns and impute what is left.
    imputed = drop_and_impute_missing_values(df, missing_threshold)

    # Outliers in value_col: corrected frame plus how many were found.
    corrected, outlier_count = outlier_detection_and_correction(imputed, value_col)

    # Pairwise products of the requested columns.
    with_interactions = compute_interactions(corrected, interaction_columns, method='multiply')

    # Encode the categorical columns against the target.
    cleaned_df = categorical_transform(with_interactions, categories, target=target_col)

    return cleaned_df, outlier_count

--------------------------------------------------------------------------------------------------

Test Code:
# Reference test for complex_data_processing: the function is called once,
# then the expected result is re-derived step by step with plain pandas and
# compared against what the function returned.

# Import statements if required
import pandas as pd
import numpy as np

# Import function from file
from data_processing import complex_data_processing

# Initialize input parameters
# Columns 'A' and 'C' each hold one missing value out of five rows (20%),
# which is below the 0.4 threshold used here.
data = {
    'A': [1, 2, 3, np.nan, 100],
    'B': [5, 6, 7, 8, 9],
    'C': ['cat', 'dog', 'cat', 'dog', np.nan],
    'target': [10, 20, 10, 20, 30]
}
df = pd.DataFrame(data)
value_col = 'A'
missing_threshold = 0.4
target_col = 'target'
interaction_columns = ['A', 'B']
categories = ['C']

# Call function with input parameters
# NOTE(review): `df` is reused below to rebuild the expected output, so this
# presumably relies on complex_data_processing leaving `df` unmodified - verify.
return_df, return_outlier_count = complex_data_processing(df, value_col, missing_threshold, target_col, interaction_columns, categories)

# Step-by-step run-through of function to obtain intermediate outputs:

# Step 1
# Explanation: Validate the value_col
if value_col not in df.columns or not pd.api.types.is_numeric_dtype(df[value_col]):
    raise ValueError(f"Column '{value_col}' not found or not numeric in DataFrame")

# Step 2
# Explanation: Drop columns with excessive missing values
# isnull().mean() gives the fraction of missing entries per column.
missing_percent = df.isnull().mean()
cols_to_drop = missing_percent[missing_percent > missing_threshold].index
df_cleaned = df.drop(columns=cols_to_drop)

# Impute remaining missing values with median for numerical and mode for categorical
num_cols = df_cleaned.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df_cleaned.select_dtypes(include=['object']).columns

# NOTE(review): fillna(inplace=True) on a column selection relies on pandas
# writing the change through to df_cleaned; newer pandas versions warn about
# this pattern - confirm against the pandas version the tests run on.
for col in num_cols:
    df_cleaned[col].fillna(df_cleaned[col].median(), inplace=True)
        
for col in cat_cols:
    df_cleaned[col].fillna(df_cleaned[col].mode()[0], inplace=True)

# Step 3
# Explanation: Outlier detection and correction in the value_col (Using IQR)
# Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as outliers.
Q1 = df_cleaned[value_col].quantile(0.25)
Q3 = df_cleaned[value_col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Helper column; it is removed again before the final comparison.
df_cleaned['is_outlier'] = (df_cleaned[value_col] < lower_bound) | (df_cleaned[value_col] > upper_bound)

# Replace outliers with the median value
# The median is taken over the imputed column, before any replacement.
median_value = df_cleaned[value_col].median()
df_cleaned.loc[df_cleaned['is_outlier'], value_col] = median_value

# Count the number of outliers
outlier_count = df_cleaned['is_outlier'].sum()

# Step 4
# Explanation: Compute interactions
# One product column per unordered pair, named '<col1>_<col2>_multiply'.
for i in range(len(interaction_columns)):
    for j in range(i + 1, len(interaction_columns)):
        col1, col2 = interaction_columns[i], interaction_columns[j]
        df_cleaned[f'{col1}_{col2}_multiply'] = df_cleaned[col1] * df_cleaned[col2]

# Step 5
# Explanation: Target encoding for categorical columns
# Adds three columns per category: frequency, proportion and target mean.
for cat in categories:
    freq_col = f"{cat}_freq"
    prop_col = f"{cat}_prop"
    target_enc_col = f"{cat}_target_enc"
    
    freq_counts = df_cleaned[cat].value_counts()
    total_counts = freq_counts.sum()
    
    df_cleaned[freq_col] = df_cleaned[cat].map(freq_counts)
    df_cleaned[prop_col] = df_cleaned[freq_col] / total_counts
    
    # Mean of the target within each category value, mapped back per row.
    target_means = df_cleaned.groupby(cat)[target_col].mean()
    df_cleaned[target_enc_col] = df_cleaned[cat].map(target_means)

# Final Expected Output:
# The helper flag column is dropped so only the processed data is compared.
correct_df = df_cleaned.drop(columns=['is_outlier'])
correct_outlier_count = outlier_count

# Assert statements (compulsory) to check if the function returns the correct values:
# DataFrame.equals requires matching values, dtypes, column order and index.
assert return_df.equals(correct_df)
assert return_outlier_count == correct_outlier_count

print('All-Pass')

--------------------------------------------------------------------------------------------------

