"""
The benchmark of Grinsztajn et al. (2022).

See https://arxiv.org/abs/2207.08815
"""
from typing import Dict, Callable

import pandas as pd

from tableshift.core.features import Feature, FeatureList

# Electricity dataset: the target encodes whether the New South Wales price
# went UP or DOWN relative to a moving average of the last 24 hours.
# Per the feature descriptions, the numeric columns are normalized to [0, 1].
ELECTRICITY_FEATURES = FeatureList(features=[
    Feature('price_increase', int, is_target=True,
            description='Preprocessed version of original class label ('
                        'UP/DOWN to integer values). The '
                        'class label identifies the change of the price ( '
                        'UP or DOWN) in New South Wales relative to a moving '
                        'average of the last 24 hours (and removes the impact '
                        'of longer term price trends).',
            name_extended='Increase in the price of electricity relative to a moving '
                          'average of the last 24 hours'),
    Feature('date', float,
            'date between 7 May 1996 to 5 December 1998. Here normalized between 0 and 1',
            name_extended='Relative date between 7 May 1996 and 5 December 1998'),
    Feature('day', int, name_extended='day of the week',
            value_mapping={
                0: 'Monday',
                1: 'Tuesday',
                2: 'Wednesday',
                3: 'Thursday',
                4: 'Friday',
                5: 'Saturday',
                6: 'Sunday'}),
    Feature('period', float,
            'Time of the measurement (1-48) in half hour intervals over 24 hours. Here normalized between 0 and 1',
            name_extended='Time of the measurement relative to 24 hours (normalized between 0 and 1)'),
    # Price columns: the extended name must say "price" (not "demand") so it
    # agrees with the description of the same column.
    Feature('nswprice', float,
            'New South Wales electricity price, normalized between 0 and 1',
            name_extended='New South Wales electricity price relative to min and max price'),
    Feature('nswdemand', float,
            'New South Wales electricity demand, normalized between 0 and 1',
            name_extended='New South Wales electricity demand relative to min and max demand'),
    Feature('vicprice', float,
            'Victoria electricity price, normalized between 0 and 1',
            name_extended='Victoria electricity price relative to min and max price'),
    Feature('vicdemand', float,
            'Victoria electricity demand, normalized between 0 and 1',
            name_extended='Victoria electricity demand relative to min and max demand'),
    Feature('transfer', float,
            name_extended='scheduled electricity transfer between both states, normalized between 0 and 1')

],
    documentation='https://www.openml.org/d/44156')

# Bank marketing dataset. Columns use the anonymized names V1, V6, ...; each
# description starts with the column's original name (e.g. "balance: ...").
BANK_MARKETING_FEATURES = FeatureList(features=[
    Feature('default', int, "default: has credit in default? (binary: yes,no)",
            is_target=True,
            name_extended="Had credit in default"),
    Feature('V1', int, "age", name_extended="age"),
    Feature('V6', float, "balance: average yearly balance, in euros (numeric)",
            name_extended="Average yearly balance in Euros"),
    Feature('V10', int, 'day: last contact day of the month (numeric)',
            name_extended='number of last contact day of the month'),
    Feature('V12', float,
            'duration: last contact duration, in seconds (numeric)',
            name_extended='duration of last contact in seconds'),
    # "campaign" counts contacts (not days), as stated in its description.
    Feature('V13', int,
            'campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)',
            name_extended='number of contacts performed during campaign for this client'),
    Feature('V14', float,
            'pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)',
            name_extended='number of days since client was last contacted'),
    Feature('V15', int,
            'previous: number of contacts performed before this campaign and for this client (numeric)',
            name_extended='number of contacts prior to this campaign'),
],
    documentation="https://www.openml.org/d/44126")

# California housing dataset: binary target for "price above median".
CALIFORNIA_FEATURES = FeatureList(
    features=[
        Feature('price_above_median', int, 'price above median',
                name_extended='home price in this census block group is above median',
                is_target=True),
        Feature('Longitude', float),
        Feature('Latitude', float),
        # The remaining columns all share the same upstream description link,
        # so they are built from a (column, dtype, extended name) table.
        *(
            Feature(column, dtype,
                    'see https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html',
                    name_extended=label)
            for column, dtype, label in (
                ('AveOccup', float,
                 'average occupancy of houses in census block group'),
                ('Population', float,
                 'population of census block group'),
                ('AveBedrms', float,
                 'average number of bedrooms of homes in census block group'),
                ('HouseAge', int,
                 'age of house'),
                ('MedInc', float,
                 'median income in census block group'),
            )
        ),
    ],
    documentation="https://www.openml.org/d/44090",
)

# Forest cover type dataset (binary version): the target indicates whether
# the cover type is Lodgepole Pine.
COVERTYPE_FEATURES = FeatureList(
    features=[
        Feature('class', int, is_target=True,
                name_extended="forest cover type designation is Lodgepole Pine"),
        # Terrain measurements.
        Feature('Elevation', float,
                description='Elevation in meters',
                name_extended='normalized elevation in meters'),
        Feature('Aspect', float,
                'Aspect in degrees azimuth',
                name_extended='normalized aspect in degrees azimuth'),
        Feature('Slope', int,
                'Slope in degrees',
                name_extended='normalized slope in degrees'),
        Feature('Horizontal_Distance_To_Hydrology', float,
                'Horz Dist to nearest surface water features',
                name_extended='normalized horizontal distance to nearest surface water features'),
        Feature('Vertical_Distance_To_Hydrology', float,
                'Vert Dist to nearest surface water features',
                name_extended='normalized vertical distance to nearest surface water features'),
        Feature('Horizontal_Distance_To_Roadways', float,
                'Horz Dist to nearest roadway',
                name_extended='normalized distance to nearest roadway'),
        Feature('Hillshade_9am', int,
                'Hillshade index at 9am on summer solstice',
                name_extended='normalized Hillshade index at 9am, summer solstice'),
        Feature('Hillshade_Noon', int,
                'Hillshade index at noon, summer solstice',
                name_extended='normalized Hillshade index at noon on summer solstice'),
        Feature('Hillshade_3pm', int,
                'Hillshade index at 3pm, summer solstice',
                name_extended='normalized Hillshade index at 3pm on summer solstice'),
        Feature('Horizontal_Distance_To_Fire_Points', float,
                'Horz Dist to nearest wildfire ignition points',
                name_extended='normalized horizontal distance to nearest wildfire ignition points'),
        # Indicator columns Wilderness_Area1 .. Wilderness_Area4.
        *(
            Feature(f'Wilderness_Area{area}', int,
                    name_extended=f'indicator for wilderness area type {area} designation')
            for area in range(1, 5)
        ),
        # Indicator columns Soil_Type1 .. Soil_Type40.
        *(
            Feature(f'Soil_Type{soil}', int,
                    name_extended=f'indicator for soil type {soil} designation')
            for soil in range(1, 41)
        ),
    ],
    documentation="""https://www.openml.org/d/44159, https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#covtype.binary.
                The original binary split is due to Collobert, Bengio and Bengio,
                'Parallel Mixture of SVMs for Very Large Scale Problems' and
                the class IDs are defined in the UCI dataset file covtype.info
                at https://archive.ics.uci.edu/ml/datasets/covertype .""")

# Credit dataset: the target is whether the person experiences serious
# delinquency within the next two years.
CREDIT_FEATURES = FeatureList(features=[
    Feature('SeriousDlqin2yrs', int, is_target=True,
            name_extended='person will experience serious delinquency within the next 2 years'),
    Feature('RevolvingUtilizationOfUnsecuredLines', float,
            name_extended='revolving utilization of unsecured credit lines'),
    Feature('age', int),
    Feature('NumberOfTime30-59DaysPastDueNotWorse', int,
            name_extended='number of times between 30 and 59 days past due'),
    Feature('NumberOfTime60-89DaysPastDueNotWorse', int,
            name_extended='number of times between 60 and 89 days past due'),
    Feature('NumberOfTimes90DaysLate', int,
            name_extended='number of times 90 days late'),
    Feature('DebtRatio', float, name_extended='debt ratio'),
    Feature('MonthlyIncome', float, name_extended='monthly income'),
    Feature('NumberOfOpenCreditLinesAndLoans', int,
            name_extended='number of open credit lines and loans'),
    Feature('NumberRealEstateLoansOrLines', int,
            name_extended='number of real estate loans or lines'),
    Feature('NumberOfDependents', int, name_extended='number of dependents'),
])

# Human-readable labels for the repayment-status codes (float keys -2.0..9.0).
# Codes 1.0..8.0 follow a single "payment delay for N months" pattern and are
# generated; the remaining codes have bespoke labels.
PAYMENT_STATUS_MAPPING = {
    -2.0: 'unknown or paid early',
    -1.0: 'paid duly',
    0.0: 'payment delay for less than 1 month',
    **{
        float(months): f'payment delay for {months} months'
        for months in range(1, 9)
    },
    9.0: 'payment delay for 9 months and above',
}

# Default-of-credit-card-clients dataset. Columns x6..x23 form three families
# of six monthly columns (September 2005 back to April 2005): repayment
# status (x6-x11), bill statement amount (x12-x17) and amount paid (x18-x23).
DEFAULT_OF_CREDIT_CLIENTS_FEATURES = FeatureList(
    features=[
        Feature('x1', float,
                'Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.',
                name_extended='total credit amount in NT dollars'),
        Feature('x2', int, 'gender',
                name_extended='gender',
                value_mapping={0: 'male', 1: 'female'}),
        Feature('x5', int, 'age', name_extended='age in years'),
        # Outer loop walks the three column families in order; inner loop
        # walks the months, so columns come out as x6, x7, ..., x23.
        *(
            Feature('x{}'.format(first_column + offset), float,
                    name_extended=template.format(month=month),
                    **extra_kwargs)
            for first_column, template, extra_kwargs in (
                (6, 'repayment status in {month} 2005',
                 {'value_mapping': PAYMENT_STATUS_MAPPING}),
                (12, 'amount of bill statement in {month} 2005 in NT dollars',
                 {}),
                (18, 'amount paid in {month} 2005 in NT dollars',
                 {}),
            )
            for offset, month in enumerate(
                ('September', 'August', 'July', 'June', 'May', 'April'))
        ),
        Feature('y', int, is_target=True,
                name_extended='client defaults on payment'),
    ],
    documentation="https://www.openml.org/search?type=data&status=active&id=45016",
)

# Eye movements dataset: the target marks whether a sentence is irrelevant
# to the question. Runs of float columns that carry only an extended name
# are built from (column, extended name) tables; column order is preserved.
EYE_MOVEMENTS_FEATURES = FeatureList(
    features=[
        Feature('lineNo', float, name_extended='line number'),
        Feature('assgNo', float, name_extended='assignment number'),
        Feature(
            'P1stFixation', int,
            " '1' if fixation occured when the sentence the word was in was encountered the first time",
            name_extended="indicator for whether fixation occurred when the sentence containing the word was encountered the first time",
        ),
        Feature(
            'P2stFixation', int,
            " '1' if fixation occured when the sentence the word was in was encountered the second time",
            name_extended="indicator for whether fixation occurred when the sentence containing the word was encountered the second time",
        ),
        *(
            Feature(column, float, name_extended=label)
            for column, label in (
                ('prevFixDur',
                 'duration of previous fixation in milliseconds'),
                ('firstfixDur',
                 'duration (in milliseconds) of the first fixation when the word is first encountered'),
                ('firstPassFixDur',
                 'Sum of durations (in milliseconds) of fixations when the word is first encountered'),
                ('nextFixDur',
                 'Duration (in milliseconds) of the next fixation when gaze initially moves from the word'),
                ('firstSaccLen',
                 'Length of the first saccade'),
                ('lastSaccLen',
                 'Distance (in pixels) between fixation on the word and the next fixation'),
                ('prevFixPos',
                 'Distance (in pixels) between the first fixation preceding the word and the beginning of the word'),
                ('landingPos',
                 'Distance (in pixels) between the first fixation on the word and the beginning of the word'),
                ('leavingPos',
                 'Distance (in pixels) between the last fixation on the word and the beginning of the word'),
                ('totalFixDur',
                 'Sum of all durations (in milliseconds) of fixations to the word'),
                ('meanFixDur',
                 'Mean duration (in milliseconds) of the fixations to the word'),
                ('regressLen',
                 'Sum of durations (in milliseconds) of regressions initiating from this word'),
            )
        ),
        Feature(
            'nextWordRegress', int,
            "'1' if a regression initiated from the following word",
            name_extended="indicator for whether a regression was initiated from the following word",
        ),
        *(
            Feature(column, float, name_extended=label)
            for column, label in (
                ('regressDur',
                 'Sum of durations (in milliseconds) of the fixations on the word during regression'),
                ('pupilDiamMax',
                 'Maximum pupil diameter'),
                ('pupilDiamLag',
                 'Maximum pupil diameter 0.5 - 1.5 seconds after the beginning of fixation'),
            )
        ),
        # This column has a description but no extended name.
        Feature('timePrtctg', float,
                'First fixation duration divided by the total number of fixations'),
        Feature('titleNo', int, name_extended='Title number'),
        Feature('wordNo', int,
                name_extended='Word number (ordinal) in this title'),
        Feature(
            'label', int,
            is_target=True,
            name_extended="sentence is irrelevant to the question",
            value_mapping={1: 'irrelevant', 0: 'relevant'},
        ),
    ],
    documentation="https://www.openml.org/d/44157, "
                  "http://research.ics.aalto.fi/events/eyechallenge2005/irem-2005-03-03.pdf",
)

# Higgs dataset: binary signal target, followed by kinematic properties
# (lepton, missing energy, four jets) and derived mass features.
HIGGS_FEATURES = FeatureList(
    features=[
        Feature('target', int, is_target=True, name_extended='is signal'),
        # Lepton and missing-energy columns: the extended name is the column
        # name with underscores replaced by spaces.
        *(
            Feature(column, float,
                    name_extended='kinematic property ' + column.replace('_', ' '))
            for column in (
                'lepton_pT',
                'lepton_eta',
                'lepton_phi',
                'missing_energy_magnitude',
                'missing_energy_phi',
            )
        ),
        # Jet columns use a lowercase "pt" in the column name but "pT" in
        # the extended name, hence the explicit .lower().
        *(
            Feature(f'jet_{jet}_{component.lower()}', float,
                    name_extended=f'kinematic property jet {jet} {component}')
            for jet in range(1, 5)
            for component in ('pT', 'eta', 'phi')
        ),
        *(
            Feature(column, float, name_extended=f'derived feature {column}')
            for column in (
                'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
            )
        ),
    ],
    documentation='https://www.openml.org/d/44129',
)

# MAGIC telescope dataset: ten float image parameters (note the trailing
# colon that is part of each column name) plus the gamma/hadron target.
MAGIC_TELESCOPE_FEATURES = FeatureList(
    features=[
        *(
            Feature(column, float, name_extended=label)
            for column, label in (
                ('fLength:',
                 'length of major axis of ellipse in mm'),
                ('fWidth:',
                 'length of minor axis of ellipse in mm'),
                ('fSize:',
                 'log base 10 of sum of content of all pixel values in image (fSize)'),
                ('fConc:',
                 'ratio of sum of two highest pixel values to fSize'),
                ('fConc1:',
                 'ratio of largest pixel value to fSize'),
                ('fAsym:',
                 'distance from highest pixel to center, projected onto major axis in mm'),
                ('fM3Long:',
                 '3rd root of third moment along major axis in mm'),
                ('fM3Trans:',
                 '3rd root of third moment along minor axis in mm'),
                ('fAlpha:',
                 'angle of major axis with vector to origin in degrees'),
                ('fDist:',
                 'distance from origin to center of ellipse in mm'),
            )
        ),
        Feature('is_signal', int,
                name_extended='input is gamma (signal) and not hadron (background)',
                is_target=True),
    ],
    documentation='https://www.openml.org/d/44125',
)

# MiniBooNE dataset: fifty float columns ParticleID_0 .. ParticleID_49,
# followed by the signal/background target.
MINI_BOONE_FEATURES = FeatureList(
    features=[
        *(
            Feature(f'ParticleID_{index}', float,
                    name_extended=f'particle ID {index} value')
            for index in range(50)
        ),
        Feature('signal', int,
                name_extended='is electron neutrino (signal) vs. muon neutrino (background)',
                is_target=True),
    ],
    documentation='https://www.openml.org/d/44128',
)

# Note: the descriptions for these features are not good, and no value
# mapping is available.
ROAD_SAFETY_FEATURES = FeatureList(features=[
    Feature('Vehicle_Reference_df_res', int),
    Feature('Vehicle_Type', int, name_extended='vehicle type',
            value_mapping={
                1: 'Pedal cycle', 2: 'Motorcycle 50cc and under',
                3: 'Motorcycle 125cc and under',
                4: 'Motorcycle over 125cc and up to 500cc',
                5: 'Motorcycle over 500cc', 8: 'Taxi/Private hire car',
                9: 'Car', 10: 'Minibus (8 - 16 passenger seats)',
                11: 'Bus or coach (17 or more pass seats)', 16: 'Ridden horse',
                17: 'Agricultural vehicle', 18: 'Tram',
                19: 'Van / Goods 3.5 tonnes mgw or under',
                20: 'Goods over 3.5t. and under 7.5t',
                21: 'Goods 7.5 tonnes mgw and over', 22: 'Mobility scooter',
                23: 'Electric motorcycle', 90: 'Other vehicle',
                97: 'Motorcycle - unknown cc',
                98: 'Goods vehicle - unknown weight',
                99: 'Unknown vehicle type (self rep only)',
                103: 'Motorcycle - Scooter (1979-1998)',
                104: 'Motorcycle (1979-1998)',
                105: 'Motorcycle - Combination (1979-1998)',
                106: 'Motorcycle over 125cc (1999-2004)',
                108: 'Taxi (excluding private hire cars) (1979-2004)',
                109: 'Car (including private hire cars) (1979-2004)',
                110: 'Minibus/Motor caravan (1979-1998)',
                113: 'Goods over 3.5 tonnes (1979-1998)',
                -1: 'Data missing or out of range'}),
    Feature('Vehicle_Manoeuvre', int,
            name_extended='vehicle maneuver type',
            value_mapping={
                -1: 'Data missing or out of range', 1: 'Reversing', 2: 'Parked',
                3: 'Waiting to go - held up', 4: 'Slowing or stopping',
                5: 'Moving off', 6: 'U-turn', 7: 'Turning left',
                8: 'Waiting to turn left', 9: 'Turning right',
                10: 'Waiting to turn right', 11: 'Changing lane to left',
                12: 'Changing lane to right',
                13: 'Overtaking moving vehicle - offside',
                14: 'Overtaking static vehicle - offside',
                15: 'Overtaking - nearside', 16: 'Going ahead left-hand bend',
                17: 'Going ahead right-hand bend', 18: 'Going ahead other',
                99: 'unknown (self reported)'}),
    Feature('Vehicle_Location-Restricted_Lane', int,
            name_extended='vehicle location lane restriction type',
            value_mapping={
                0: "On main c'way - not in restricted lane",
                1: "Tram/Light rail track", 2: "Bus lane",
                3: "Busway (including guided busway)",
                4: "Cycle lane (on main carriageway)",
                5: "Cycleway or shared use footway (not part of  main carriageway)",
                6: "On lay-by or hard shoulder",
                7: "Entering lay-by or hard shoulder",
                8: "Leaving lay-by or hard shoulder", 9: "Footway (pavement)",
                10: "Not on carriageway", 99: "unknown (self reported)",
                -1: "Data missing or out of range",
            }),
    Feature('Hit_Object_in_Carriageway', int,
            name_extended='hit object in carriageway',
            value_mapping={
                0: "None", 1: "Previous accident", 2: "Road works",
                4: "Parked vehicle", 5: "Bridge (roof)", 6: "Bridge (side)",
                7: "Bollard or refuge", 8: "Open door of vehicle",
                9: "Central island of roundabout", 10: "Kerb",
                11: "Other object", 12: "Any animal (except ridden horse)",
                99: "unknown (self reported)",
                -1: "Data missing or out of range"
            }),
    Feature('Hit_Object_off_Carriageway', int,
            name_extended='hit object off carriageway'),
    Feature('Was_Vehicle_Left_Hand_Drive?', int,
            name_extended='indicator for whether vehicle was left-hand drive',
            value_mapping={
                1: 'No',
                2: 'Yes',
                9: 'Unknown',
                -1: 'Data missing or out of range'}),
    Feature('Age_of_Driver', int,
            name_extended='age of driver (in years)'),
    Feature('Age_Band_of_Driver', int,
            name_extended='age band of driver',
            value_mapping={
                1: '0 - 5', 2: '6 - 10', 3: '11 - 15', 4: '16 - 20',
                5: '21 - 25', 6: '26 - 35', 7: '36 - 45', 8: '46 - 55',
                9: '56 - 65', 10: '66 - 75', 11: 'Over 75'
            }),
    Feature('Engine_Capacity_(CC)', float,
            name_extended='engine capacity in cc'),
    Feature('Propulsion_Code', int, name_extended='propulsion code',
            value_mapping={
                1: "Petrol", 2: "Heavy oil", 3: "Electric", 4: "Steam",
                5: "Gas", 6: "Petrol/Gas (LPG)", 7: "Gas/Bi-fuel",
                8: "Hybrid electric", 9: "Gas Diesel",
                10: "New fuel technology", 11: "Fuel cells",
                12: "Electric diesel", -1: "Undefined"
            }),
    Feature('Age_of_Vehicle', int, name_extended='age of vehicle in years'),
    Feature('Location_Easting_OSGR', float,
            name_extended='Ordnance Survey Grid References (OSGR) Easting coordinate'),
    Feature('Location_Northing_OSGR', float,
            name_extended='Ordnance Survey Grid References (OSGR) Northing coordinate'),
    Feature('Longitude', float),
    Feature('Latitude', float),
    Feature('Police_Force', int, name_extended='police force code',
            value_mapping={
                1: 'Metropolitan Police', 3: 'Cumbria', 4: 'Lancashire',
                5: 'Merseyside', 6: 'Greater Manchester', 7: 'Cheshire',
                10: 'Northumbria', 11: 'Durham', 12: 'North Yorkshire',
                13: 'West Yorkshire', 14: 'South Yorkshire', 16: 'Humberside',
                17: 'Cleveland', 20: 'West Midlands', 21: 'Staffordshire',
                22: 'West Mercia', 23: 'Warwickshire', 30: 'Derbyshire',
                31: 'Nottinghamshire', 32: 'Lincolnshire', 33: 'Leicestershire',
                34: 'Northamptonshire', 35: 'Cambridgeshire', 36: 'Norfolk',
                37: 'Suffolk', 40: 'Bedfordshire', 41: 'Hertfordshire',
                42: 'Essex', 43: 'Thames Valley', 44: 'Hampshire', 45: 'Surrey',
                46: 'Kent', 47: 'Sussex', 48: 'City of London',
                50: 'Devon and Cornwall', 52: 'Avon and Somerset',
                53: 'Gloucestershire', 54: 'Wiltshire', 55: 'Dorset',
                60: 'North Wales', 61: 'Gwent', 62: 'South Wales',
                63: 'Dyfed-Powys', 91: 'Northern', 92: 'Grampian',
                93: 'Tayside', 94: 'Fife', 95: 'Lothian and Borders',
                96: 'Central', 97: 'Strathclyde', 98: 'Dumfries and Galloway',
                99: 'Police Scotland',
            }),
    Feature('Number_of_Vehicles', int,
            name_extended='number of vehicles involved'),
    Feature('Number_of_Casualties', int, name_extended='number of casualties'),
    Feature('Local_Authority_(District)', float,
            name_extended='local authority (district)',
            value_mapping={
                1: "Westminster", 2: "Camden", 3: "Islington", 4: "Hackney",
                5: "Tower Hamlets", 6: "Greenwich", 7: "Lewisham",
                8: "Southwark", 9: "Lambeth", 10: "Wandsworth",
                11: "Hammersmith and Fulham", 12: "Kensington and Chelsea",
                13: "Waltham Forest", 14: "Redbridge", 15: "Havering",
                16: "Barking and Dagenham", 17: "Newham", 18: "Bexley",
                19: "Bromley", 20: "Croydon", 21: "Sutton", 22: "Merton",
                23: "Kingston upon Thames", 24: "Richmond upon Thames",
                25: "Hounslow", 26: "Hillingdon", 27: "Ealing", 28: "Brent",
                29: "Harrow", 30: "Barnet", 31: "Haringey", 32: "Enfield",
                33: "Hertsmere", 38: "Epsom and Ewell", 40: "Spelthorne",
                57: "London Airport (Heathrow)", 60: "Allerdale",
                61: "Barrow-in-Furness", 62: "Carlisle", 63: "Copeland",
                64: "Eden", 65: "South Lakeland", 70: "Blackburn with Darwen",
                71: "Blackpool", 72: "Burnley", 73: "Chorley", 74: "Fylde",
                75: "Hyndburn", 76: "Lancaster", 77: "Pendle", 79: "Preston",
                80: "Ribble Valley", 82: "Rossendale", 83: "South Ribble",
                84: "West Lancashire", 85: "Wyre", 90: "Knowsley",
                91: "Liverpool", 92: "St. Helens", 93: "Sefton", 95: "Wirral",
                100: "Bolton", 101: "Bury", 102: "Manchester", 104: "Oldham",
                106: "Rochdale", 107: "Salford", 109: "Stockport",
                110: "Tameside", 112: "Trafford", 114: "Wigan", 120: "Chester",
                121: "Congleton", 122: "Crewe and Nantwich",
                123: "Ellesmere Port and Neston", 124: "Halton",
                126: "Macclesfield", 127: "Vale Royal", 128: "Warrington",
                129: "Cheshire East", 130: "Cheshire West and Chester",
                139: "Northumberland", 140: "Alnwick",
                141: "Berwick-upon-Tweed", 142: "Blyth Valley",
                143: "Castle Morpeth", 144: "Tynedale", 145: "Wansbeck",
                146: "Gateshead", 147: "Newcastle upon Tyne",
                148: "North Tyneside", 149: "South Tyneside", 150: "Sunderland",
                160: "Chester-le-Street", 161: "Darlington", 162: "Derwentside",
                163: "Durham", 164: "Easington", 165: "Sedgefield",
                166: "Teesdale", 168: "Wear Valley", 169: "County Durham",
                180: "Craven", 181: "Hambleton", 182: "Harrogate",
                184: "Richmondshire", 185: "Ryedale", 186: "Scarborough",
                187: "Selby", 189: "York", 200: "Bradford", 202: "Calderdale",
                203: "Kirklees", 204: "Leeds", 206: "Wakefield",
                210: "Barnsley", 211: "Doncaster", 213: "Rotherham",
                215: "Sheffield", 228: "Kingston upon Hull, City of",
                231: "East Riding of Yorkshire", 232: "North Lincolnshire",
                233: "North East Lincolnshire", 240: "Hartlepool",
                241: "Redcar and Cleveland", 243: "Middlesbrough",
                245: "Stockton-on-Tees", 250: "Cannock Chase",
                251: "East Staffordshire", 252: "Lichfield",
                253: "Newcastle-under-Lyme", 254: "South Staffordshire",
                255: "Stafford", 256: "Staffordshire Moorlands",
                257: "Stoke-on-Trent", 258: "Tamworth", 270: "Bromsgrove",
                273: "Malvern Hills", 274: "Redditch", 276: "Worcester",
                277: "Wychavon", 278: "Wyre Forest", 279: "Bridgnorth",
                280: "North Shropshire", 281: "Oswestry",
                282: "Shrewsbury and Atcham", 283: "South Shropshire",
                284: "Telford and Wrekin", 285: "Herefordshire, County of",
                286: "Shropshire", 290: "North Warwickshire",
                291: "Nuneaton and Bedworth", 292: "Rugby",
                293: "Stratford-upon-Avon", 294: "Warwick", 300: "Birmingham",
                302: "Coventry", 303: "Dudley", 305: "Sandwell",
                306: "Solihull", 307: "Walsall", 309: "Wolverhampton",
                320: "Amber Valley", 321: "Bolsover", 322: "Chesterfield",
                323: "Derby", 324: "Erewash", 325: "High Peak",
                327: "North East Derbyshire", 328: "South Derbyshire",
                329: "Derbyshire Dales", 340: "Ashfield", 341: "Bassetlaw",
                342: "Broxtowe", 343: "Gedling", 344: "Mansfield",
                345: "Newark and Sherwood", 346: "Nottingham",
                347: "Rushcliffe", 350: "Boston", 351: "East Lindsey",
                352: "Lincoln", 353: "North Kesteven", 354: "South Holland",
                355: "South Kesteven", 356: "West Lindsey", 360: "Blaby",
                361: "Hinckley and Bosworth", 362: "Charnwood",
                363: "Harborough", 364: "Leicester", 365: "Melton",
                366: "North West Leicestershire", 367: "Oadby and Wigston",
                368: "Rutland", 380: "Corby", 381: "Daventry",
                382: "East Northamptonshire", 383: "Kettering",
                384: "Northampton", 385: "South Northamptonshire",
                386: "Wellingborough", 390: "Cambridge",
                391: "East Cambridgeshire", 392: "Fenland",
                393: "Huntingdonshire", 394: "Peterborough",
                395: "South Cambridgeshire", 400: "Breckland", 401: "Broadland",
                402: "Great Yarmouth", 404: "Norwich", 405: "North Norfolk",
                406: "South Norfolk", 407: "King's Lynn and West Norfolk",
                410: "Babergh", 411: "Forest Heath", 412: "Ipswich",
                413: "Mid Suffolk", 414: "St. Edmundsbury",
                415: "Suffolk Coastal", 416: "Waveney", 420: "Bedford",
                421: "Luton", 422: "Mid Bedfordshire",
                423: "South Bedfordshire", 424: "Central Bedfordshire",
                430: "Broxbourne", 431: "Dacorum", 432: "East Hertfordshire",
                433: "North Hertfordshire", 434: "St. Albans", 435: "Stevenage",
                436: "Three Rivers", 437: "Watford", 438: "Welwyn Hatfield",
                450: "Basildon", 451: "Braintree", 452: "Brentwood",
                453: "Castle Point", 454: "Chelmsford", 455: "Colchester",
                456: "Epping Forest", 457: "Harlow", 458: "Maldon",
                459: "Rochford", 460: "Southend-on-Sea", 461: "Tendring",
                462: "Thurrock", 463: "Uttlesford", 470: "Bracknell Forest",
                471: "West Berkshire", 472: "Reading", 473: "Slough",
                474: "Windsor and Maidenhead", 475: "Wokingham",
                476: "Aylesbury Vale", 477: "South Bucks", 478: "Chiltern",
                479: "Milton Keynes", 480: "Wycombe", 481: "Cherwell",
                482: "Oxford", 483: "Vale of White Horse",
                484: "South Oxfordshire", 485: "West Oxfordshire",
                490: "Basingstoke and Deane", 491: "Eastleigh", 492: "Fareham",
                493: "Gosport", 494: "Hart", 495: "Havant", 496: "New Forest",
                497: "East Hampshire", 498: "Portsmouth", 499: "Rushmoor",
                500: "Southampton", 501: "Test Valley", 502: "Winchester",
                505: "Isle of Wight", 510: "Elmbridge", 511: "Guildford",
                512: "Mole Valley", 513: "Reigate and Banstead",
                514: "Runnymede", 515: "Surrey Heath", 516: "Tandridge",
                517: "Waverley", 518: "Woking", 530: "Ashford",
                531: "Canterbury", 532: "Dartford", 533: "Dover",
                535: "Gravesham", 536: "Maidstone", 538: "Sevenoaks",
                539: "Shepway", 540: "Swale", 541: "Thanet",
                542: "Tonbridge and Malling", 543: "Tunbridge Wells",
                544: "Medway", 551: "Eastbourne", 552: "Hastings", 554: "Lewes",
                555: "Rother", 556: "Wealden", 557: "Adur", 558: "Arun",
                559: "Chichester", 560: "Crawley", 562: "Horsham",
                563: "Mid Sussex", 564: "Worthing", 565: "Brighton and Hove",
                570: "City of London", 580: "East Devon", 581: "Exeter",
                582: "North Devon", 583: "Plymouth", 584: "South Hams",
                585: "Teignbridge", 586: "Mid Devon", 587: "Torbay",
                588: "Torridge", 589: "West Devon", 590: "Caradon",
                591: "Carrick", 592: "Kerrier", 593: "North Cornwall",
                594: "Penwith", 595: "Restormel", 596: "Cornwall",
                601: "Bristol, City of", 605: "North Somerset", 606: "Mendip",
                607: "Sedgemoor", 608: "Taunton Deane", 609: "West Somerset",
                610: "South Somerset", 611: "Bath and North East Somerset",
                612: "South Gloucestershire", 620: "Cheltenham",
                621: "Cotswold", 622: "Forest of Dean", 623: "Gloucester",
                624: "Stroud", 625: "Tewkesbury", 630: "Kennet",
                631: "North Wiltshire", 632: "Salisbury", 633: "Swindon",
                634: "West Wiltshire", 635: "Wiltshire", 640: "Bournemouth",
                641: "Christchurch", 642: "North Dorset", 643: "Poole",
                644: "Purbeck", 645: "West Dorset",
                646: "Weymouth and Portland", 647: "East Dorset",
                720: "Isle of Anglesey", 721: "Conwy", 722: "Gwynedd",
                723: "Denbighshire", 724: "Flintshire", 725: "Wrexham",
                730: "Blaenau Gwent", 731: "Caerphilly", 732: "Monmouthshire",
                733: "Newport", 734: "Torfaen", 740: "Bridgend", 741: "Cardiff",
                742: "Merthyr Tydfil", 743: "Neath Port Talbot",
                744: "Rhondda, Cynon, Taff", 745: "Swansea",
                746: "The Vale of Glamorgan", 750: "Ceredigion",
                751: "Carmarthenshire", 752: "Pembrokeshire", 753: "Powys",
                910: "Aberdeen City", 911: "Aberdeenshire", 912: "Angus",
                913: "Argyll and Bute", 914: "Scottish Borders",
                915: "Clackmannanshire", 916: "West Dunbartonshire",
                917: "Dumfries and Galloway", 918: "Dundee City",
                919: "East Ayrshire", 920: "East Dunbartonshire",
                921: "East Lothian", 922: "East Renfrewshire",
                923: "Edinburgh, City of", 924: "Falkirk", 925: "Fife",
                926: "Glasgow City", 927: "Highland", 928: "Inverclyde",
                929: "Midlothian", 930: "Moray", 931: "North Ayrshire",
                932: "North Lanarkshire", 933: "Orkney Islands",
                934: "Perth and Kinross", 935: "Renfrewshire",
                936: "Shetland Islands", 937: "South Ayrshire",
                938: "South Lanarkshire", 939: "Stirling", 940: "West Lothian",
                941: "Western Isles"}),
    Feature('1st_Road_Number', float, name_extended='first road number',
            value_mapping={**{x: str(x) for x in range(10000)},
                           0: 'no official number', -1: 'unknown'}),
    Feature('2nd_Road_Number', float, name_extended='second roat number',
            value_mapping={**{x: str(x) for x in range(10000)},
                           0: 'no official number', -1: 'unknown'}),
    Feature('Urban_or_Rural_Area', int,
            name_extended='urban or rural area indicator',
            value_mapping={
                1: 'Urban',
                2: 'Rural',
                3: 'Unallocated',
                -1: 'Data missing or out of range',
            }),
    Feature('Vehicle_Reference_df', int, name_extended='vehicle reference DF'),
    Feature('Casualty_Reference', int, name_extended='casualty reference'),
    Feature('Sex_of_Casualty', int, name_extended='sex of casualty',
            value_mapping={
                1: 'Male',
                2: 'Female',
                9: 'unknown (self reported)',
                -1: 'Data missing or out of range'}),
    Feature('Age_of_Casualty', int, name_extended='age of casualty in years'),
    Feature('Age_Band_of_Casualty', int, name_extended='age band of casualty',
            value_mapping={
                1: '0 - 5', 2: '6 - 10', 3: '11 - 15', 4: '16 - 20',
                5: '21 - 25', 6: '26 - 35', 7: '36 - 45', 8: '46 - 55',
                9: '56 - 65', 10: '66 - 75', 11: 'Over 75'}),
    Feature('Pedestrian_Location', int, name_extended='pedestrian location',
            value_mapping={
                0: "Not a Pedestrian",
                1: "Crossing on pedestrian crossing facility",
                2: "Crossing in zig-zag approach lines",
                3: "Crossing in zig-zag exit lines",
                4: "Crossing elsewhere within 50m. of pedestrian crossing",
                5: "In carriageway, crossing elsewhere",
                6: "On footway or verge",
                7: "On refuge, central island or central reservation",
                8: "In centre of carriageway - not on refuge, island or central reservation",
                9: "In carriageway, not crossing", 10: "Unknown or other",
                -1: "Data missing or out of range"
            }),
    Feature('Pedestrian_Movement', int, name_extended='pedestrian movement',
            value_mapping={
                0: "Not a Pedestrian", 1: "Crossing from driver's nearside",
                2: "Crossing from nearside - masked by parked or stationary vehicle",
                3: "Crossing from driver's offside",
                4: "Crossing from offside - masked by  parked or stationary vehicle",
                5: "In carriageway, stationary - not crossing  (standing or playing)",
                6: "In carriageway, stationary - not crossing  (standing or playing) - masked by parked or stationary vehicle",
                7: "Walking along in carriageway, facing traffic",
                8: "Walking along in carriageway, back to traffic",
                9: "Unknown or other", -1: "Data missing or out of range"
            }),
    Feature('Casualty_Type', int, name_extended='casualty type',
            value_mapping={
                0: "Pedestrian", 1: "Cyclist",
                2: "Motorcycle 50cc and under rider or passenger",
                3: "Motorcycle 125cc and under rider or passenger",
                4: "Motorcycle over 125cc and up to 500cc rider or  passenger",
                5: "Motorcycle over 500cc rider or passenger",
                8: "Taxi/Private hire car occupant", 9: "Car occupant",
                10: "Minibus (8 - 16 passenger seats) occupant",
                11: "Bus or coach occupant (17 or more pass seats)",
                16: "Horse rider", 17: "Agricultural vehicle occupant",
                18: "Tram occupant",
                19: "Van / Goods vehicle (3.5 tonnes mgw or under) occupant",
                20: "Goods vehicle (over 3.5t. and under 7.5t.) occupant",
                21: "Goods vehicle (7.5 tonnes mgw and over) occupant",
                22: "Mobility scooter rider",
                23: "Electric motorcycle rider or passenger",
                90: "Other vehicle occupant",
                97: "Motorcycle - unknown cc rider or passenger",
                98: "Goods vehicle (unknown weight) occupant",
                99: "Unknown vehicle type (self rep only)",
                103: "Motorcycle - Scooter (1979-1998)",
                104: "Motorcycle (1979-1998)",
                105: "Motorcycle - Combination (1979-1998)",
                106: "Motorcycle over 125cc (1999-2004)",
                108: "Taxi (excluding private hire cars) (1979-2004)",
                109: "Car (including private hire cars) (1979-2004)",
                110: "Minibus/Motor caravan (1979-1998)",
                113: "Goods over 3.5 tonnes (1979-1998)", }),
    Feature('Casualty_IMD_Decile', int, name_extended='casualty IMD decile',
            value_mapping={
                1: "Most deprived 10%", 2: "More deprived 10-20%",
                3: "More deprived 20-30%", 4: "More deprived 30-40%",
                5: "More deprived 40-50%", 6: "Less deprived 40-50%",
                7: "Less deprived 30-40%", 8: "Less deprived 20-30%",
                9: "Less deprived 10-20%", 10: "Least deprived 10%",
                -1: "Data missing or out of range", }),
    Feature('SexofDriver', int, is_target=True,
            name_extended='driver is male'),
], documentation='https://www.openml.org/d/45038 ,'
                 'https://www.data.gov.uk/dataset/cb7ae6f0-4be6-4935-9277-47e5ce24a11f/road-safety-data')

# Feature schema for the POL dataset (see the OpenML link in `documentation`).
# Besides the binary target, the predictors are the integer columns f5-f9 and
# f13-f33; f10-f12 are not part of this list.
POL_FEATURES = FeatureList(
    features=[
        Feature('binaryClass', int, is_target=True,
                name_extended="target value is below mean"),
        *(Feature(f'f{idx}', int)
          for idx in (*range(5, 10), *range(13, 34))),
    ],
    documentation='https://www.openml.org/d/44122')

# Feature schema for the Jannis dataset: float predictors V1..V54 followed by
# the integer target column 'class'.
JANNIS_FEATURES = FeatureList(
    features=[
        *(Feature(f'V{idx}', float) for idx in range(1, 55)),
        Feature('class', int, name_extended="class label", is_target=True),
    ],
    # Adjacent string literals are concatenated verbatim, so the " , "
    # separator is required to keep the two reference URLs distinct.
    documentation="https://www.openml.org/d/45021 , "
                  "https://link.springer.com/chapter/10.1007/978-3-030-05318-5_10")

# Feature schema for the house_16H dataset: sixteen float predictors followed
# by the integer target column 'binaryClass'.
HOUSE_16H_FEATURES = FeatureList(
    [
        *(Feature(column, float) for column in (
            'P1', 'P5p1', 'P6p2', 'P11p4', 'P14p9', 'P15p1', 'P15p3',
            'P16p2', 'P18p2', 'P27p4', 'H2p2', 'H8p2', 'H10p1', 'H13p1',
            'H18pA', 'H40p4',
        )),
        Feature('binaryClass', int, is_target=True,
                name_extended='class label'),
    ],
    documentation="https://www.openml.org/d/44123 , "
                  "https://www.openml.org/search?type=data&status=active&id=574&sort=runs")


def preprocess_binaryclass_np_label(df: pd.DataFrame) -> pd.DataFrame:
    """Recode the 'binaryClass' column as integers: 1 where it equals 'P', else 0.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame object is returned.
    """
    is_positive = df['binaryClass'].eq('P')
    df['binaryClass'] = is_positive.astype(int)
    return df


def preprocess_electricity(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the string 'class' column with an integer 'price_increase' target.

    'price_increase' is 1 where 'class' equals 'UP' and 0 otherwise. The
    'class' column is removed; the input DataFrame is modified in place and
    returned.
    """
    # pop() removes 'class' from the frame and hands back its values.
    went_up = df.pop('class').eq('UP')
    df['price_increase'] = went_up.astype(int)
    return df


def preprocess_bank(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the 'Class' column with a 'default' column equal to 'Class' - 1.

    The input DataFrame is modified in place and returned.
    """
    # The raw target is coded as (1, 2) although it is really a binary
    # indicator for default; shifting by one gives (0, 1). See
    # https://www.openml.org/search?type=data&sort=runs&id=44126&status=active
    shifted = df['Class'].sub(1)
    df['default'] = shifted
    del df['Class']
    return df


def preprocess_magic_telescope(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the string 'class' column with an integer 'is_signal' target.

    'is_signal' is 1 where 'class' equals 'g' and 0 otherwise. The 'class'
    column is removed; the input DataFrame is modified in place and returned.
    """
    # pop() removes 'class' from the frame and hands back its values.
    is_g = df.pop('class').eq('g')
    df['is_signal'] = is_g.astype(int)
    return df


def preprocess_miniboone(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the 'signal' column to int, in place, and return the same DataFrame."""
    signal_as_int = df['signal'].astype(int)
    df['signal'] = signal_as_int
    return df


def preprocess_covertype(df: pd.DataFrame) -> pd.DataFrame:
    """Subtract one from the 'class' column, in place, and return the same DataFrame."""
    # Class values arrive coded as 1/2; shift them to 0/1.
    zero_based = df['class'].sub(1)
    df['class'] = zero_based
    return df


# Registry of per-dataset preprocessing callables, keyed by dataset name.
# Each callable takes a DataFrame and returns a DataFrame; datasets that need
# no changes are mapped to an identity lambda.
_PREPROCESS_FNS: Dict[str, Callable[[pd.DataFrame], pd.DataFrame]] = {
    "electricity": preprocess_electricity,
    "bank-marketing": preprocess_bank,
    "california": lambda df: df,
    "covertype": preprocess_covertype,
    "credit": lambda df: df,
    "default-of-credit-card-clients": lambda df: df,
    "eye_movements": lambda df: df,
    "Higgs": lambda df: df,
    "house_16H": preprocess_binaryclass_np_label,
    "jannis": lambda df: df,
    "MagicTelescope": preprocess_magic_telescope,
    "MiniBooNE": preprocess_miniboone,
    "pol": preprocess_binaryclass_np_label,
    "road-safety": lambda df: df,
}


def preprocess_grinsztain_datataset(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Apply the preprocessor registered for dataset `name` to `df`.

    Args:
        df: the raw DataFrame for the dataset.
        name: dataset name; must be a key of `_PREPROCESS_FNS`.

    Returns:
        Whatever the registered preprocessor returns for `df`.

    Raises:
        KeyError: if no preprocessor is registered under `name`.
    """
    try:
        preprocess_fn = _PREPROCESS_FNS[name]
    except KeyError:
        # Keep the exception type callers may already catch, but say which
        # names are valid instead of surfacing only the missing key.
        raise KeyError(
            f"No preprocessor registered for dataset {name!r}; "
            f"expected one of {sorted(_PREPROCESS_FNS)}.") from None
    return preprocess_fn(df)
