"""

Feature definitions and preprocessing for the ASSISTments dataset.

For more information on datasets and access in TableShift, see:
* https://tableshift.org/datasets.html
* https://github.com/mlfoundations/tableshift
"""
import numpy as np
from pandas import DataFrame

from tableshift.core.features import Feature, FeatureList, cat_dtype

# Feature schema for the ASSISTments task, expressed as a TableShift
# FeatureList. Each Feature names a column and gives its dtype; `correct` is
# the only entry flagged with is_target=True. The `value_mapping` dicts pair
# raw encoded values with readable labels (how they are applied is decided by
# Feature/FeatureList, which are defined outside this file).
ASSISTMENTS_FEATURES = FeatureList(features=[
    # Four "Average_confidence(<STATE>)" columns, one per state named in the
    # parentheses.
    Feature('Average_confidence(BORED)', float),
    Feature('Average_confidence(CONCENTRATING)', float),
    Feature('Average_confidence(CONFUSED)', float),
    Feature('Average_confidence(FRUSTRATED)', float),
    Feature('attempt_count', int),
    Feature('hint_count', int),
    # Identifier columns declared as float rather than int.
    # NOTE(review): presumably so that missing ids can be NaN -- confirm.
    Feature('school_id', float),
    # Skill code -> skill name. Names are NOT unique: several codes share one
    # label (e.g. 86.0, 580.0, 181.0 and 603.0 are all "Exponents").
    # NOTE(review): some labels carry a trailing space ("Pattern Finding ",
    # "Circumference ", "Interpreting Coordinate Graphs ") or an apparent
    # misspelling ("Writine", "Polyomial", "Prportionally", "Clasification").
    # They are runtime strings, so check them against the upstream data before
    # normalizing anything.
    Feature('skill_id', float,
            value_mapping={
                54.0: "Rounding",
                279.0: "Multiplication and Division Integers",
                79.0: "Proportion",
                86.0: "Exponents",
                311.0: "Equation Solving Two or Fewer Steps",
                11.0: "Venn Diagram",
                280.0: "Addition and Subtraction Fractions",
                338.0: "Combining Like Terms",
                81.0: "Unit Rate",
                325.0: "Write Linear Equation from Graph",
                322.0: "Write Linear Equation from Ordered Pairs",
                13.0: "Median",
                312.0: "Equation Solving More Than Two Steps",
                75.0: "Square Root",
                336.0: "Finding y-intercept from Linear Equation",
                315.0: "Parallel and Perpendicular Lines",
                340.0: "Distributive Property",
                70.0: "Percent Of",
                67.0: "Multiplication Fractions",
                277.0: "Addition and Subtraction Integers",
                24.0: "Congruence",
                363.0: "Multiplying non Monomial Polynomials",
                309.0:      "Order of Operations +,-,/,* () positive reals",
                324.0: "Recognize Linear Pattern",
                333.0: "Finding Slope From Equation",
                25.0: "Complementary and Supplementary Angles",
                378.0: "Solving Systems of Linear Equations by Graphing",
                49.0: "Ordering Positive Decimals",
                110.0: "D.4.8-understanding-concept-of-probabilities",
                42.0: "Perimeter of a Polygon",
                368.0: "Solving for a variable",
                12.0: "Mean",
                103.0: "Point Plotting",
                34.0: "Unit Conversion Within a System",
                47.0: "Conversion of Fraction Decimals Percents",
                61.0: "Division Fractions",
                296.0: "Area Rectangle",
                82.0: "Scientific Notation",
                15.0: "Range",
                51.0: "Ordering Integers",
                27.0: "Pythagorean Theorem",
                307.0: "Volume Rectangular Prism",
                65.0: "Least Common Multiple",
                323.0: "Write Linear Equation from Situation",
                326.0: "Write Linear Equation from Slope and y-intercept",
                46.0: "Calculations with Similar Figures",
                278.0: "Addition and Subtraction Positive Decimals",
                299.0: "Surface Area Cylinder",
                8.0: "Scatter Plot",
                77.0: "Finding Percents",
                163.0: "Absolute Value",
                523.0: "Picking Expressions From Choices",
                310.0: "Order of Operations All",
                80.0: "Scale Factor",
                297.0: "Area Trapezoid",
                39.0: "Area Circle",
                301.0: "Surface Area Rectangular Prism",
                302.0: "Surface Area Sphere",
                53.0: "Ordering Real Numbers",
                166.0: "Algebraic Solving",
                21.0: "Interior Angles Figures with More than 3 Sides",
                92.0: "Pattern Finding ",
                95.0: "Substitution",
                17.0: "Probability of Two Distinct Events",
                288.0: "Properties and Classification Rectangular Prisms",
                334.0: "Finding Slope from Ordered Pairs",
                282.0: "Properties and Classification Polygons with 5 or more sides",
                281.0: "Properties and Classification Triangles",
                371.0: "Simplifying Expressions positive exponents",
                84.0: "Prime Number",
                580.0: "Exponents",
                48.0: "Equivalent Fractions",
                314.0: "Angles - Obtuse, Acute, and Right",
                1.0: "Box and Whisker",
                362.0: "Parts of a Polyomial, Terms, Coefficient, Monomial, Exponent, Variable",
                350.0: "Solving Systems of Linear Equations",
                4.0: "Histogram as Table or Graph",
                306.0: "Volume Pyramid",
                298.0: "Area Triangle",
                83.0: "Divisibility Rules",
                294.0: "Area Irregular Figure",
                588.0: "Equivalent Fractions",
                577.0: "Nets of 3D Objects",
                331.0: "Finding Slope From Situation",
                569.0: "Surface Area Rectangular Prism",
                22.0: "Interior Angles Triangle",
                36.0: "Unit Conversion Standard to Metric",
                346.0: "Polynomial Factors",
                317.0: "Greatest Common Factor",
                332.0: "Finding Slope from Graph",
                90.0: "Picking Equation and Inequality from Choices",
                303.0: "Volume Cylinder",
                173.0: "Choose an Equation from Given Information",
                308.0: "Volume Sphere",
                217.0: "Rate",
                356.0: "Quadratic Formula to Solve Quadratic Equation",
                40.0: "Circumference ",
                283.0: "Properties and Classification Quadrilaterals",
                343.0: "Midpoint",
                295.0: "Area Parallelogram",
                392.0: "Surface Area of Prism",
                50.0: "Ordering Fractions",
                41.0: "Definition Pi",
                5.0: "Number Line",
                287.0: "Properties and Classification Prism",
                18.0: "Probability of a Single Event",
                276.0: "Multiplication and Division Positive Decimals",
                63.0: "Estimation",
                292.0: "Rotations",
                26.0: "Angles on Parallel Lines Cut by a Transversal",
                572.0: "Circumference",
                575.0: "Concept Volume",
                58.0: "Addition Whole Numbers",
                85.0: "Absolute Value",
                186.0: "Histogram as Table or Graph",
                358.0: "Making a Table from an Equation",
                284.0: "Properties and Classification Circle",
                584.0: "Multiplication Proper Fractions",
                574.0: "Subtraction Mixed Fractions",
                576.0: "Area Rectangle",
                366.0: "Composition of Function Adding",
                305.0: "Volume Cone",
                354.0: "Factoring Polynomials Standard",
                304.0: "Volume Prism",
                319.0: "Prime Factor",
                591.0: "Subtraction Proper Fractions",
                582.0: "Multiplication Positive Decimals",
                587.0: "Least Common Multiple",
                16.0: "Counting Methods",
                32.0: "Nets of 3D Figures",
                88.0: "Inverse Relations",
                365.0: "Interpreting Coordinate Graphs ",
                106.0: "Graph Shape",
                33.0: "English and Metric Terminology",
                375.0: "Solving Inequalities",
                1641.0: "Finding Ratios",
                62.0: "Division Whole Numbers",
                7.0: "Sampling Techniques",
                290.0: "Reflection",
                203.0: "Percent Discount",
                339.0: "Bar Graph",
                240.0: "X-Y Graph Reading",
                69.0: "Multiplication Whole Numbers",
                222.0: "Solving Inequalities",
                316.0: "Expanded, Standard and Word Notation",
                74.0: "Subtraction Whole Numbers",
                321.0: "Computation with Real Numbers",
                78.0: "Percent Increase or Decrease",
                391.0: "Graphing Linear Equations",
                230.0: "Surface Area of 3D Objects",
                327.0: "Comparing and Identifying Slope/Rate of Change",
                293.0: "Translations",
                10.0: "Table",
                300.0: "Surface Area Pyramid",
                9.0: "Stem and Leaf Plot",
                613.0: "Addition Mixed Fractions",
                360.0: "Writine Expression from Diagrams",
                387.0: "Perimeter of an Irregular Figure",
                14.0: "Mode",
                64.0: "Fraction Of",
                202.0: "Pattern Finding",
                318.0: "Multiplication Division by Powers of 10",
                344.0: "Distance Formula",
                231.0: "Symbolization",
                223.0: "Solving System of Equation",
                226.0: "Substitution",
                204.0: "Percents",
                6.0: "Line Plot",
                214.0: "Quadratic Equation Solving",
                585.0: "Division Proper Fractions",
                578.0: "Area Parallelogram",
                181.0: "Exponents",
                581.0: "Circle Concept",
                570.0: "Area Trapezoid",
                233.0: "Transformation",
                571.0: "Area Circle",
                583.0: "Multiplication Mixed Fractions",
                579.0: "Area Triangle",
                589.0: "Addition Proper Fractions",
                393.0: "Associative Property",
                198.0: "Median",
                370.0: "Recognizing Equivalent Expressions",
                376.0: "Graphing Inequalities on a number line",
                388.0: "Common Multiple",
                595.0: "Area Circle",
                599.0: "Concept Volume",
                52.0: "Ordering Whole Numbers",
                601.0: "Area Parallelogram",
                596.0: "Circumference",
                359.0: "Commutative Property",
                172.0: "Calculation with + - * /",
                2.0: "Circle Graph",
                224.0: "Square Roots",
                586.0: "Greatest Common Factor",
                35.0: "Effect of Changing Dimensions of a Shape Prportionally",
                568.0: "Volume Rectangular Prism",
                573.0: "Division Mixed Fractions",
                192.0: "Line of Best-Fit",
                238.0: "Volume of 3D Objects",
                345.0: "Properties and Clasification of Pyramid",
                605.0: "Multiplication Positive Decimals",
                313.0: "Tree Diagrams, Lists for Counting",
                597.0: "Division Mixed Fractions",
                216.0: "Range",
                386.0: "Odd and Even Number",
                213.0: "Pythagorean Theorem",
                206.0: "Factoring Trinomials",
                289.0: "Line Symmetry",
                165.0: "Algebraic Simplification",
                610.0: "Least Common Multiple",
                390.0: "Graph Shape",
                389.0: "Common Factor",
                212.0: "Proportion",
                593.0: "Surface Area Rectangular Prism",
                609.0: "Greatest Common Factor",
                606.0: "Multiplication Mixed Fractions",
                598.0: "Subtraction Mixed Fractions",
                385.0: "Elapsed Time",
                590.0: "Addition Mixed Fractions",
                594.0: "Area Trapezoid",
                184.0: "Geometric Definitions",
                218.0: "Finding Ratios",
                197.0: "Mean-Median-Mode-Range Differentiation",
                600.0: "Area Rectangle",
                196.0: "Mean",
                361.0: "Recognizing Expressions or Equations from Diagrams",
                603.0: "Exponents",
                614.0: "Subtraction Proper Fractions",
                608.0: "Division Proper Fractions",
                602.0: "Area Triangle",
                220.0: "Similar Figures",
                355.0: "Solve Quadratic Equations Using Factoring",
                592.0: "Volume Rectangular Prism",
                43.0: "Reading a Ruler or Scale",
                320.0: "Equal As Balance Concept",
                209.0: "Properties of Numbers",
                607.0: "Multiplication Proper Fractions",
                174.0: "Circle Graph",
                604.0: "Circle Concept",
                348.0: "Recognize Quadratic Pattern",
                612.0: "Addition Proper Fractions",
                178.0: "Combinatorics",
                611.0: "Equivalent Fractions",
                291.0: "Rotational Symmetry",
                177.0: "Co-ordinate Points",
            }),
    Feature('problem_type', cat_dtype),
    # The keys mix a float (0.) and an int (1). Python hashes and compares 1
    # and 1.0 as equal, so a float 1.0 still finds the int key in a dict
    # lookup.
    Feature('bottom_hint', float,
            value_mapping={0.: 'False', 1: 'True'}),
    Feature('ms_first_response', int),
    Feature('tutor_mode', cat_dtype),
    Feature('position', int),
    Feature('type', cat_dtype),
    Feature('overlap_time', int),
    # Integer code -> label for the first action.
    Feature('first_action', int,
            value_mapping={
                0: 'Answer',
                1: 'Hint',
                2: 'Scaffold'
            }),
    # There are other text features in this dataset that would be useful for
    # text-only models; i.e. answer_text
    # Prediction target (is_target=True).
    Feature('correct', int, is_target=True)
],
    # Two source URLs. The adjacent string literals are concatenated into one
    # string, with " ," between the first URL and the second.
    documentation="https://www.kaggle.com/datasets/nicolaswattiez/skillbuilder-data-2009-2010 ,"
                  "https://sites.google.com/site/assistmentsdata/datasets/2012-13-school-data-with-affect")


def preprocess_assistments(df: DataFrame) -> DataFrame:
    """Drop every row whose ``correct`` value is not exactly 0 or 1.

    Args:
        df: Frame containing a ``correct`` column.

    Returns:
        The rows of ``df`` whose ``correct`` entry equals 0 or 1, keeping
        their original index labels and all columns.
    """
    binary_labels = (0, 1)
    # Row-wise boolean mask: True where the label is one of the binary values.
    keep_row = np.isin(df.correct.values, binary_labels)
    return df.loc[keep_row]


# School identifiers, stored as floats. The literal appears to be laid out as
# two ascending runs with a single np.nan entry between them
# (1.0 ... 12419.0, np.nan, 4732.0 ... 12428.0).
# NOTE(review): presumably these are values of the `school_id` feature, and the
# two runs may be distinct groups of schools -- the visible code never uses
# this list, so confirm the intended meaning against its callers.
# NOTE: NaN never compares equal to NaN, so a plain `==` comparison against a
# NaN coming from data will generally not match the np.nan entry here.
SCHOOL_IDS = [1.0, 73.0, 76.0, 139.0, 397.0, 1357.0, 1404.0, 1411.0, 1645.0,
              1689.0, 1862.0, 1998.0, 2268.0, 2770.0, 4720.0, 4724.0, 5006.0,
              5018.0, 5040.0, 5092.0, 5126.0, 5159.0, 5459.0, 5479.0, 5555.0,
              5843.0, 5872.0, 5887.0, 5897.0, 5922.0, 6090.0, 6144.0, 6257.0,
              6987.0, 7288.0, 7301.0, 7335.0, 7495.0, 7572.0, 7591.0, 7603.0,
              7703.0, 7740.0, 7760.0, 7796.0, 7804.0, 7806.0, 7836.0, 7839.0,
              7884.0, 7905.0, 8483.0, 8653.0, 8784.0, 8889.0, 8905.0, 8936.0,
              9180.0, 9208.0, 9214.0, 9270.0, 9362.0, 9409.0, 9481.0, 9495.0,
              9537.0, 9587.0, 9627.0, 9713.0, 9793.0, 9941.0, 9948.0, 10134.0,
              10152.0, 10725.0, 11154.0, 11195.0, 11230.0, 11234.0, 11247.0,
              11249.0, 11252.0, 11274.0, 11281.0, 11296.0, 11313.0, 11334.0,
              11338.0, 11357.0, 11378.0, 11390.0, 11404.0, 11446.0, 11475.0,
              11482.0, 11484.0, 11513.0, 11539.0, 11553.0, 11562.0, 11576.0,
              11594.0, 11720.0, 11772.0, 11791.0, 11823.0, 11857.0, 11876.0,
              11889.0, 11894.0, 11904.0, 11906.0, 11915.0, 11918.0, 11924.0,
              11930.0, 11931.0, 11943.0, 11944.0, 11950.0, 11954.0, 11955.0,
              11967.0, 11975.0, 11976.0, 11977.0, 11986.0, 11987.0, 11989.0,
              11996.0, 12011.0, 12012.0, 12021.0, 12026.0, 12038.0, 12056.0,
              12068.0, 12069.0, 12076.0, 12084.0, 12085.0, 12089.0, 12091.0,
              12097.0, 12116.0, 12138.0, 12141.0, 12148.0, 12154.0, 12164.0,
              12175.0, 12197.0, 12200.0, 12205.0, 12208.0, 12215.0, 12217.0,
              12221.0, 12223.0, 12225.0, 12238.0, 12240.0, 12246.0, 12248.0,
              12252.0, 12256.0, 12268.0, 12272.0, 12273.0, 12279.0, 12334.0,
              12364.0, 12367.0, 12388.0, 12406.0, 12408.0, 12412.0, 12419.0,
              np.nan, 4732.0, 4780.0, 4812.0, 4829.0, 4838.0, 4986.0, 5046.0,
              5048.0, 5049.0, 5056.0, 5062.0, 5068.0, 5098.0, 5104.0, 5106.0,
              5109.0, 5116.0, 5117.0, 5125.0, 5177.0, 5197.0, 5255.0, 5260.0,
              5307.0, 5308.0, 5309.0, 5366.0, 5399.0, 5405.0, 5406.0, 5441.0,
              5444.0, 5445.0, 5446.0, 5449.0, 5450.0, 5451.0, 5497.0, 5536.0,
              5545.0, 5550.0, 5558.0, 5559.0, 5561.0, 5689.0, 5692.0, 5721.0,
              5750.0, 5751.0, 5754.0, 5757.0, 5758.0, 5759.0, 5775.0, 5790.0,
              5797.0, 5809.0, 5874.0, 5905.0, 5906.0, 5909.0, 5910.0, 5913.0,
              5917.0, 5978.0, 5994.0, 6004.0, 6012.0, 6023.0, 6042.0, 6063.0,
              6089.0, 6123.0, 6168.0, 6177.0, 6197.0, 6205.0, 6232.0, 6246.0,
              6260.0, 6351.0, 6380.0, 6390.0, 6443.0, 6504.0, 6546.0, 6550.0,
              6553.0, 6842.0, 6905.0, 6963.0, 6977.0, 6979.0, 6992.0, 6995.0,
              7007.0, 7148.0, 7184.0, 7200.0, 7206.0, 7212.0, 7213.0, 7227.0,
              7265.0, 7304.0, 7313.0, 7340.0, 7349.0, 7359.0, 7367.0, 7389.0,
              7445.0, 7450.0, 7466.0, 7526.0, 7561.0, 7594.0, 7596.0, 7604.0,
              7630.0, 7654.0, 7674.0, 7777.0, 7778.0, 7782.0, 7787.0, 7801.0,
              7840.0, 7856.0, 7857.0, 7900.0, 7929.0, 7930.0, 7992.0, 8003.0,
              8040.0, 8041.0, 8051.0, 8086.0, 8096.0, 8207.0, 8253.0, 8264.0,
              8265.0, 8272.0, 8332.0, 8359.0, 8411.0, 8423.0, 8430.0, 8471.0,
              8478.0, 8534.0, 8554.0, 8574.0, 8646.0, 8687.0, 8744.0, 8747.0,
              8771.0, 8772.0, 8781.0, 8806.0, 8828.0, 8900.0, 9100.0, 9112.0,
              9155.0, 9161.0, 9178.0, 9186.0, 9193.0, 9203.0, 9221.0, 9241.0,
              9252.0, 9253.0, 9313.0, 9314.0, 9343.0, 9354.0, 9376.0, 9377.0,
              9394.0, 9446.0, 9450.0, 9456.0, 9486.0, 9518.0, 9532.0, 9534.0,
              9548.0, 9596.0, 9603.0, 9605.0, 9609.0, 9620.0, 9625.0, 9638.0,
              9660.0, 9667.0, 9668.0, 9684.0, 9703.0, 9727.0, 9735.0, 9775.0,
              9840.0, 9880.0, 9884.0, 9903.0, 10120.0, 10180.0, 10241.0,
              10284.0, 10323.0, 10399.0, 10410.0, 10499.0, 10735.0, 10736.0,
              10766.0, 11073.0, 11079.0, 11142.0, 11169.0, 11171.0, 11178.0,
              11180.0, 11220.0, 11228.0, 11229.0, 11231.0, 11233.0, 11236.0,
              11248.0, 11254.0, 11255.0, 11261.0, 11262.0, 11263.0, 11264.0,
              11266.0, 11267.0, 11277.0, 11280.0, 11283.0, 11294.0, 11297.0,
              11302.0, 11312.0, 11315.0, 11317.0, 11318.0, 11320.0, 11321.0,
              11322.0, 11330.0, 11335.0, 11350.0, 11362.0, 11364.0, 11368.0,
              11371.0, 11372.0, 11384.0, 11385.0, 11393.0, 11394.0, 11397.0,
              11398.0, 11403.0, 11409.0, 11416.0, 11418.0, 11420.0, 11422.0,
              11443.0, 11449.0, 11451.0, 11456.0, 11463.0, 11469.0, 11479.0,
              11481.0, 11486.0, 11502.0, 11505.0, 11509.0, 11514.0, 11518.0,
              11519.0, 11529.0, 11531.0, 11540.0, 11542.0, 11546.0, 11555.0,
              11564.0, 11572.0, 11575.0, 11580.0, 11589.0, 11590.0, 11600.0,
              11604.0, 11609.0, 11633.0, 11679.0, 11696.0, 11742.0, 11746.0,
              11749.0, 11757.0, 11765.0, 11770.0, 11773.0, 11779.0, 11782.0,
              11788.0, 11803.0, 11808.0, 11811.0, 11813.0, 11864.0, 11868.0,
              11875.0, 11878.0, 11883.0, 11884.0, 11885.0, 11887.0, 11888.0,
              11890.0, 11891.0, 11895.0, 11896.0, 11901.0, 11903.0, 11905.0,
              11909.0, 11910.0, 11912.0, 11913.0, 11916.0, 11917.0, 11920.0,
              11925.0, 11929.0, 11933.0, 11935.0, 11936.0, 11938.0, 11939.0,
              11941.0, 11942.0, 11945.0, 11946.0, 11947.0, 11948.0, 11949.0,
              11951.0, 11956.0, 11959.0, 11960.0, 11961.0, 11962.0, 11963.0,
              11964.0, 11966.0, 11971.0, 11973.0, 11983.0, 11991.0, 11992.0,
              11993.0, 11995.0, 11998.0, 12003.0, 12005.0, 12007.0, 12008.0,
              12010.0, 12013.0, 12014.0, 12016.0, 12017.0, 12018.0, 12022.0,
              12024.0, 12029.0, 12033.0, 12041.0, 12044.0, 12055.0, 12057.0,
              12058.0, 12059.0, 12063.0, 12064.0, 12070.0, 12071.0, 12075.0,
              12077.0, 12079.0, 12083.0, 12086.0, 12087.0, 12096.0, 12098.0,
              12099.0, 12103.0, 12104.0, 12108.0, 12112.0, 12114.0, 12118.0,
              12121.0, 12122.0, 12126.0, 12131.0, 12134.0, 12135.0, 12140.0,
              12142.0, 12144.0, 12149.0, 12153.0, 12155.0, 12157.0, 12158.0,
              12159.0, 12161.0, 12162.0, 12163.0, 12169.0, 12174.0, 12176.0,
              12177.0, 12181.0, 12182.0, 12184.0, 12185.0, 12198.0, 12199.0,
              12201.0, 12202.0, 12203.0, 12204.0, 12207.0, 12209.0, 12211.0,
              12218.0, 12219.0, 12230.0, 12232.0, 12244.0, 12245.0, 12253.0,
              12270.0, 12293.0, 12294.0, 12302.0, 12303.0, 12311.0, 12320.0,
              12326.0, 12344.0, 12346.0, 12350.0, 12353.0, 12356.0, 12363.0,
              12373.0, 12379.0, 12383.0, 12384.0, 12391.0, 12403.0, 12416.0,
              12420.0, 12421.0, 12428.0]
