import json

from sympy.generator import FormulaGenerator

# Scratchpad of input texts for the generator. Only the LAST entry is used;
# reorder the tuple (or comment entries out) to switch inputs.
_FORMULA_CANDIDATES = (
    # Complex-number identities.
    r'\forall a,b,c,d\in\mathbb{R}: (a+b\mathrm{i})*(c+d\mathrm{i}) = (ac - bd) + (ad + bc)\mathrm{i}',
    r'\frac{a}{c+d\mathrm{i}}',
    r'\forall a,b,c,d\in\mathbb{R}: \frac{a+b\mathrm{i}}{c+d\mathrm{i}} =  \frac{(ac + bd) + (bc - ad)i}{c^2 + d^2}',
    # r'a*(c+d\mathrm{i})',
    # Free-text passages with embedded $...$ math.
    r'Simplify the following expression: $y = \dfrac{p^2 - 3p - 54}{p - 9} $  First factor the polynomial in the numerator. $ p^2 - 3p - 54 = (p - 9)(p + 6) $. So we can rewrite the expression as: $y = \dfrac{(p - 9)(p + 6)}{p - 9} $. We can divide the numerator and denominator by $(p - 9)$ on condition that $p \neq 9$. Therefore $y = p + 6; p \neq 9$.',
    r"Consider $f(x)=\dfrac{1}{\sigma(x)} = 1+e^{-x}$ .  Then, on the one hand, the chain rule gives $f'(x) = \frac{d}{dx} \biggl( \frac{1}{\sigma(x)} \biggr) = -\frac{\sigma'(x)}{\sigma(x)^2}$, and on the other hand, $f'(x) = \frac{d}{dx} \bigl( 1+e^{-x} \bigr) = -e^{-x} = 1-f(x) = 1 - \frac{1}{\sigma(x)} = \frac{\sigma(x)-1}{\sigma(x)}$. Equate the two expressions, and voilà! ",
    r"In my AI textbook there is this paragraph, without any explanation. The sigmoid function is defined as follows: “$ \sigma (x) = \frac{1}{1+e^{-x}}$. This function is easy to differentiate because $\frac{d\sigma (x)}{d(x)} = \sigma (x)\cdot (1-\sigma(x))$.“ It has been a long time since I've taken differential equations, so could anyone tell me how they got from the first equation to the second?",
    # Short bare equations.
    r'1-1/\tau(x) = \frac{\tau(x) - 1}{\tau(x)}',
    r'p^2-3p-54=(p-9)(p+6)',
    # Pythagorean theorem statements.
    r'In any right-angled triangle, where the legs have lengths $a$ and $b$, and the hypotenuse has length $c$, the equation $a^2 + b^2 = c^2$ holds.',
    r'A right-angled triangle has side lengths $a$, $b$, and $c$, where $c$ is the length of the hypotenuse. Then $a^2 + b^2 = c^2$.',
)

# The active input: the final candidate in the tuple.
formula = _FORMULA_CANDIDATES[-1]

# Rebind `formula` from the raw input text to the project generator built
# from it; everything below uses `formula` as a FormulaGenerator instance.
formula = FormulaGenerator(formula)

from datasets import load_dataset

# Load the existing dataset and keep its 'formula' entries in a set so that
# texts that are already present can be recognised with a fast membership test.
# NOTE(review): presumably split='all' merges every split into one dataset
# and nmf['formula'] is an iterable of strings -- confirm against `datasets`.
nmf = load_dataset('anonymized', split='all')
existing_formulas = nmf['formula']
all_texts = set(existing_formulas)

# Column-oriented accumulator: the two lists grow in lockstep, one entry per
# kept version (the text itself and its stats serialised as a JSON string).
versions = {'formula': [], 'stats': []}
kept_formulas = versions['formula']
kept_stats = versions['stats']

# With return_stats=True the iterator is unpacked as (version, stats) pairs.
# NOTE(review): the exact meaning of the other flags lives in the project's
# FormulaGenerator -- not visible from this file.
version_stream = formula.generate_versions_iterator(
    max=600000,
    only_true_version=True,
    return_stats=True,
    initial_is_candidate=False,
)

for version, stats in version_stream:
    # Skip anything already in the dataset or already emitted in this run.
    if version in all_texts:
        continue

    # Progress output: index of the new entry followed by its text.
    print(len(kept_formulas), version)
    kept_formulas.append(version)

    # Stringify the id before dumping (mutates the yielded stats in place).
    # NOTE(review): presumably original_id is not JSON-serialisable -- confirm.
    stats['original_id'] = str(stats['original_id'])
    kept_stats.append(json.dumps(stats))

    # Remember the text so a later duplicate from the generator is dropped.
    all_texts.add(version)

    # NOTE(review): the check runs after the append and uses `>`, so the loop
    # stops once 30001 entries are collected -- confirm whether 30000 was meant.
    if len(kept_formulas) > 30000:
        break


# Persist the collected versions as CSV: one column per key of `versions`,
# without the DataFrame index column.
import pandas as pd

# Relative path: resolved against the current working directory at run time.
output_csv = '../math-mutator/data/nmf_fix_pyth_thm.csv'
df = pd.DataFrame(data=versions)
df.to_csv(path_or_buf=output_csv, index=False)

