from pathlib import Path
import pandas as pd
from collections import namedtuple
import pickle
import ast

# One logged session: ``task`` is the integer id parsed from the task directory
# name, ``user`` is the name of the user directory, ``anpl`` is the unpickled object.
Anpl = namedtuple("Anpl", ["task", "user", "anpl"])

def _sorted_subdirs(directory):
    """Return the immediate sub-directories of ``directory`` in sorted order."""
    return sorted(entry for entry in directory.iterdir() if entry.is_dir())

def read_anpl_logs(path):
    """Load one pickled ANPL object per task directory found under ``path``.

    The tree is walked three levels deep: ``path/<date>/<user>/<task>``.
    Entries that are not directories are skipped at every level, and each
    level is visited in sorted order so the result is reproducible.

    Args:
        path: ``pathlib.Path`` of the root log directory.

    Returns:
        A list of ``Anpl`` records, one for every task directory that
        contains at least one ``*.pkl`` file.

    Raises:
        ValueError: if a task directory name, after dropping its first four
            characters, does not split on ``_`` into exactly two parts, or if
            the first part is not an integer.
    """
    anpl_logs = []
    for date_dir in _sorted_subdirs(path):
        for user_dir in _sorted_subdirs(date_dir):
            for task_dir in _sorted_subdirs(user_dir):
                anpl_files = sorted(task_dir.glob("*.pkl"))
                if not anpl_files:
                    continue
                # Only the first pickle (in sorted order) is read; any others
                # in the same task directory are ignored.
                # SECURITY: pickle.load can execute arbitrary code. Only run
                # this on log directories from a trusted source.
                with anpl_files[0].open("rb") as f:
                    anpl = pickle.load(f)
                # The first four characters of the directory name are dropped
                # (presumably a "task" prefix -- confirm against the log
                # layout); the rest must look like "<id>_<suffix>".
                task_id, _ = task_dir.name[4:].split("_")
                anpl_logs.append(Anpl(int(task_id), user_dir.name, anpl))
    return anpl_logs

def count_holes_num(anpl_logs):
    """Write per-log function and hole counts to ``hole_num.csv``.

    For every log, ``fun`` is the number of entries in ``anpl.funs`` and
    ``hole`` is the number of those whose ``code_from_user`` is falsy while
    ``prompt_from_user`` is truthy.

    Args:
        anpl_logs: iterable of records exposing ``task`` and ``anpl``.

    Side effects:
        Overwrites ``hole_num.csv`` in the current working directory. The
        header row ``task,hole,fun`` is written even when ``anpl_logs`` is
        empty, so the file can always be read back as a CSV.
    """
    columns = ["task", "hole", "fun"]
    rows = []
    for anpl_log in anpl_logs:
        funs = list(anpl_log.anpl.funs.values())
        hole_num = sum(
            1 for f in funs if not f.code_from_user and f.prompt_from_user
        )
        rows.append({"task": anpl_log.task, "hole": hole_num, "fun": len(funs)})
    # Explicit columns keep the header stable when there are no rows.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv("hole_num.csv", index=False)

import re

def clean_code(code_string):
    """Strip quoted text from ``code_string`` and return its non-blank lines.

    Triple-quoted strings (double- and single-quote flavours) are removed
    first, then any remaining double-quoted strings. Removing the
    triple-quoted strings first matters: otherwise a docstring that contains
    an embedded double-quoted word is cut into pieces by the double-quote
    pattern and part of its text survives as a bogus line.

    Args:
        code_string: source text to clean.

    Returns:
        A list of the lines that still contain non-whitespace characters
        after the quoted text has been removed.
    """
    # DOTALL lets a quoted span cross line breaks.
    code_string = re.sub(r'\"\"\"(.*?)\"\"\"', '', code_string, flags=re.DOTALL)
    code_string = re.sub(r"'''(.*?)'''", '', code_string, flags=re.DOTALL)
    code_string = re.sub(r'\"(.*?)\"', '', code_string, flags=re.DOTALL)
    return [line for line in code_string.split('\n') if line.strip()]

def count_line_number(anpl_logs):
    """Write per-log cleaned line counts to ``line_num.csv`` and print totals.

    For every log the ANPL object is rendered twice with ``to_python``
    (``for_user=True`` and ``for_user=False``); each rendering is passed
    through ``clean_code`` and the number of remaining lines, minus 3, is
    recorded.

    Args:
        anpl_logs: iterable of records exposing ``task`` and ``anpl``.

    Side effects:
        Prints the column sums of ``anpl`` and ``python`` and overwrites
        ``line_num.csv`` in the current working directory. With no logs the
        sums are 0 and the CSV holds only the header row (previously this
        case raised ``KeyError`` because the frame had no columns).
    """
    columns = ["task", "anpl", "python"]
    rows = []
    for anpl_log in anpl_logs:
        anpl = anpl_log.anpl
        anpl_lines = clean_code(anpl.to_python(for_user=True))
        python_lines = clean_code(anpl.to_python(for_user=False))
        # NOTE(review): the constant 3 is presumably fixed boilerplate emitted
        # by to_python in both renderings -- confirm against its output.
        rows.append({
            "task": anpl_log.task,
            "anpl": len(anpl_lines) - 3,
            "python": len(python_lines) - 3,
        })
    # Explicit columns keep the column lookups below valid for empty input.
    df = pd.DataFrame(rows, columns=columns)
    print(df['anpl'].sum())
    print(df['python'].sum())
    df.to_csv("line_num.csv", index=False)

def contains_control_flow(node):
    """
    Report whether any descendant of ``node`` is an if, while, or for statement.

    Only nodes strictly below ``node`` are inspected; ``node`` itself is
    never tested, so a bare ``ast.If`` with no nested control flow yields False.
    """
    descendants = ast.walk(node)
    next(descendants)  # ast.walk yields the root first; skip it
    return any(
        isinstance(item, (ast.If, ast.While, ast.For)) for item in descendants
    )

def control_flow(anpl_logs):
    """Write, per log, how many user-coded functions contain control flow.

    A function in ``anpl.funs`` is counted when its ``code_from_user`` is
    truthy and ``contains_control_flow`` returns True for its ``code``.

    Args:
        anpl_logs: iterable of records exposing ``task`` and ``anpl``.

    Side effects:
        Overwrites ``control_flow.csv`` in the current working directory. The
        header row ``task,control_flow_num`` is written even when
        ``anpl_logs`` is empty, so the file can always be read back as a CSV.
    """
    columns = ["task", "control_flow_num"]
    rows = []
    for anpl_log in anpl_logs:
        num = sum(
            1
            for f in anpl_log.anpl.funs.values()
            # `and` short-circuits: code is only inspected for user-written functions.
            if f.code_from_user and contains_control_flow(f.code)
        )
        rows.append({"task": anpl_log.task, "control_flow_num": num})
    # Explicit columns keep the header stable when there are no rows.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv("control_flow.csv", index=False)


def dump_python(anpl_logs):
    """Write the ``for_user=False`` Python rendering of every log to ``./pythons``.

    Each log is written to ``./pythons/<task>.py``. The directory is created
    if it does not exist yet. Files are written as UTF-8 so the result does
    not depend on the platform's default encoding.

    Args:
        anpl_logs: iterable of records exposing ``task`` and ``anpl``.

    Note:
        The file name depends only on ``task``, so when several logs share a
        task id the last one written wins.
    """
    py_dir = Path("./pythons")
    # Without this, the first open() fails when the directory is missing.
    py_dir.mkdir(parents=True, exist_ok=True)
    for log in anpl_logs:
        file_name = py_dir / f"{log.task}.py"
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(log.anpl.to_python(for_user=False))

if __name__ == "__main__":
    # Load every logged session once, then run each CSV-producing analysis
    # on the same list, in the order below.
    anpl_logs = read_anpl_logs(Path("../anpl_data"))
    for analysis in (count_holes_num, count_line_number, control_flow):
        analysis(anpl_logs)
    # Optional step, disabled by default:
    # dump_python(anpl_logs)
