import re

import argparse

# Command-line interface: --name selects the log file to process.
# NOTE: the arguments are parsed here, at module level, so parsing happens
# whenever this module is imported, not only when it is run as a script.
parser = argparse.ArgumentParser(description='Process logs')
parser.add_argument('--name', type=str, default="582M_1.log", help='name of the log file')
args = parser.parse_args()

# NOTE(review): NAME is not referenced anywhere in the visible code; the file
# that is actually processed comes from args.name. Confirm whether it is still needed.
NAME = "firstmodel.log"

def replace_nonnumeric_blocks_with_comma(s):
    """Collapse every run of non-digit characters in *s* into a single comma.

    A string that contains the self-training marker is not converted at all:
    the bare marker text is returned instead (without a trailing newline).
    Leading and trailing non-digit runs also become commas, so callers that
    want clean CSV fields have to strip them.
    """
    marker = "BEGINNING SELF-TRAINING"
    if marker in s:
        return marker
    # \D+ matches one or more consecutive non-digit characters.
    return re.sub(r'\D+', ',', s)

def remove_all_duplicate_lines(filename):
    """Condense a training log into a CSV file of step counters.

    Reads *filename* and keeps only the first occurrence of each line,
    dropping every line that contains "Wandb". A "BEGINNING SELF-TRAINING"
    marker line is inserted once, immediately before the first kept line that
    mentions "digit decomp problems". The kept lines are then passed through
    extract_specific_lines, stripped of leading/trailing commas, and written
    below a fixed header row.

    The result is written next to the input file as "processed_<basename>".
    The prefix is applied to the file name only, so inputs given with a
    directory component (e.g. "logs/run.log") produce "logs/processed_run.log"
    instead of pointing into a non-existent "processed_logs" directory.

    Args:
        filename: Path of the log file to read.
    """
    import os  # local import: only needed to build the output path

    seen = set()
    mark_begin_self_training = False
    new_lines = []

    with open(filename, 'r') as file:
        for line in file:
            # Skip exact repeats and Wandb lines.
            if line in seen or "Wandb" in line:
                continue

            # Emit the marker once, ahead of the line that triggers it.
            if not mark_begin_self_training and "digit decomp problems" in line:
                new_lines.append("BEGINNING SELF-TRAINING\n")
                mark_begin_self_training = True
            seen.add(line)
            new_lines.append(line)

    new_lines = extract_specific_lines(new_lines)
    labels = "num_digits,steps,batch_size,total_step\n"

    # Each extracted row may start or end with a comma produced from the
    # surrounding text; strip those so the row holds only the fields.
    rows = [labels] + [line.strip(',') + "\n" for line in new_lines]

    directory, basename = os.path.split(filename)
    output_path = os.path.join(directory, "processed_" + basename)
    with open(output_path, 'w') as file:
        file.writelines(rows)

def extract_specific_lines(lines, keyword="Overall total steps"):
    """Select the lines of interest and convert them to comma-separated form.

    A line is selected when it contains *keyword* or the
    "BEGINNING SELF-TRAINING" marker. Each selected line is passed through
    replace_nonnumeric_blocks_with_comma; input order is preserved.

    Returns:
        A new list holding the converted lines.
    """
    selected = []
    for entry in lines:
        if keyword in entry or "BEGINNING SELF-TRAINING" in entry:
            selected.append(replace_nonnumeric_blocks_with_comma(entry))
    return selected

# Script entry point: process the log file selected with --name.
if __name__ == "__main__":
    remove_all_duplicate_lines(args.name)
