# ======================== PARAMS (edit here) ========================
# Both paths are passed unchanged to open() by run(); fill them in before running.
INPUT_LOG_PATH = r""          # path of the log file to read
OUTPUT_PATH    = r""          # path of the text file to write (opened in 'w' mode)

PREFIX_EPOCH   = True         # add "[epoch=X]" after the "[index]" tag when the epoch is known
WITH_COUNT     = True         # append a per-genotype frequency summary at the end of the output
FIRST_PER_EPOCH_ONLY = True   # if True: keep only the first genotype per epoch
# ===================================================================

import re
from collections import OrderedDict, Counter
from typing import List, Tuple, Iterable, Optional

# Case-insensitive regexes used to locate genotype text and epoch numbers.
# Hinted form: "genotype for current epoch:" followed by "Genotype(..." anywhere in a line.
GENO_HINT_PAT = re.compile(
    r"genotype for current epoch:\s*(Genotype\(.+)",
    flags=re.I,
)
# Bare form: the line itself begins (after optional whitespace) with "Genotype(...".
GENO_RAW_PAT = re.compile(
    r"^\s*(Genotype\(.+)",
    flags=re.I,
)
# Epoch marker: "epoch:" followed by an integer, anywhere in a line.
EPOCH_PAT = re.compile(
    r"epoch:\s*(\d+)",
    flags=re.I,
)


def _iter_genotype_blocks(lines: Iterable[str]) -> Iterable[Tuple[Optional[int], str]]:
    """Yield ``(epoch, genotype_text)`` for every ``Genotype(...)`` found in *lines*.

    A genotype starts on a line matched by ``GENO_HINT_PAT`` (tried first) or
    ``GENO_RAW_PAT``. When the parentheses of the captured text are not balanced
    on that line, the following non-blank lines are stripped and joined with
    single spaces until the running count of ``(`` minus ``)`` is zero or less.

    ``epoch`` is the most recent integer matched by ``EPOCH_PAT`` on any line
    read so far (the current line and continuation lines included), or ``None``
    when no such line has been seen. A genotype still open at end of input is
    yielded as-is.
    """
    current_epoch: Optional[int] = None
    pending: List[str] = []   # fragments of a genotype whose parens are still open
    depth = 0                 # '(' minus ')' over the fragments collected so far
    in_block = False

    for raw in lines:
        # Every line, including continuation lines, may update the epoch.
        epoch_match = EPOCH_PAT.search(raw)
        if epoch_match:
            try:
                current_epoch = int(epoch_match.group(1))
            except ValueError:
                pass

        if in_block:
            fragment = raw.strip()
            if not fragment:
                continue  # blank lines inside a genotype are dropped
            pending.append(fragment)
            depth += fragment.count('(') - fragment.count(')')
            if depth > 0:
                continue
            yield current_epoch, ' '.join(pending)
            pending, depth, in_block = [], 0, False
            continue

        # The hinted form takes precedence over the bare form.
        start = GENO_HINT_PAT.search(raw) or GENO_RAW_PAT.search(raw)
        if start is None:
            continue

        head = start.group(1).strip()
        depth = head.count('(') - head.count(')')
        if depth > 0:
            pending = [head]
            in_block = True
        else:
            yield current_epoch, head
            depth = 0

    # Input ended while a genotype was still open: emit what was collected.
    if in_block and pending:
        yield current_epoch, ' '.join(pending)


def extract_unique_genotypes(
    path: str,
    first_per_epoch_only: bool = False
) -> Tuple[List[Tuple[Optional[int], str]], Counter]:
    """
    Read the log at *path* and extract unique genotypes in first-appearance order.

    The file is decoded as UTF-8 with undecodable bytes dropped, and is streamed
    line by line rather than loaded into memory as a whole.

    Args:
      path: log file to read.
      first_per_epoch_only: if True, only the first genotype encountered for
        each known epoch is considered; later genotypes of that epoch are
        skipped. Genotypes whose epoch is unknown (None) are never skipped.
        Because results are also de-duplicated by genotype text, an epoch whose
        first genotype repeats an earlier one adds no new entry.

    Returns:
      unique_items: list of tuples (epoch, genotype_str), where epoch is the one
        recorded at the genotype's first kept occurrence
      counts: Counter with raw occurrence frequencies of every genotype found
        (counted before any filtering or dedup)
    """
    counts: Counter = Counter()
    seen = OrderedDict()  # key=genotype_str, val=(epoch, genotype_str)
    epochs_done = set()   # epochs that have already contributed their first genotype

    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        # The parser accepts any iterable of lines, so pass the file handle directly.
        for epoch, geno in _iter_genotype_blocks(f):
            counts[geno] += 1

            if first_per_epoch_only and epoch is not None:
                if epoch in epochs_done:
                    continue
                epochs_done.add(epoch)

            if geno not in seen:
                seen[geno] = (epoch, geno)

    return list(seen.values()), counts


def run():
    """Extract genotypes from INPUT_LOG_PATH and write the report to OUTPUT_PATH.

    Each unique genotype is written on its own line, tagged with a 1-based
    index and, when PREFIX_EPOCH is set and the epoch is known, its epoch.
    When WITH_COUNT is set, a frequency table (most common first) follows.
    A short summary is printed once the file has been written.
    """
    entries, freq = extract_unique_genotypes(
        INPUT_LOG_PATH,
        first_per_epoch_only=FIRST_PER_EPOCH_ONLY,
    )

    # Build the whole report first, then write it in one call.
    rendered = []
    for idx, (ep, genotype) in enumerate(entries, start=1):
        show_epoch = PREFIX_EPOCH and ep is not None
        tag = f"[{idx}] [epoch={ep}] " if show_epoch else f"[{idx}] "
        rendered.append(f"{tag}{genotype}\n")

    if WITH_COUNT:
        rendered.append("\n# Frequency (including duplicates):\n")
        rendered.extend(
            f"{c}  -  {genotype}\n" for genotype, c in freq.most_common()
        )

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        out.writelines(rendered)

    print(f"Done: {len(entries)} unique genotypes written -> {OUTPUT_PATH}")


# Run the extraction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()
