import os
import re
import sys

def extract_dataset_sizes(line):
    """Extract the total and correct dataset sizes from one log line.

    The line must contain ``size of dataset: <N>`` followed by
    ``size of correct dataset: <M>``. The two fields may be directly
    adjacent (as produced when they are printed without a separator) or
    separated by whitespace, commas, or semicolons.

    Args:
        line: A single line of log text.

    Returns:
        A ``(size_of_dataset, size_of_correct_dataset)`` tuple of ints, or
        ``(None, None)`` when the line does not contain both fields.
    """
    match = re.search(
        r'size of dataset:\s*(\d+)[\s,;]*size of correct dataset:\s*(\d+)',
        line,
    )
    if match is None:
        return None, None
    total, correct = match.groups()
    return int(total), int(correct)

def extract_languages(filename):
    """Parse the source, latent, and target language codes out of a log filename.

    The name is expected to contain ``<src>_<latent>_<tgt>`` immediately
    before ``.log``, optionally followed by a two-digit ``_NN`` suffix.
    The source and target codes are two lowercase letters; the latent code
    is one or more lowercase letters.

    Returns:
        A ``(source, latent, target)`` tuple of strings, or
        ``(None, None, None)`` when the filename does not fit the pattern.
    """
    pattern = r'.*([a-z]{2})_([a-z]+)_([a-z]{2})(?:_\d{2})?\.log'
    found = re.match(pattern, filename)
    # groups() yields exactly the three capturing groups, in order.
    return found.groups() if found else (None, None, None)

def process_folder(folder_path):
    """Collect dataset sizes from the ``.log`` files in a folder and print a LaTeX table.

    For every regular file in ``folder_path`` whose name ends in ``.log``
    and encodes a (source, latent, target) language triple, the first line
    reporting dataset sizes is recorded. If several files map to the same
    language triple, the first one (in sorted filename order) that contains
    a size line wins. The collected entries are then printed with
    ``print_latex_table``.

    Args:
        folder_path: Directory to scan (not recursive).
    """
    table_entries = {}
    # os.listdir returns names in arbitrary order; sorting makes both the row
    # order and the "first file wins" duplicate rule reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.log'):
            continue
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue

        # The languages depend only on the filename, so resolve them before
        # opening the file and skip files that cannot contribute a row.
        key = extract_languages(filename)
        if not all(key) or key in table_entries:
            continue

        # Only ASCII text is matched, so undecodable bytes elsewhere in a log
        # are replaced instead of aborting the whole run.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            for line in file:
                size_of_dataset, size_of_correct_dataset = extract_dataset_sizes(line)
                if size_of_dataset is not None and size_of_correct_dataset is not None:
                    table_entries[key] = (size_of_dataset, size_of_correct_dataset)
                    # Only the first size line of each file is used.
                    break

    print_latex_table(table_entries)

def print_latex_table(entries):
    """Print the LaTeX code for a booktabs-style table of dataset sizes.

    Args:
        entries: Mapping from ``(source_lang, latent_lang, target_lang)`` to
            ``(size_of_dataset, size_of_correct_dataset)``. One table row is
            printed per item, in the mapping's iteration order.

    The output contains no blank lines: inside ``tabular`` a blank line
    between a row and ``\\bottomrule`` starts a paragraph in a new cell,
    which makes LaTeX fail with "Misplaced \\noalign".
    """
    header = (
        r"\begin{table}[t]",
        r"\caption{Dataset sizes.}",
        r"\label{dataset-sizes-table}",
        r"\vskip 0.15in",
        r"\begin{center}",
        r"\begin{small}",
        r"\begin{sc}",
        r"\begin{tabular}{lcccc}",
        r"\toprule",
        r"Source Language & Latent Language & Target Language & Size of Dataset & Size of Correct Dataset \\",
        r"\midrule",
    )
    rows = [
        f"{source_lang} & {latent_lang} & {target_lang} & {size_of_dataset} & {size_of_correct_dataset} \\\\"
        for (source_lang, latent_lang, target_lang), (size_of_dataset, size_of_correct_dataset) in entries.items()
    ]
    footer = (
        r"\bottomrule",
        r"\end{tabular}",
        r"\end{sc}",
        r"\end{small}",
        r"\end{center}",
        r"\vskip -0.1in",
        r"\end{table}",
    )
    print("\n".join((*header, *rows, *footer)))

if __name__ == "__main__":
    # Exactly one argument is required: the folder containing the .log files.
    if len(sys.argv) != 2:
        # Usage goes to stderr so it cannot end up in a redirected LaTeX
        # output file; the exit status stays 1.
        print("Usage: python extract_dataset_sizes_latex.py <folder_path>", file=sys.stderr)
        sys.exit(1)

    folder_path = sys.argv[1]
    process_folder(folder_path)
