import os

# The mirror endpoint has to be set before sentence_transformers is imported:
# that import pulls in huggingface_hub, which reads HF_ENDPOINT once at import
# time, so assigning it after the imports would have no effect.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

import re
import json
import numpy as np
from pylatexenc.latex2text import LatexNodes2Text
from sentence_transformers import SentenceTransformer

# NOTE(review): no model name is passed here — confirm which checkpoint is
# intended before relying on the embeddings.
model = SentenceTransformer()
# Index-aligned accumulators filled by the directory walk further down:
# one embedding per converted article, and the path it was read from.
embeddings = []
file_refs = []

# 2. Read tex file content
def read_tex_file(tex_path):
    """Return the whole content of the file at ``tex_path`` as a string.

    The file is decoded as UTF-8; bytes that are not valid UTF-8 are
    dropped instead of raising a decoding error.
    """
    with open(tex_path, mode='r', encoding='utf-8', errors='ignore') as source:
        text = source.read()
    return text


# 3. Preprocess tex content, remove comments, imports, citations and extra whitespace
def preprocess_tex(tex_content):
    """Strip non-prose LaTeX boilerplate from ``tex_content``.

    Steps, in order:
      1. Remove comments (an unescaped ``%`` through the end of its line).
      2. Cut the text at the bibliography (``\\begin{thebibliography}`` or
         ``\\bibliography{...}``) and drop everything after it.
      3. Remove preamble/import-like commands together with the rest of
         their line.
      4. Collapse every run of whitespace to a single space.

    Args:
        tex_content: Raw LaTeX source as a string.

    Returns:
        The cleaned string. Leading/trailing single spaces left by the
        whitespace collapse are not stripped.
    """
    # A literal percent sign is written "\%" in LaTeX and must survive.
    # The optional group keeps an even run of backslashes (e.g. the line
    # break "\\") that directly precedes a real comment.
    tex_content = re.sub(r'(?<!\\)((?:\\\\)*)%.*', r'\1', tex_content)
    # Truncate at the bibliography first. This has to happen before the
    # command removal below, which deletes "\bibliography{...}" itself and
    # would otherwise leave nothing to split on.
    tex_content = re.split(r'\\begin\{thebibliography\}', tex_content, maxsplit=1)[0]
    tex_content = re.split(r'\\bibliography\{', tex_content, maxsplit=1)[0]
    # Remove import related commands (such as \usepackage, \bibliographystyle,
    # \bibliography, \inputenc, \documentclass, etc.) up to the end of the line.
    tex_content = re.sub(r'\\(usepackage|bibliographystyle|bibliography|inputenc|documentclass|IEEEoverridecommandlockouts|makeatletter)[^\n]*', '', tex_content)
    # Merge extra whitespace
    tex_content = re.sub(r'\s+', ' ', tex_content)
    return tex_content

# Roughly strip macros whose arguments are noise for the embedding (links,
# citation keys, labels, footnotes): the command name, anything attached to it
# up to the first "{", and that first {...} argument. Nested braces are not
# handled; this is intentionally approximate.
_NOISY_MACRO_RE = re.compile(r'\\(href|url|cite|ref|footnote)[^\s{]*\{[^}]*\}')

for root, dirs, files in os.walk('single_tex'):
    # os.walk yields entries in filesystem order; sorting in place makes the
    # row order of the saved embeddings reproducible between runs.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith('.tex'):
            continue
        tex_path = os.path.join(root, file)
        tex_content = preprocess_tex(read_tex_file(tex_path))
        tex_content = _NOISY_MACRO_RE.sub('', tex_content)
        try:
            plain_text = LatexNodes2Text().latex_to_text(tex_content)
        except Exception as e:
            # Best effort: a file that cannot be converted is reported and
            # skipped so one bad document does not abort the whole run.
            print(f"[WARN] {tex_path} text conversion failed: {e}")
            continue
        emb = model.encode(plain_text, normalize_embeddings=True)
        # Keep both lists index-aligned: row i of the matrix is file_refs[i].
        embeddings.append(emb)
        file_refs.append(tex_path)

# np.vstack raises an opaque ValueError on an empty list; fail with a message
# that says what actually went wrong instead.
if not embeddings:
    raise SystemExit("No .tex file under 'single_tex' could be embedded; nothing was saved.")

embeddings = np.vstack(embeddings)
np.save('article_embeddings.npy', embeddings)
with open('article_refs.json', 'w', encoding='utf-8') as f:
    json.dump(file_refs, f, ensure_ascii=False, indent=2)