import os
import warnings
from concurrent.futures import ProcessPoolExecutor

import pandas as pd
from jarvis.core.atoms import Atoms
from pymatgen.core import Structure
from tqdm import tqdm

# Read the summary table and collect the identifier of every listed material.
df = pd.read_csv(filepath_or_buffer="gnome_data/stable_materials_summary.csv")
materialiid = df["MaterialId"].to_list()

# Make sure the output directory for the converted structures exists.
os.makedirs(name="pre_training/pretraining_data", exist_ok=True)

def load_structure(mat_id):
    """Load the CIF file for ``mat_id`` and apply a size filter.

    The file is read from ``gnome_data/by_id/<mat_id>.CIF``.

    Returns the parsed structure when its length is between 2 and 40
    (inclusive). Returns ``None`` when the structure is outside that range,
    or when loading raises any exception (the failure is printed rather
    than propagated, so one bad file does not abort a bulk load).
    """
    try:
        parsed = Structure.from_file(f"gnome_data/by_id/{mat_id}.CIF")
        # Keep only structures that are neither too small nor too large.
        n_entries = len(parsed)
        return parsed if 2 <= n_entries <= 40 else None
    except Exception as err:
        print(f"Failed to load {mat_id}: {err}")
        return None
    
    
if __name__ == "__main__":
    # Guarded so that worker processes, which may re-import this script
    # (e.g. under the "spawn" start method), do not re-run the pipeline.

    # Load every CIF in parallel. ``pool.map`` yields results in the same
    # order as ``materialiid``; chunking keeps inter-process overhead low
    # for long id lists.
    with ProcessPoolExecutor() as pool:
        structures_ = list(
            tqdm(
                pool.map(load_structure, materialiid, chunksize=64),
                total=len(materialiid),
            )
        )
    # ``load_structure`` returns None for filtered-out or unreadable entries.
    structures = [s for s in structures_ if s is not None]

    lst_ = []
    for i, s in enumerate(tqdm(structures)):
        # The ID recorded in the CSV stays relative to ``pre_training/``
        # (the directory holding id_prop.csv), while the file itself is
        # written under ``pre_training/`` so that it lands in the
        # ``pre_training/pretraining_data`` directory created earlier.
        rel_path = f"pretraining_data/{i}.vasp"
        s.to(fmt="poscar", filename=f"pre_training/{rel_path}")
        lst_.append({"ID": rel_path})

    # Explicit column so the CSV has an "ID" header even when nothing loaded.
    df = pd.DataFrame(lst_, columns=["ID"])
    df.to_csv("pre_training/id_prop.csv", index=False)