import pandas as pd
import re
from sklearn.metrics import mean_absolute_error

# Path configuration (adjust to match the local layout).
pred_csv_path = 'tier_3_label_results.csv'
label_csv_path = 'baseline/property.csv'

# Material identifiers look like '2dm-2994' and are embedded in the structure
# file name, e.g. 'orthogonal_2dm-2994_supercell_16x16x1.cif'.
MATERIAL_ID_PATTERN = re.compile(r'2dm-\d+')


def extract_material_id(file_name):
    """Return the first '2dm-<digits>' token found in *file_name*.

    Returns None when *file_name* is not a string (e.g. a NaN cell read from
    the CSV) or contains no such token, instead of raising.
    """
    if not isinstance(file_name, str):
        return None
    found = MATERIAL_ID_PATTERN.search(file_name)
    return found.group() if found else None


def with_material_ids(pred_df):
    """Return a copy of *pred_df* with a 'material_id' column derived from 'file'.

    Rows whose 'file' value yields no material id are dropped. The input
    frame is left unmodified.
    """
    material_ids = pred_df['file'].map(extract_material_id)
    return pred_df.assign(material_id=material_ids).dropna(subset=['material_id'])


def match_predictions(pred_df, label_df):
    """Inner-join predictions with reference labels on 'material_id'.

    Only the 'material_id' and 'energy_per_atom' columns of *label_df* take
    part in the join. When both frames carry an 'energy_per_atom' column the
    result exposes them as 'energy_per_atom_pred' and 'energy_per_atom_true'.
    NOTE(review): the suffixes are only applied on a name clash, so this
    assumes the prediction CSV has its own 'energy_per_atom' column - confirm.
    """
    return pd.merge(
        pred_df,
        label_df[['material_id', 'energy_per_atom']],
        on='material_id',
        how='inner',
        suffixes=('_pred', '_true'),
    )


def main():
    """Read both CSV files, match rows by material id and print the MAE."""
    import sys

    # Load the predictions and derive the material id of every row.
    pred_df = pd.read_csv(pred_csv_path)
    identified_df = with_material_ids(pred_df)
    skipped = len(pred_df) - len(identified_df)
    if skipped:
        # Report rather than crash: such rows cannot be matched to any label.
        print(
            f"Warning: skipped {skipped} prediction row(s) without a "
            f"'2dm-<digits>' material id in 'file'",
            file=sys.stderr,
        )

    # Load the reference labels and keep only the materials present in both.
    label_df = pd.read_csv(label_csv_path)
    matched_df = match_predictions(identified_df, label_df)

    if matched_df.empty:
        # mean_absolute_error rejects empty input; fail with the real cause.
        raise SystemExit(
            f"No prediction in {pred_csv_path!r} matches a material_id in "
            f"{label_csv_path!r}; cannot compute MAE."
        )

    # Mean absolute error between reference and predicted energies.
    mae = mean_absolute_error(
        matched_df['energy_per_atom_true'],
        matched_df['energy_per_atom_pred'],
    )

    # Report the results.
    print(f"Matched samples: {len(matched_df)}")
    print(f"MAE of energy_per_atom: {mae:.6f} eV/atom")


if __name__ == "__main__":
    main()
