model,question,room_type,score
gpt-5-2025-08-07,Pair Distance,kitchens,99.83
gpt-5-2025-08-07,Pair Distance,living_rooms,98.83
gpt-5-2025-08-07,Pair Distance,bedrooms,98.33
gpt-5-2025-08-07,Pair Distance,hssd_simple,69.0
gpt-5-2025-08-07,Placement,kitchens,84.67
gpt-5-2025-08-07,Placement,living_rooms,75.5
gpt-5-2025-08-07,Placement,bedrooms,61.17
gpt-5-2025-08-07,Placement,hssd_simple,70.0
gpt-5-2025-08-07,Repositioning,kitchens,83.0
gpt-5-2025-08-07,Repositioning,living_rooms,85.5
gpt-5-2025-08-07,Repositioning,bedrooms,77.83
gpt-5-2025-08-07,Repositioning,hssd_simple,49.5
gpt-5-2025-08-07,Free Space,kitchens,82.5
gpt-5-2025-08-07,Free Space,living_rooms,47.0
gpt-5-2025-08-07,Free Space,bedrooms,50.5
gpt-5-2025-08-07,Free Space,hssd_simple,19.5
gpt-5-2025-08-07,Visibility,kitchens,94.83
gpt-5-2025-08-07,Visibility,living_rooms,95.17
gpt-5-2025-08-07,Visibility,bedrooms,94.17
gpt-5-2025-08-07,Visibility,hssd_simple,57.0
gpt-5-2025-08-07,View Angle,kitchens,96.17
gpt-5-2025-08-07,View Angle,living_rooms,93.33
gpt-5-2025-08-07,View Angle,bedrooms,95.17
gpt-5-2025-08-07,View Angle,hssd_simple,59.5
gpt-5-2025-08-07,Max Box,kitchens,48.5
gpt-5-2025-08-07,Max Box,living_rooms,17.33
gpt-5-2025-08-07,Max Box,bedrooms,13.0
gpt-5-2025-08-07,Max Box,hssd_simple,5.0
gpt-5-2025-08-07,Shortest Path (Valid),kitchens,64.67
gpt-5-2025-08-07,Shortest Path (Valid),living_rooms,21.17
gpt-5-2025-08-07,Shortest Path (Valid),bedrooms,58.17
gpt-5-2025-08-07,Shortest Path (Valid),hssd_simple,28.5
gpt-5-2025-08-07,Shortest Path (Frechet),kitchens,56.33
gpt-5-2025-08-07,Shortest Path (Frechet),living_rooms,12.33
gpt-5-2025-08-07,Shortest Path (Frechet),bedrooms,39.33
gpt-5-2025-08-07,Shortest Path (Frechet),hssd_simple,22.5
gpt-oss-120b,Pair Distance,kitchens,98.0
gpt-oss-120b,Pair Distance,living_rooms,99.33
gpt-oss-120b,Pair Distance,bedrooms,99.5
gpt-oss-120b,Pair Distance,hssd_simple,78.5
gpt-oss-120b,Placement,kitchens,92.0
gpt-oss-120b,Placement,living_rooms,89.0
gpt-oss-120b,Placement,bedrooms,83.5
gpt-oss-120b,Placement,hssd_simple,85.0
gpt-oss-120b,Repositioning,kitchens,85.5
gpt-oss-120b,Repositioning,living_rooms,89.83
gpt-oss-120b,Repositioning,bedrooms,83.33
gpt-oss-120b,Repositioning,hssd_simple,60.5
gpt-oss-120b,Free Space,kitchens,99.0
gpt-oss-120b,Free Space,living_rooms,83.33
gpt-oss-120b,Free Space,bedrooms,87.5
gpt-oss-120b,Free Space,hssd_simple,31.0
gpt-oss-120b,Visibility,kitchens,94.17
gpt-oss-120b,Visibility,living_rooms,94.0
gpt-oss-120b,Visibility,bedrooms,92.5
gpt-oss-120b,Visibility,hssd_simple,70.0
gpt-oss-120b,View Angle,kitchens,98.5
gpt-oss-120b,View Angle,living_rooms,97.33
gpt-oss-120b,View Angle,bedrooms,98.17
gpt-oss-120b,View Angle,hssd_simple,74.0
gpt-oss-120b,Max Box,kitchens,62.83
gpt-oss-120b,Max Box,living_rooms,28.17
gpt-oss-120b,Max Box,bedrooms,30.33
gpt-oss-120b,Max Box,hssd_simple,9.5
gpt-oss-120b,Shortest Path (Valid),kitchens,64.17
gpt-oss-120b,Shortest Path (Valid),living_rooms,66.0
gpt-oss-120b,Shortest Path (Valid),bedrooms,57.83
gpt-oss-120b,Shortest Path (Valid),hssd_simple,33.5
gpt-oss-120b,Shortest Path (Frechet),kitchens,55.5
gpt-oss-120b,Shortest Path (Frechet),living_rooms,40.5
gpt-oss-120b,Shortest Path (Frechet),bedrooms,44.83
gpt-oss-120b,Shortest Path (Frechet),hssd_simple,26.5
DeepSeek-R1-0528,Pair Distance,kitchens,98.0
DeepSeek-R1-0528,Pair Distance,living_rooms,99.0
DeepSeek-R1-0528,Pair Distance,bedrooms,96.83
DeepSeek-R1-0528,Pair Distance,hssd_simple,25.5
DeepSeek-R1-0528,Placement,kitchens,89.0
DeepSeek-R1-0528,Placement,living_rooms,82.17
DeepSeek-R1-0528,Placement,bedrooms,72.0
DeepSeek-R1-0528,Placement,hssd_simple,79.0
DeepSeek-R1-0528,Repositioning,kitchens,79.17
DeepSeek-R1-0528,Repositioning,living_rooms,86.17
DeepSeek-R1-0528,Repositioning,bedrooms,83.33
DeepSeek-R1-0528,Repositioning,hssd_simple,47.5
DeepSeek-R1-0528,Free Space,kitchens,93.0
DeepSeek-R1-0528,Free Space,living_rooms,18.33
DeepSeek-R1-0528,Free Space,bedrooms,34.83
DeepSeek-R1-0528,Free Space,hssd_simple,6.5
DeepSeek-R1-0528,Visibility,kitchens,71.33
DeepSeek-R1-0528,Visibility,living_rooms,52.0
DeepSeek-R1-0528,Visibility,bedrooms,53.5
DeepSeek-R1-0528,Visibility,hssd_simple,10.0
DeepSeek-R1-0528,View Angle,kitchens,73.67
DeepSeek-R1-0528,View Angle,living_rooms,68.17
DeepSeek-R1-0528,View Angle,bedrooms,75.17
DeepSeek-R1-0528,View Angle,hssd_simple,13.5
DeepSeek-R1-0528,Max Box,kitchens,50.5
DeepSeek-R1-0528,Max Box,living_rooms,8.0
DeepSeek-R1-0528,Max Box,bedrooms,11.0
DeepSeek-R1-0528,Max Box,hssd_simple,2.5
DeepSeek-R1-0528,Shortest Path (Valid),kitchens,12.33
DeepSeek-R1-0528,Shortest Path (Valid),living_rooms,12.5
DeepSeek-R1-0528,Shortest Path (Valid),bedrooms,7.83
DeepSeek-R1-0528,Shortest Path (Valid),hssd_simple,8.5
DeepSeek-R1-0528,Shortest Path (Frechet),kitchens,11.5
DeepSeek-R1-0528,Shortest Path (Frechet),living_rooms,9.33
DeepSeek-R1-0528,Shortest Path (Frechet),bedrooms,6.0
DeepSeek-R1-0528,Shortest Path (Frechet),hssd_simple,2.5
gpt-5-mini-2025-08-07,Pair Distance,kitchens,100.0
gpt-5-mini-2025-08-07,Pair Distance,living_rooms,99.67
gpt-5-mini-2025-08-07,Pair Distance,bedrooms,99.67
gpt-5-mini-2025-08-07,Pair Distance,hssd_simple,32.5
gpt-5-mini-2025-08-07,Placement,kitchens,90.83
gpt-5-mini-2025-08-07,Placement,living_rooms,86.33
gpt-5-mini-2025-08-07,Placement,bedrooms,81.17
gpt-5-mini-2025-08-07,Placement,hssd_simple,75.5
gpt-5-mini-2025-08-07,Repositioning,kitchens,84.5
gpt-5-mini-2025-08-07,Repositioning,living_rooms,92.83
gpt-5-mini-2025-08-07,Repositioning,bedrooms,84.33
gpt-5-mini-2025-08-07,Repositioning,hssd_simple,53.5
gpt-5-mini-2025-08-07,Free Space,kitchens,99.5
gpt-5-mini-2025-08-07,Free Space,living_rooms,78.5
gpt-5-mini-2025-08-07,Free Space,bedrooms,82.17
gpt-5-mini-2025-08-07,Free Space,hssd_simple,5.0
gpt-5-mini-2025-08-07,Visibility,kitchens,98.0
gpt-5-mini-2025-08-07,Visibility,living_rooms,98.0
gpt-5-mini-2025-08-07,Visibility,bedrooms,95.5
gpt-5-mini-2025-08-07,Visibility,hssd_simple,39.0
gpt-5-mini-2025-08-07,View Angle,kitchens,88.33
gpt-5-mini-2025-08-07,View Angle,living_rooms,84.5
gpt-5-mini-2025-08-07,View Angle,bedrooms,86.83
gpt-5-mini-2025-08-07,View Angle,hssd_simple,25.5
gpt-5-mini-2025-08-07,Max Box,kitchens,85.17
gpt-5-mini-2025-08-07,Max Box,living_rooms,60.33
gpt-5-mini-2025-08-07,Max Box,bedrooms,61.17
gpt-5-mini-2025-08-07,Max Box,hssd_simple,17.0
gpt-5-mini-2025-08-07,Shortest Path (Valid),kitchens,52.33
gpt-5-mini-2025-08-07,Shortest Path (Valid),living_rooms,53.67
gpt-5-mini-2025-08-07,Shortest Path (Valid),bedrooms,52.0
gpt-5-mini-2025-08-07,Shortest Path (Valid),hssd_simple,16.5
gpt-5-mini-2025-08-07,Shortest Path (Frechet),kitchens,50.5
gpt-5-mini-2025-08-07,Shortest Path (Frechet),living_rooms,47.5
gpt-5-mini-2025-08-07,Shortest Path (Frechet),bedrooms,47.67
gpt-5-mini-2025-08-07,Shortest Path (Frechet),hssd_simple,12.0
Gemini Flash 2.5,Pair Distance,kitchens,96.33
Gemini Flash 2.5,Pair Distance,living_rooms,96.0
Gemini Flash 2.5,Pair Distance,bedrooms,95.5
Gemini Flash 2.5,Pair Distance,hssd_simple,12.5
Gemini Flash 2.5,Placement,kitchens,59.67
Gemini Flash 2.5,Placement,living_rooms,53.33
Gemini Flash 2.5,Placement,bedrooms,35.83
Gemini Flash 2.5,Placement,hssd_simple,16.0
Gemini Flash 2.5,Repositioning,kitchens,90.5
Gemini Flash 2.5,Repositioning,living_rooms,91.17
Gemini Flash 2.5,Repositioning,bedrooms,85.17
Gemini Flash 2.5,Repositioning,hssd_simple,18.5
Gemini Flash 2.5,Free Space,kitchens,93.33
Gemini Flash 2.5,Free Space,living_rooms,17.67
Gemini Flash 2.5,Free Space,bedrooms,33.33
Gemini Flash 2.5,Free Space,hssd_simple,1.0
Gemini Flash 2.5,Visibility,kitchens,26.83
Gemini Flash 2.5,Visibility,living_rooms,11.33
Gemini Flash 2.5,Visibility,bedrooms,11.17
Gemini Flash 2.5,Visibility,hssd_simple,0.5
Gemini Flash 2.5,View Angle,kitchens,92.5
Gemini Flash 2.5,View Angle,living_rooms,91.5
Gemini Flash 2.5,View Angle,bedrooms,93.83
Gemini Flash 2.5,View Angle,hssd_simple,20.0
Gemini Flash 2.5,Max Box,kitchens,3.67
Gemini Flash 2.5,Max Box,living_rooms,0.0
Gemini Flash 2.5,Max Box,bedrooms,0.0
Gemini Flash 2.5,Max Box,hssd_simple,0.0
Gemini Flash 2.5,Shortest Path (Valid),kitchens,3.17
Gemini Flash 2.5,Shortest Path (Valid),living_rooms,1.5
Gemini Flash 2.5,Shortest Path (Valid),bedrooms,1.0
Gemini Flash 2.5,Shortest Path (Valid),hssd_simple,0.0
Gemini Flash 2.5,Shortest Path (Frechet),kitchens,3.17
Gemini Flash 2.5,Shortest Path (Frechet),living_rooms,1.33
Gemini Flash 2.5,Shortest Path (Frechet),bedrooms,0.83
Gemini Flash 2.5,Shortest Path (Frechet),hssd_simple,0.0
gpt-oss-20b,Pair Distance,kitchens,94.17
gpt-oss-20b,Pair Distance,living_rooms,93.5
gpt-oss-20b,Pair Distance,bedrooms,93.83
gpt-oss-20b,Pair Distance,hssd_simple,40.5
gpt-oss-20b,Placement,kitchens,85.67
gpt-oss-20b,Placement,living_rooms,78.5
gpt-oss-20b,Placement,bedrooms,62.0
gpt-oss-20b,Placement,hssd_simple,74.5
gpt-oss-20b,Repositioning,kitchens,70.83
gpt-oss-20b,Repositioning,living_rooms,87.83
gpt-oss-20b,Repositioning,bedrooms,78.0
gpt-oss-20b,Repositioning,hssd_simple,40.0
gpt-oss-20b,Free Space,kitchens,94.83
gpt-oss-20b,Free Space,living_rooms,53.17
gpt-oss-20b,Free Space,bedrooms,74.0
gpt-oss-20b,Free Space,hssd_simple,9.0
gpt-oss-20b,Visibility,kitchens,91.5
gpt-oss-20b,Visibility,living_rooms,89.33
gpt-oss-20b,Visibility,bedrooms,89.17
gpt-oss-20b,Visibility,hssd_simple,45.5
gpt-oss-20b,View Angle,kitchens,93.5
gpt-oss-20b,View Angle,living_rooms,92.0
gpt-oss-20b,View Angle,bedrooms,91.33
gpt-oss-20b,View Angle,hssd_simple,37.5
gpt-oss-20b,Max Box,kitchens,31.33
gpt-oss-20b,Max Box,living_rooms,3.83
gpt-oss-20b,Max Box,bedrooms,5.83
gpt-oss-20b,Max Box,hssd_simple,0.5
gpt-oss-20b,Shortest Path (Valid),kitchens,43.0
gpt-oss-20b,Shortest Path (Valid),living_rooms,37.67
gpt-oss-20b,Shortest Path (Valid),bedrooms,30.0
gpt-oss-20b,Shortest Path (Valid),hssd_simple,13.5
gpt-oss-20b,Shortest Path (Frechet),kitchens,42.17
gpt-oss-20b,Shortest Path (Frechet),living_rooms,28.5
gpt-oss-20b,Shortest Path (Frechet),bedrooms,24.17
gpt-oss-20b,Shortest Path (Frechet),hssd_simple,14.0
Qwen3-30B-A3B-Thinking-2507,Pair Distance,kitchens,97.67
Qwen3-30B-A3B-Thinking-2507,Pair Distance,living_rooms,97.5
Qwen3-30B-A3B-Thinking-2507,Pair Distance,bedrooms,95.33
Qwen3-30B-A3B-Thinking-2507,Pair Distance,hssd_simple,18.0
Qwen3-30B-A3B-Thinking-2507,Placement,kitchens,68.17
Qwen3-30B-A3B-Thinking-2507,Placement,living_rooms,46.5
Qwen3-30B-A3B-Thinking-2507,Placement,bedrooms,34.5
Qwen3-30B-A3B-Thinking-2507,Placement,hssd_simple,28.5
Qwen3-30B-A3B-Thinking-2507,Repositioning,kitchens,61.5
Qwen3-30B-A3B-Thinking-2507,Repositioning,living_rooms,77.0
Qwen3-30B-A3B-Thinking-2507,Repositioning,bedrooms,69.17
Qwen3-30B-A3B-Thinking-2507,Repositioning,hssd_simple,27.0
Qwen3-30B-A3B-Thinking-2507,Free Space,kitchens,97.0
Qwen3-30B-A3B-Thinking-2507,Free Space,living_rooms,0.0
Qwen3-30B-A3B-Thinking-2507,Free Space,bedrooms,1.17
Qwen3-30B-A3B-Thinking-2507,Free Space,hssd_simple,1.0
Qwen3-30B-A3B-Thinking-2507,Visibility,kitchens,78.83
Qwen3-30B-A3B-Thinking-2507,Visibility,living_rooms,70.83
Qwen3-30B-A3B-Thinking-2507,Visibility,bedrooms,64.17
Qwen3-30B-A3B-Thinking-2507,Visibility,hssd_simple,3.0
Qwen3-30B-A3B-Thinking-2507,View Angle,kitchens,98.5
Qwen3-30B-A3B-Thinking-2507,View Angle,living_rooms,98.33
Qwen3-30B-A3B-Thinking-2507,View Angle,bedrooms,98.33
Qwen3-30B-A3B-Thinking-2507,View Angle,hssd_simple,26.0
Qwen3-30B-A3B-Thinking-2507,Max Box,kitchens,16.67
Qwen3-30B-A3B-Thinking-2507,Max Box,living_rooms,0.83
Qwen3-30B-A3B-Thinking-2507,Max Box,bedrooms,0.5
Qwen3-30B-A3B-Thinking-2507,Max Box,hssd_simple,0.0
Qwen3-30B-A3B-Thinking-2507,Shortest Path (Valid),kitchens,14.17
Qwen3-30B-A3B-Thinking-2507,Shortest Path (Valid),living_rooms,16.67
Qwen3-30B-A3B-Thinking-2507,Shortest Path (Valid),bedrooms,14.33
Qwen3-30B-A3B-Thinking-2507,Shortest Path (Valid),hssd_simple,1.0
Qwen3-30B-A3B-Thinking-2507,Shortest Path (Frechet),kitchens,14.0
Qwen3-30B-A3B-Thinking-2507,Shortest Path (Frechet),living_rooms,16.33
Qwen3-30B-A3B-Thinking-2507,Shortest Path (Frechet),bedrooms,14.33
Qwen3-30B-A3B-Thinking-2507,Shortest Path (Frechet),hssd_simple,1.0
