model,question,room_type,score
gpt-5-2025-08-07,Pair Distance,kitchens,0.0
gpt-5-2025-08-07,Pair Distance,living_rooms,0.0
gpt-5-2025-08-07,Pair Distance,bedrooms,0.0
gpt-5-2025-08-07,Pair Distance,hssd_simple,0.0
gpt-5-2025-08-07,Placement,kitchens,13.17
gpt-5-2025-08-07,Placement,living_rooms,22.67
gpt-5-2025-08-07,Placement,bedrooms,36.17
gpt-5-2025-08-07,Placement,hssd_simple,27.0
gpt-5-2025-08-07,Repositioning,kitchens,0.33
gpt-5-2025-08-07,Repositioning,living_rooms,0.0
gpt-5-2025-08-07,Repositioning,bedrooms,0.17
gpt-5-2025-08-07,Repositioning,hssd_simple,2.0
gpt-5-2025-08-07,Free Space,kitchens,1.17
gpt-5-2025-08-07,Free Space,living_rooms,0.17
gpt-5-2025-08-07,Free Space,bedrooms,0.17
gpt-5-2025-08-07,Free Space,hssd_simple,5.0
gpt-5-2025-08-07,Visibility,kitchens,0.0
gpt-5-2025-08-07,Visibility,living_rooms,0.0
gpt-5-2025-08-07,Visibility,bedrooms,0.0
gpt-5-2025-08-07,Visibility,hssd_simple,0.0
gpt-5-2025-08-07,View Angle,kitchens,0.0
gpt-5-2025-08-07,View Angle,living_rooms,0.0
gpt-5-2025-08-07,View Angle,bedrooms,0.0
gpt-5-2025-08-07,View Angle,hssd_simple,0.0
gpt-5-2025-08-07,Max Box,kitchens,0.0
gpt-5-2025-08-07,Max Box,living_rooms,0.0
gpt-5-2025-08-07,Max Box,bedrooms,0.0
gpt-5-2025-08-07,Max Box,hssd_simple,0.0
gpt-5-2025-08-07,Shortest Path (Valid),kitchens,0.17
gpt-5-2025-08-07,Shortest Path (Valid),living_rooms,61.67
gpt-5-2025-08-07,Shortest Path (Valid),bedrooms,0.5
gpt-5-2025-08-07,Shortest Path (Valid),hssd_simple,0.0
gpt-oss-120b,Pair Distance,kitchens,0.0
gpt-oss-120b,Pair Distance,living_rooms,0.0
gpt-oss-120b,Pair Distance,bedrooms,0.0
gpt-oss-120b,Pair Distance,hssd_simple,11.0
gpt-oss-120b,Placement,kitchens,0.0
gpt-oss-120b,Placement,living_rooms,0.17
gpt-oss-120b,Placement,bedrooms,0.17
gpt-oss-120b,Placement,hssd_simple,0.0
gpt-oss-120b,Repositioning,kitchens,0.0
gpt-oss-120b,Repositioning,living_rooms,0.0
gpt-oss-120b,Repositioning,bedrooms,0.0
gpt-oss-120b,Repositioning,hssd_simple,0.5
gpt-oss-120b,Free Space,kitchens,0.0
gpt-oss-120b,Free Space,living_rooms,0.17
gpt-oss-120b,Free Space,bedrooms,0.0
gpt-oss-120b,Free Space,hssd_simple,28.5
gpt-oss-120b,Visibility,kitchens,1.5
gpt-oss-120b,Visibility,living_rooms,1.67
gpt-oss-120b,Visibility,bedrooms,1.17
gpt-oss-120b,Visibility,hssd_simple,5.0
gpt-oss-120b,View Angle,kitchens,0.0
gpt-oss-120b,View Angle,living_rooms,0.0
gpt-oss-120b,View Angle,bedrooms,0.0
gpt-oss-120b,View Angle,hssd_simple,7.5
gpt-oss-120b,Max Box,kitchens,0.0
gpt-oss-120b,Max Box,living_rooms,0.0
gpt-oss-120b,Max Box,bedrooms,0.0
gpt-oss-120b,Max Box,hssd_simple,0.0
gpt-oss-120b,Shortest Path (Valid),kitchens,0.5
gpt-oss-120b,Shortest Path (Valid),living_rooms,1.17
gpt-oss-120b,Shortest Path (Valid),bedrooms,0.5
gpt-oss-120b,Shortest Path (Valid),hssd_simple,3.0
DeepSeek-R1-0528,Pair Distance,kitchens,1.5
DeepSeek-R1-0528,Pair Distance,living_rooms,0.83
DeepSeek-R1-0528,Pair Distance,bedrooms,2.5
DeepSeek-R1-0528,Pair Distance,hssd_simple,70.0
DeepSeek-R1-0528,Placement,kitchens,5.17
DeepSeek-R1-0528,Placement,living_rooms,7.17
DeepSeek-R1-0528,Placement,bedrooms,8.17
DeepSeek-R1-0528,Placement,hssd_simple,6.0
DeepSeek-R1-0528,Repositioning,kitchens,8.33
DeepSeek-R1-0528,Repositioning,living_rooms,7.5
DeepSeek-R1-0528,Repositioning,bedrooms,3.17
DeepSeek-R1-0528,Repositioning,hssd_simple,33.5
DeepSeek-R1-0528,Free Space,kitchens,5.5
DeepSeek-R1-0528,Free Space,living_rooms,44.5
DeepSeek-R1-0528,Free Space,bedrooms,12.17
DeepSeek-R1-0528,Free Space,hssd_simple,79.5
DeepSeek-R1-0528,Visibility,kitchens,26.67
DeepSeek-R1-0528,Visibility,living_rooms,46.83
DeepSeek-R1-0528,Visibility,bedrooms,44.5
DeepSeek-R1-0528,Visibility,hssd_simple,87.0
DeepSeek-R1-0528,View Angle,kitchens,25.5
DeepSeek-R1-0528,View Angle,living_rooms,30.83
DeepSeek-R1-0528,View Angle,bedrooms,23.83
DeepSeek-R1-0528,View Angle,hssd_simple,84.5
DeepSeek-R1-0528,Max Box,kitchens,30.17
DeepSeek-R1-0528,Max Box,living_rooms,63.5
DeepSeek-R1-0528,Max Box,bedrooms,59.33
DeepSeek-R1-0528,Max Box,hssd_simple,58.5
DeepSeek-R1-0528,Shortest Path (Valid),kitchens,78.5
DeepSeek-R1-0528,Shortest Path (Valid),living_rooms,77.83
DeepSeek-R1-0528,Shortest Path (Valid),bedrooms,79.5
DeepSeek-R1-0528,Shortest Path (Valid),hssd_simple,82.0
gpt-5-mini-2025-08-07,Pair Distance,kitchens,0.0
gpt-5-mini-2025-08-07,Pair Distance,living_rooms,0.17
gpt-5-mini-2025-08-07,Pair Distance,bedrooms,0.17
gpt-5-mini-2025-08-07,Pair Distance,hssd_simple,67.0
gpt-5-mini-2025-08-07,Placement,kitchens,6.5
gpt-5-mini-2025-08-07,Placement,living_rooms,10.33
gpt-5-mini-2025-08-07,Placement,bedrooms,12.83
gpt-5-mini-2025-08-07,Placement,hssd_simple,16.5
gpt-5-mini-2025-08-07,Repositioning,kitchens,0.0
gpt-5-mini-2025-08-07,Repositioning,living_rooms,0.0
gpt-5-mini-2025-08-07,Repositioning,bedrooms,0.0
gpt-5-mini-2025-08-07,Repositioning,hssd_simple,28.5
gpt-5-mini-2025-08-07,Free Space,kitchens,0.0
gpt-5-mini-2025-08-07,Free Space,living_rooms,1.67
gpt-5-mini-2025-08-07,Free Space,bedrooms,1.0
gpt-5-mini-2025-08-07,Free Space,hssd_simple,85.0
gpt-5-mini-2025-08-07,Visibility,kitchens,0.0
gpt-5-mini-2025-08-07,Visibility,living_rooms,0.33
gpt-5-mini-2025-08-07,Visibility,bedrooms,1.5
gpt-5-mini-2025-08-07,Visibility,hssd_simple,56.5
gpt-5-mini-2025-08-07,View Angle,kitchens,11.17
gpt-5-mini-2025-08-07,View Angle,living_rooms,14.33
gpt-5-mini-2025-08-07,View Angle,bedrooms,12.67
gpt-5-mini-2025-08-07,View Angle,hssd_simple,73.0
gpt-5-mini-2025-08-07,Max Box,kitchens,2.0
gpt-5-mini-2025-08-07,Max Box,living_rooms,5.67
gpt-5-mini-2025-08-07,Max Box,bedrooms,4.67
gpt-5-mini-2025-08-07,Max Box,hssd_simple,22.5
gpt-5-mini-2025-08-07,Shortest Path (Valid),kitchens,23.5
gpt-5-mini-2025-08-07,Shortest Path (Valid),living_rooms,19.83
gpt-5-mini-2025-08-07,Shortest Path (Valid),bedrooms,17.5
gpt-5-mini-2025-08-07,Shortest Path (Valid),hssd_simple,65.5
Gemini Flash 2.5,Pair Distance,kitchens,3.33
Gemini Flash 2.5,Pair Distance,living_rooms,4.0
Gemini Flash 2.5,Pair Distance,bedrooms,4.33
Gemini Flash 2.5,Pair Distance,hssd_simple,86.5
Gemini Flash 2.5,Placement,kitchens,39.17
Gemini Flash 2.5,Placement,living_rooms,45.83
Gemini Flash 2.5,Placement,bedrooms,63.33
Gemini Flash 2.5,Placement,hssd_simple,84.0
Gemini Flash 2.5,Repositioning,kitchens,1.0
Gemini Flash 2.5,Repositioning,living_rooms,2.17
Gemini Flash 2.5,Repositioning,bedrooms,1.67
Gemini Flash 2.5,Repositioning,hssd_simple,76.0
Gemini Flash 2.5,Free Space,kitchens,2.0
Gemini Flash 2.5,Free Space,living_rooms,41.67
Gemini Flash 2.5,Free Space,bedrooms,30.67
Gemini Flash 2.5,Free Space,hssd_simple,96.5
Gemini Flash 2.5,Visibility,kitchens,72.83
Gemini Flash 2.5,Visibility,living_rooms,88.17
Gemini Flash 2.5,Visibility,bedrooms,88.33
Gemini Flash 2.5,Visibility,hssd_simple,99.5
Gemini Flash 2.5,View Angle,kitchens,5.5
Gemini Flash 2.5,View Angle,living_rooms,5.0
Gemini Flash 2.5,View Angle,bedrooms,3.67
Gemini Flash 2.5,View Angle,hssd_simple,75.5
Gemini Flash 2.5,Max Box,kitchens,95.67
Gemini Flash 2.5,Max Box,living_rooms,100.0
Gemini Flash 2.5,Max Box,bedrooms,100.0
Gemini Flash 2.5,Max Box,hssd_simple,100.0
Gemini Flash 2.5,Shortest Path (Valid),kitchens,96.17
Gemini Flash 2.5,Shortest Path (Valid),living_rooms,97.83
Gemini Flash 2.5,Shortest Path (Valid),bedrooms,98.0
Gemini Flash 2.5,Shortest Path (Valid),hssd_simple,100.0
gpt-oss-20b,Pair Distance,kitchens,0.67
gpt-oss-20b,Pair Distance,living_rooms,0.67
gpt-oss-20b,Pair Distance,bedrooms,0.67
gpt-oss-20b,Pair Distance,hssd_simple,39.5
gpt-oss-20b,Placement,kitchens,4.5
gpt-oss-20b,Placement,living_rooms,12.33
gpt-oss-20b,Placement,bedrooms,25.67
gpt-oss-20b,Placement,hssd_simple,15.0
gpt-oss-20b,Repositioning,kitchens,0.83
gpt-oss-20b,Repositioning,living_rooms,0.5
gpt-oss-20b,Repositioning,bedrooms,0.83
gpt-oss-20b,Repositioning,hssd_simple,22.5
gpt-oss-20b,Free Space,kitchens,0.83
gpt-oss-20b,Free Space,living_rooms,13.67
gpt-oss-20b,Free Space,bedrooms,7.83
gpt-oss-20b,Free Space,hssd_simple,77.5
gpt-oss-20b,Visibility,kitchens,0.33
gpt-oss-20b,Visibility,living_rooms,1.0
gpt-oss-20b,Visibility,bedrooms,0.17
gpt-oss-20b,Visibility,hssd_simple,29.0
gpt-oss-20b,View Angle,kitchens,1.0
gpt-oss-20b,View Angle,living_rooms,1.33
gpt-oss-20b,View Angle,bedrooms,1.17
gpt-oss-20b,View Angle,hssd_simple,38.0
gpt-oss-20b,Max Box,kitchens,28.83
gpt-oss-20b,Max Box,living_rooms,61.0
gpt-oss-20b,Max Box,bedrooms,56.0
gpt-oss-20b,Max Box,hssd_simple,33.0
gpt-oss-20b,Shortest Path (Valid),kitchens,36.17
gpt-oss-20b,Shortest Path (Valid),living_rooms,40.33
gpt-oss-20b,Shortest Path (Valid),bedrooms,50.33
gpt-oss-20b,Shortest Path (Valid),hssd_simple,32.5
Qwen3-30B-A3B-Thinking-2507,Pair Distance,kitchens,2.0
Qwen3-30B-A3B-Thinking-2507,Pair Distance,living_rooms,2.33
Qwen3-30B-A3B-Thinking-2507,Pair Distance,bedrooms,4.67
Qwen3-30B-A3B-Thinking-2507,Pair Distance,hssd_simple,72.0
Qwen3-30B-A3B-Thinking-2507,Placement,kitchens,29.5
Qwen3-30B-A3B-Thinking-2507,Placement,living_rooms,52.5
Qwen3-30B-A3B-Thinking-2507,Placement,bedrooms,64.5
Qwen3-30B-A3B-Thinking-2507,Placement,hssd_simple,70.5
Qwen3-30B-A3B-Thinking-2507,Repositioning,kitchens,7.17
Qwen3-30B-A3B-Thinking-2507,Repositioning,living_rooms,6.67
Qwen3-30B-A3B-Thinking-2507,Repositioning,bedrooms,8.33
Qwen3-30B-A3B-Thinking-2507,Repositioning,hssd_simple,63.0
Qwen3-30B-A3B-Thinking-2507,Free Space,kitchens,2.33
Qwen3-30B-A3B-Thinking-2507,Free Space,living_rooms,2.33
Qwen3-30B-A3B-Thinking-2507,Free Space,bedrooms,3.0
Qwen3-30B-A3B-Thinking-2507,Free Space,hssd_simple,98.5
Qwen3-30B-A3B-Thinking-2507,Visibility,kitchens,19.17
Qwen3-30B-A3B-Thinking-2507,Visibility,living_rooms,26.33
Qwen3-30B-A3B-Thinking-2507,Visibility,bedrooms,33.33
Qwen3-30B-A3B-Thinking-2507,Visibility,hssd_simple,96.0
Qwen3-30B-A3B-Thinking-2507,View Angle,kitchens,1.0
Qwen3-30B-A3B-Thinking-2507,View Angle,living_rooms,0.67
Qwen3-30B-A3B-Thinking-2507,View Angle,bedrooms,0.83
Qwen3-30B-A3B-Thinking-2507,View Angle,hssd_simple,66.0
Qwen3-30B-A3B-Thinking-2507,Max Box,kitchens,62.67
Qwen3-30B-A3B-Thinking-2507,Max Box,living_rooms,98.0
Qwen3-30B-A3B-Thinking-2507,Max Box,bedrooms,98.17
Qwen3-30B-A3B-Thinking-2507,Max Box,hssd_simple,99.0
Qwen3-30B-A3B-Thinking-2507,Shortest Path (Valid),kitchens,85.17
Qwen3-30B-A3B-Thinking-2507,Shortest Path (Valid),living_rooms,81.33
Qwen3-30B-A3B-Thinking-2507,Shortest Path (Valid),bedrooms,83.67
Qwen3-30B-A3B-Thinking-2507,Shortest Path (Valid),hssd_simple,98.0
