experiment run details:
  dataset: hoi
  path: /gpfs/mariana/home/envomp/bongard/big_hake/
  prompt method: direct
  prompt structure: labeled

---------------------------------------
  test split name: test_seen_obj_seen_act
---------------------------------------

0 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
1 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
2 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
3 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
4 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
5 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
6 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
7 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
8 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
9 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
10 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
11 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
12 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
13 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
14 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
15 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
16 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
17 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
18 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
19 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
20 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
21 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
22 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
23 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
24 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
25 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
26 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
27 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
28 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
29 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
30 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
31 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
32 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
33 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
34 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
35 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
36 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
37 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
38 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
39 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
40 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
41 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
42 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
43 | expected:'cat_1' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether or not there is a high-backed chair visible in the image. All the images in `cat_2` have at least one high-backed chair visible. All the images in `cat_1` do not have a high-backed chair. The test image has a high-backed chair, therefore it belongs to `cat_2`.']'
44 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
45 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
46 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
47 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
48 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
49 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
50 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
51 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
52 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
53 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
54 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
55 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
56 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
57 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
58 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
59 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
60 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
61 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
62 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
63 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
64 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
65 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
66 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
67 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
68 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
69 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
70 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
71 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
72 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
73 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
74 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
75 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
76 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
77 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
78 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
79 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
80 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
81 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
82 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
83 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
84 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
85 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
86 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
87 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
88 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
89 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
90 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether or not the image contains people holding drinks (glasses, mugs, cups). The cat_2 images all feature people holding drinks. The cat_1 images feature people preparing drinks or with drinks in the background, but not actively *holding* them. The test image shows people holding drinks, therefore it belongs to cat_2.']'
91 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
92 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
93 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
94 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
95 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
96 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
97 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
98 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
99 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
100 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
101 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
102 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
103 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
104 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
105 | expected:'cat_1' | got='cat_1 | full: [' cat_1\n\n**Reasoning:**\n\nThe rule appears to be whether the person is kissing the dog. In all the `cat_2` images, the person is actively kissing the dog on the mouth or nose. In all the `cat_1` images, the person is not kissing the dog. The test image shows a person walking a dog on a leash, and there is no kissing occurring. Therefore, the test image belongs to `cat_1`.']'
106 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
107 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
108 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
109 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
110 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
111 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
112 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
113 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
114 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
115 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
116 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
117 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
118 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
119 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
120 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
121 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
122 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
123 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
124 | expected:'cat_2' | got='cat_1 | full: [' cat_1\n\n**Reasoning:**\n\nThe rule appears to be whether or not the motorcycle is leaning/tilted significantly. \n\n* **cat_2 images:** Show motorcycles leaning heavily into a turn, often in a race setting with multiple bikes.\n* **cat_1 images:** Show motorcycles in the air, or with a rider falling off, indicating a loss of control and not a controlled lean.\n\nThe test image shows a motorcycle upright and stable, with the rider waving. It does not exhibit the significant lean seen in the cat_2 images. Therefore, it belongs to cat_1.']'
125 | expected:'cat_1' | got='cat_1 | full: [' cat_1\n\n**Reasoning:**\n\nThe rule appears to be whether the motorcycle is in a racing/competition setting or not. \n\n* **cat_2 images** all depict motorcycles in a racing or competition environment (race track, multiple bikes racing together, etc.).\n* **cat_1 images** show motorcycles involved in crashes or stunts, or a single bike not in a race.\n\nThe test image shows a motorcycle being worked on by someone on the side of the road. This is not a racing or competition setting. Therefore, it belongs to cat_1.']'
126 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
127 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
128 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
129 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
130 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
131 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
132 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
133 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
134 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
135 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
136 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
137 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
138 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
139 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
140 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
141 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
142 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
143 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
144 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
145 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
146 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
147 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
148 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
149 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
150 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
151 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
152 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
153 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
154 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
155 | expected:'cat_1' | got='cat_1 | full: [" cat_1\n\n**Reasoning:**\n\nThe rule appears to be whether or not a person's face is visible in the image. \n\n*   **cat_2 images:** All show hands typing on a keyboard *without* a visible face.\n*   **cat_1 images:** All show a person's face *with* a keyboard.\n\nThe test image shows a hand using a cleaning gel on a keyboard, but no face is visible. Therefore, it follows the rule of cat_2. However, the images provided for cat_1 all have a face visible. The test image does not have a face visible, so it does not follow the rule of cat_1.\n\nTherefore, the test image is categorized as cat_1."]'
156 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
157 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
158 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
159 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
160 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
161 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
162 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
163 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
164 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
165 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
166 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
167 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
168 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
169 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
170 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
171 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
172 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
173 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
174 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
175 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
176 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
177 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
178 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
179 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
180 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
181 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
182 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
183 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
184 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
185 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
186 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
187 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
188 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
189 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
190 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
191 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
192 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
193 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
194 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
195 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
196 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
197 | expected:'cat_1' | got='cat_1 | full: [' cat_1\n\n**Reasoning:**\n\nThe rule appears to be whether the person is actively disassembling/repairing a laptop. \n\n*   **cat_2 images** all show a person actively taking apart a laptop, with the internal components visible and tools being used.\n*   **cat_1 images** show people using laptops in various settings (looking at the screen, typing, etc.) but not actively disassembling them.\n\nThe test image shows a large group of people using laptops in a lecture hall or conference setting. No one is shown disassembling a laptop. Therefore, it follows the pattern of cat_1.']'
198 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
199 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
---------------------------------------
Summary for Split 'test_seen_obj_seen_act':
 results: {'correct': {'cat_1': 76, 'cat_2': 75}, 'incorrect': {'cat_1': 24, 'cat_2': 25}}
 accuracy: 75.50%

---------------------------------------
  test split name: test_seen_obj_unseen_act
---------------------------------------

0 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
1 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
2 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
3 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
4 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
5 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
6 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
7 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
8 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether or not people are toasting with glasses. The first six images (cat_2) all show people clinking glasses together in a celebratory toast. The next six images (cat_1) do *not* show people toasting with glasses - they are either drinking alone, or the glasses are not being clinked together. The test image shows two hands clinking glasses together, therefore it follows the rule and belongs to cat_2.']'
9 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
10 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
11 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
12 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
13 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
14 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
15 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
16 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
17 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
18 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
19 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
20 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
21 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
22 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
23 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
24 | expected:'cat_2' | got='cat_2 | full: [" cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether or not the person is directly interacting with the laptop (typing, using a mouse, etc.). \n\n*   **cat_2 images:** All show people actively using a laptop - typing, using a mouse, or directly looking at the screen while interacting with it.\n*   **cat_1 images:** All show people holding a laptop, or with a laptop in their lap, but not actively using it. They are either holding it up, or it's just present in the scene without direct interaction.\n\nThe test image shows a person actively typing on a laptop. Therefore, it follows the rule of cat_2."]'
25 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
26 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
27 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
28 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
29 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
30 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
31 | expected:'cat_1' | got='cat_1 | full: [' cat_1\n\n**Reasoning:**\n\nThe rule appears to be whether the person in the image is actively performing a skateboarding trick or maneuver. \n\n*   **cat_2 images** all depict individuals *in the middle of* a trick or actively using the skateboard for a maneuver (jumping, grinding, etc.).\n*   **cat_1 images** show people either not on the skateboard, or not actively performing a trick (e.g., walking, sitting, reading).\n\nThe test image shows a group of people sitting and talking, with skateboards nearby, but no one is actively skateboarding or performing a trick. Therefore, it aligns with the characteristics of cat_1.']'
32 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
33 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
34 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the glasses contain a dark liquid (red wine, etc.). The cat_2 images all show people toasting with glasses containing dark-colored liquids. The cat_1 images show people drinking from glasses with lighter-colored liquids or in less formal settings. The test image shows people toasting with glasses containing a dark liquid, therefore it belongs to cat_2.']'
35 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
36 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
37 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
38 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
39 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
40 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
41 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
42 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
43 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
44 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
45 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
46 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
47 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
48 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
49 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
50 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
51 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
52 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
53 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
54 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
55 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
56 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
57 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
58 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
59 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
60 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the laptop is being used by a person or people in a collaborative or shared viewing/working manner. \n\n*   **cat_2 images:** Show two or more people looking at or interacting with the laptop screen together.\n*   **cat_1 images:** Show a single person using the laptop, often in a more professional or focused setting (e.g., a conference, coding).\n\nThe test image shows two children looking at the laptop screen together. Therefore, it follows the rule of cat_2.']'
61 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
62 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
63 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
64 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
65 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
66 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
67 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
68 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
69 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
70 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
71 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
72 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
73 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
74 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
75 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
76 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
77 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
78 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
79 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
80 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
81 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
82 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
83 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
84 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
85 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
86 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
87 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
88 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
89 | expected:'cat_1' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the people in the image are looking at each other while toasting. \n\n*   **cat_2 images:** All show people making eye contact while toasting.\n*   **cat_1 images:** All show people *not* making eye contact while toasting (looking at the camera, looking away, or a child is present).\n\nThe test image shows the two people looking at each other while toasting. Therefore, it follows the rule of cat_2.']'
90 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
91 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
92 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
93 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
94 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
95 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
96 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
97 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
98 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
99 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
100 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
101 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
102 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
103 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
104 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
105 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
106 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
107 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
108 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
109 | expected:'cat_1' | got='cat_1 | full: [' cat_1\n\n**Reasoning:**\n\nThe rule appears to be whether the person is holding the knife in a threatening/fighting pose (cat_2) or using the knife for food preparation (cat_1). \n\nThe images in cat_2 all depict people in dynamic poses, seemingly ready to attack or defend with the knife. The images in cat_1 show people using knives to cut food items.\n\nThe test image shows a person cutting an onion on a cutting board, clearly indicating food preparation. Therefore, it belongs to cat_1.']'
110 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
111 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
112 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
113 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
114 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
115 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
116 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
117 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
118 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
119 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
120 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
121 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
122 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
123 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
124 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the boat is being propelled by oars (rowing). All images in `cat_2` show a person using oars to propel the boat. All images in `cat_1` show boats being propelled by motors or other means (or are simply moving with the current). The test image shows a boat being propelled by oars, therefore it belongs to `cat_2`.']'
125 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
126 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
127 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
128 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
129 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
130 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
131 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
132 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
133 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
134 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
135 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
136 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
137 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
138 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
139 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
140 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
141 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
142 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
143 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
144 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
145 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
146 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the person holding the knife is looking directly at the camera. In the cat_2 images, the person holding the knife is looking directly at the camera. In the cat_1 images, the person is not looking directly at the camera. The test image shows the person holding the knife looking directly at the camera, therefore it belongs to cat_2.']'
147 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
148 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
149 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
150 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
151 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
152 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
153 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
154 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
155 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
156 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
157 | expected:'cat_1' | got='cat_1 | full: [' cat_1\n\n**Reasoning:**\n\nThe rule appears to be whether the person in the image has their tongue sticking out while holding a knife. All images in `cat_2` show a person with their tongue sticking out while holding a knife. All images in `cat_1` do not show this. The test image shows a person cutting food with a knife, but their tongue is not sticking out. Therefore, it belongs to `cat_1`.']'
158 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
159 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
160 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
161 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
162 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
163 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
164 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
165 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
166 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
167 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
168 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
169 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
170 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
171 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
172 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
173 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
174 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
175 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
176 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
177 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
178 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
179 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
180 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
181 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
182 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
183 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
184 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
185 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
186 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
187 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
188 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
189 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
190 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
191 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
192 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
193 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
194 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
195 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
196 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
197 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
198 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
199 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
---------------------------------------
Summary for Split 'test_seen_obj_unseen_act':
 results: {'correct': {'cat_1': 83, 'cat_2': 77}, 'incorrect': {'cat_1': 17, 'cat_2': 23}}
 accuracy: 80.00%

---------------------------------------
  test split name: test_unseen_obj_seen_act
---------------------------------------

0 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
1 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
2 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
3 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
4 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
5 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
6 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
7 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
8 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
9 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
10 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
11 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
12 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
13 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
14 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
15 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
16 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
17 | expected:'cat_1' | got='cat_2 | full: [" cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the surfer is *standing on the board* or *holding the board*. The cat_2 images all show a person holding a surfboard, typically walking on the beach. The cat_1 images all show a person actively surfing, standing on the board in the water. The test image shows a person standing on the board, therefore it belongs to cat_1. However, the question states that if the test image follows the rule, it's cat_2. The rule is that the person is holding the board. The test image shows a person standing on the board, which does not follow the rule. Therefore, it is cat_1.\n\nI apologize for the initial incorrect response. I misread the rule. The rule is that the person is holding the board. The test image shows a person standing on the board, which does not follow the rule. Therefore, it is cat_1.\n\nI apologize again for the repeated errors. I am still under development and learning to interpret these types of tasks correctly.\n\nThe rule is that the images in cat_2 show a person *holding* a surfboard. The test image shows a person *standing* on a surfboard. Therefore, the test image does *not* follow the rule and should be categorized as **cat_1**.\n"]'
18 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
19 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
20 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
21 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
22 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
23 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
24 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
25 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
26 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
27 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
28 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
29 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
30 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
31 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
32 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
33 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
34 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
35 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
36 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
37 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
38 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
39 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
40 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
41 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
42 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
43 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
44 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
45 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
46 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
47 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
48 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
49 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
50 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
51 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
52 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
53 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
54 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
55 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
56 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
57 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
58 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
59 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
60 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
61 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
62 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
63 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
64 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
65 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
66 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
67 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
68 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
69 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
70 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
71 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
72 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
73 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
74 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
75 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
76 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
77 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
78 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
79 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
80 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
81 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
82 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
83 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
84 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
85 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
86 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
87 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
88 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
89 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
90 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
91 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
92 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
93 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
94 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
95 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
96 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
97 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
98 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
99 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
100 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
101 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
102 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
103 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
104 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
105 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
106 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
107 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
108 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
109 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
110 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
111 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
112 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
113 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
114 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
115 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
116 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
117 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
118 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
119 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
120 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
121 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
122 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
123 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
124 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
125 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
126 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
127 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
128 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
129 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
130 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
131 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
132 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
133 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
134 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
135 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
136 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
137 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
138 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
139 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
140 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
141 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
142 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
143 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
144 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
145 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
146 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
147 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
148 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
149 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
150 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
151 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
152 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
153 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
154 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
155 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
156 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
157 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
158 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
159 | expected:'cat_1' | got='cat_1 | full: [' cat_1\n\n**Reasoning:**\n\nThe rule appears to be whether or not there are other people clearly visible in the background of the image. \n\n*   **cat_2 images:** All have multiple people visible in the background, often blurred but present.\n*   **cat_1 images:** All have a more isolated background, with fewer or no clearly visible people.\n\nThe test image has a very sparse background with trees and a path, and no clearly visible people. Therefore, it follows the pattern of cat_1 images.']'
160 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
161 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
162 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
163 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
164 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
165 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
166 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
167 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
168 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
169 | expected:'cat_1' | got='cat_1 | full: [' cat_1\n\n**Reasoning:**\n\nThe rule appears to be whether the image contains a person holding a bag/purse. All images in `cat_2` show a person holding a bag or purse. All images in `cat_1` do not show a person holding a bag or purse.\n\nThe test image shows a person with a bag on the floor next to them, but they are not holding it. Therefore, it belongs to `cat_1`.']'
170 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
171 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
172 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
173 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
174 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
175 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
176 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
177 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
178 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
179 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
180 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
181 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
182 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether or not people are playing chess. All the images in `cat_2` show people playing chess. The images in `cat_1` do not show people playing chess. The test image shows people sitting in a room, but there is no evidence of chess being played. Therefore, it belongs to `cat_1`.']'
183 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
184 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether or not scissors are being used to cut something. \n\n*   **cat_2 images:** All show scissors being used to cut something (paper, ribbon, hair).\n*   **cat_1 images:** All show scissors being used to cut hair.\n\nThe test image shows scissors being used to cut a ribbon. Therefore, it follows the rule of cat_2.']'
185 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
186 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
187 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
188 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
189 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
190 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
191 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
192 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
193 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
194 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
195 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
196 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
197 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
198 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
199 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
---------------------------------------
Summary for Split 'test_unseen_obj_seen_act':
 results: {'correct': {'cat_1': 73, 'cat_2': 72}, 'incorrect': {'cat_1': 27, 'cat_2': 28}}
 accuracy: 72.50%

---------------------------------------
  test split name: test_unseen_obj_unseen_act
---------------------------------------

0 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
1 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
2 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
3 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
4 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
5 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
6 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
7 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
8 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
9 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
10 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether or not the person in the image is wearing a tie. All images in `cat_2` show people wearing a tie. All images in `cat_1` show people *not* wearing a tie, or are in the process of adjusting/removing one. The test image shows a person wearing a tie. Therefore, it belongs to `cat_2`.']'
11 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
12 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
13 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
14 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
15 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
16 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
17 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
18 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
19 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
20 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
21 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
22 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
23 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
24 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
25 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
26 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
27 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
28 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
29 | expected:'cat_1' | got='cat_2 | full: [" cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the person in the image is actively adjusting or wearing a tie. All images in `cat_2` show a person in the process of tying or adjusting their tie. The `cat_1` images show people wearing hats or other accessories, or in a situation where they are not actively adjusting their tie.\n\nThe test image shows a man adjusting a woman's tie. Therefore, it follows the rule established by `cat_2`."]'
30 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
31 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
32 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
33 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
34 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
35 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
36 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
37 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
38 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
39 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
40 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
41 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
42 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
43 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
44 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
45 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
46 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
47 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
48 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
49 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
50 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
51 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
52 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
53 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
54 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
55 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
56 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the person in the image is adjusting their neckwear (tie or bow tie). \n\n*   **cat_2 images:** All show people actively adjusting or in the process of adjusting their tie or bow tie.\n*   **cat_1 images:** None of the images show a person adjusting their tie or bow tie.\n\nThe test image shows a person adjusting his tie. Therefore, it belongs to `cat_2`.']'
57 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
58 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
59 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
60 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
61 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
62 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
63 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
64 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
65 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
66 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the person in the image is adjusting their tie. All images in `cat_2` show a person actively adjusting or tightening their tie. All images in `cat_1` do not show this action. The test image shows a person adjusting their tie, therefore it belongs to `cat_2`.']'
67 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
68 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
69 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
70 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
71 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
72 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
73 | expected:'cat_1' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether someone is tying a tie. All images in `cat_2` show a person actively tying a tie around their neck. The `cat_1` images do not show someone tying a tie. The test image shows a person with a tie being adjusted around their neck, indicating the action of tying a tie. Therefore, it belongs to `cat_2`.']'
74 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
75 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
76 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
77 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
78 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
79 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
80 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
81 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
82 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
83 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
84 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
85 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
86 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
87 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
88 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
89 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
90 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
91 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
92 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
93 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
94 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
95 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
96 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
97 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
98 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
99 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
100 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
101 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
102 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the person in the image is directly looking at the camera. \n\n*   **cat_2 images:** All individuals are looking directly at the camera.\n*   **cat_1 images:** All individuals are *not* looking directly at the camera.\n\nThe test image shows a person looking directly at the camera. Therefore, it follows the rule of cat_2 and should be categorized as such.']'
103 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
104 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
105 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
106 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
107 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
108 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
109 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
110 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
111 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
112 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the person having their tie adjusted is wearing a suit jacket. In all the `cat_2` images, the person having their tie adjusted is wearing a suit jacket. In all the `cat_1` images, the person having their tie adjusted is *not* wearing a suit jacket.\n\nThe test image shows a person wearing a suit jacket while having their tie adjusted. Therefore, it belongs to `cat_2`.']'
113 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
114 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
115 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
116 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
117 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
118 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
119 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
120 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
121 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
122 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
123 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
124 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
125 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
126 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
127 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
128 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
129 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
130 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
131 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
132 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
133 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
134 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
135 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
136 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
137 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
138 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
139 | expected:'cat_1' | got='cat_1 | full: [' cat_1\n\n**Reasoning:**\n\nThe rule appears to be whether the person in the image is *picking* an apple from a tree. \n\n*   **cat_2 images** all show people (or a person lifting a child) reaching for and picking apples from trees.\n*   **cat_1 images** all show people eating or processing apples that are already picked (peeling, biting into, holding a cut apple).\n\nThe test image shows a person peeling an apple, which is a processing action, not picking. Therefore, it belongs to cat_1.']'
140 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether or not the person in the image is looking directly at the camera. \n\n*   **cat_2 images:** All individuals are looking directly at the camera.\n*   **cat_1 images:** All individuals are *not* looking directly at the camera.\n\nThe test image shows a man looking directly at the camera. Therefore, it follows the rule of cat_2 and should be categorized as such.']'
141 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
142 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
143 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
144 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
145 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
146 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
147 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
148 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
149 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
150 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
151 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
152 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
153 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
154 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
155 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
156 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
157 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
158 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
159 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
160 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
161 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
162 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
163 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
164 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
165 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
166 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
167 | expected:'cat_1' | got='cat_2 | full: [' cat_2']'
168 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
169 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
170 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
171 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
172 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
173 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
174 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
175 | expected:'cat_1' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe key difference between the two categories appears to be whether the sheep is being actively *shorn* (cat_2) or is being *held/handled* in a more general way (cat_1). \n\nThe cat_2 images all depict someone actively shearing a sheep with clippers. The cat_1 images show people holding, carrying, or standing near sheep without the shearing process occurring.\n\nThe test image shows people touching and interacting with a sheep, but there is no evidence of shearing taking place. Therefore, it aligns with the characteristics of cat_2.']'
176 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
177 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
178 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
179 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
180 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
181 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
182 | expected:'cat_2' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the person holding the remote is looking at the remote. In all the `cat_2` images, the person is looking at the remote. In all the `cat_1` images, the person is *not* looking at the remote.\n\nIn the test image, the woman is looking at the remote. Therefore, it belongs to `cat_2`.']'
183 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
184 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
185 | expected:'cat_1' | got='cat_2 | full: [' cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the person in the image is looking at a camera. \n\n*   **cat\\_2 images:** All show a person looking directly at the camera.\n*   **cat\\_1 images:** None of the images show a person looking directly at the camera.\n\nThe test image shows a person looking directly at the camera. Therefore, it belongs to `cat_2`.']'
186 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
187 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
188 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
189 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
190 | expected:'cat_2' | got='cat_2 | full: [" cat_2\n\n**Reasoning:**\n\nThe rule appears to be whether the person being brushed is a child. In the cat_2 images, a person is brushing another person's teeth, and the person being brushed is a child. In the cat_1 images, the toothbrush is not being used to brush someone else's teeth. The test image shows a person brushing a child's teeth, therefore it follows the rule and is categorized as cat_2."]'
191 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
192 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
193 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
194 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
195 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
196 | expected:'cat_2' | got='cat_1 | full: [' cat_1']'
197 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
198 | expected:'cat_2' | got='cat_2 | full: [' cat_2']'
199 | expected:'cat_1' | got='cat_1 | full: [' cat_1']'
---------------------------------------
Summary for Split 'test_unseen_obj_unseen_act':
 results: {'correct': {'cat_1': 76, 'cat_2': 78}, 'incorrect': {'cat_1': 24, 'cat_2': 22}}
 accuracy: 77.00%

---------------------------------------
