# Labels the prompt offers for where in the video an event takes place.
locations = [
    "early",
    "mid",
    "late",
]
# Labels the prompt offers for how long an event lasts.
durations = [
    "long",
    "medium",
    "short",
]

# Entity vocabulary that is interpolated into the prompt context.
# NOTE(review): spellings such as "ballon" and "vaccum" are kept verbatim --
# presumably they mirror label names used elsewhere (e.g. a dataset); confirm
# before correcting them.
entities = [
    "adult",
    "baby",
    "bag",
    "ball",
    "ballon",
    "basket",
    "bat",
    "bed",
    "bench",
    "beverage",
    "bike",
    "bird",
    "blanket",
    "board",
    "book",
    "bottle",
    "bowl",
    "box",
    "bread",
    "brush",
    "bucket",
    "cabinet",
    "cake",
    "camera",
    "can",
    "candle",
    "car",
    "card",
    "carpet",
    "cart",
    "cat",
    "cellphone",
    "chair",
    "child",
    "chopstick",
    "cloth",
    "computer",
    "condiment",
    "cookie",
    "countertop",
    "cover",
    "cup",
    "curtain",
    "dog",
    "door",
    "drawer",
    "dustbin",
    "egg",
    "fan",
    "faucet",
    "fence",
    "flower",
    "fork",
    "fridge",
    "fruit",
    "gift",
    "glass",
    "glasses",
    "glove",
    "grain",
    "guitar",
    "hat",
    "helmet",
    "horse",
    "iron",
    "knife",
    "light",
    "lighter",
    "mat",
    "meat",
    "microphone",
    "microwave",
    "mop",
    "net",
    "noodle",
    "others",
    "oven",
    "pan",
    "paper",
    "piano",
    "pillow",
    "pizza",
    "plant",
    "plate",
    "pot",
    "powder",
    "rack",
    "racket",
    "rag",
    "ring",
    "scissor",
    "shelf",
    "shoe",
    "simmering",
    "sink",
    "slide",
    "sofa",
    "spatula",
    "sponge",
    "spoon",
    "spray",
    "stairs",
    "stand",
    "stove",
    "switch",
    "table",
    "teapot",
    "towel",
    "toy",
    "tray",
    "tv",
    "vaccum",
    "vegetable",
    "washer",
    # The comma after "window" is required: without it Python concatenates the
    # adjacent literals into the single bogus entry "windowceiling".
    "window",
    "ceiling",
    "floor",
    "grass",
    "ground",
    "rock",
    "sand",
    "sky",
    "snow",
    "tree",
    "wall",
    "water",
]

# Relation (predicate) vocabulary that is interpolated into the prompt context.
# Entries are listed alphabetically, grouped here by initial letter.
relations = [
    "beside", "biting", "blowing", "brushing",
    "caressing", "carrying", "catching", "chasing",
    "cleaning", "closing", "cooking", "cutting",
    "drinking from",
    "eating", "entering",
    "feeding",
    "grabbing", "guiding",
    "hanging from", "hitting", "holding", "hugging",
    "in", "in front of",
    "jumping from", "jumping over",
    "kicking", "kissing",
    "licking", "lighting", "looking at", "lying on",
    "next to",
    "on", "opening", "over",
    "picking", "playing", "playing with", "pointing to", "pulling", "pushing",
    "riding", "running on",
    "shaking hand with", "sitting on", "standing on", "stepping on",
    "stirring", "swinging",
    "talking to", "throwing", "touching", "toward",
    "walking on", "watering", "wearing",
]

# Role preamble placed first when the full `prompt` string is assembled.
# The trailing space after the period is part of the original text.
user = "\nYou are a super user in logic programming. \n"

# Vocabulary section of the prompt: lists the allowed entities, relations,
# video locations and event durations, each rendered as a comma-separated line.
# Typos in the prompt text ("entites", "describes") are corrected here.
context = f'''
The entities in the video can be: {', '.join(entities)}.
The relations in the video can be: {', '.join(relations)}.

Here are some words that describe the event length and location that you can use. 
The location in the video can be: {', '.join(locations)}.
The duration of the event can be: {', '.join(durations)}.
'''

# Few-shot example 1: a caption followed by its "Action json" decomposition
# into sequential descriptions and per-step programmatic predicates.
# The JSON part is kept strictly valid (no trailing commas, every list item
# comma-separated) so the example can be parsed with json.loads and does not
# teach malformed output. Entity D is named "child" to match "child D".
example1 = '''
Caption: A man carries a child and walks to the left from behind a woman holding another child.
Action json:
{
    "caption": "A man carries a child and walks to the left from behind a woman holding another child.",
    "sequential descriptions": [
        "man A carry child B, woman C hold child D, man A is behind woman C",
        "man A walk",
        "man A at left"
    ],
    "time stamps": {
        "1": {
            "description": [
                "man A carry child B",
                "woman C hold child D",
                "man A is behind woman C"
            ],
            "programmatic": [
                "carrying(A, B)",
                "name(A, man)",
                "name(B, child)",
                "holding(C, D)",
                "name(C, woman)",
                "name(D, child)",
                "behind(A, C)"
            ],
            "duration": "short",
            "video location": "early"
        },
        "2": {
            "description": [
                "man A walk"
            ],
            "programmatic": [
                "walk(A)"
            ],
            "duration": "medium",
            "video location": "mid"
        },
        "3": {
            "description": [
                "man A at left"
            ],
            "programmatic": [
                "left(A)"
            ],
            "duration": "short",
            "video location": "late"
        }
    }
}
'''

# Few-shot example 2: a single-step caption with its "Action json" form.
# Fixes relative to the earlier text: the step key is spelled "description"
# (matching example1), the second singer is entity C as stated in the
# description (not the child B), and the JSON part has no trailing commas so
# it parses with json.loads.
example2 = '''
Caption: The woman rocks and holds the child, singing a birthday song together with another woman to celebrate the birthday of the girl.
Action json:
{
    "caption": "The woman rocks and holds the child, singing a birthday song together with another woman to celebrate the birthday of the girl.",
    "sequential descriptions": [
        "woman A rocks and holds the child B, woman A and woman C sings birthday song"
    ],
    "time stamps": {
        "1": {
            "description": [
                "woman A rocks and holds the child B, woman A and woman C sings birthday song"
            ],
            "programmatic": [
                "rock(A, B)",
                "hold(A, B)",
                "sing(A)",
                "sing(C)"
            ],
            "duration": "long",
            "video location": "mid"
        }
    }
}
'''

# Few-shot example 3: a first-person caption split into two steps.
# Fixes relative to the earlier text: the step key is spelled "description"
# (matching example1), the ukulele is named as entity C (the entity used in
# play(A, C)) instead of re-naming B, and the JSON part has no trailing commas
# so it parses with json.loads.
example3 = '''
Caption: "I adjusted my cellphone and continued playing the ukulele."
Action json:
{
    "caption": "I adjusted my cellphone and continued playing the ukulele.",
    "sequential descriptions": [
        "person A adjust cellphone B",
        "person A play ukulele C"
    ],
    "time stamps": {
        "1": {
            "description": [
                "person A adjust cellphone B"
            ],
            "programmatic": [
                "adjust(A, B)",
                "name(A, person)",
                "name(B, cellphone)"
            ],
            "duration": "short",
            "video location": "early"
        },
        "2": {
            "description": [
                "person A play ukulele C"
            ],
            "programmatic": [
                "play(A, C)",
                "name(A, person)",
                "name(C, ukulele)"
            ],
            "duration": "long",
            "video location": "late"
        }
    }
}
'''

# Few-shot example 4: a two-step caption whose predicates use the relation
# and entity vocabulary ("sitting on", "cat", "adult", ...).
# Fixes relative to the earlier text: the step key is spelled "description"
# so it agrees with example1 inside the same few-shot prompt, and the JSON
# part has no trailing commas so it parses with json.loads.
# NOTE(review): step 2 encodes "peek at" as catching(B, C); "looking at(B, C)"
# may be what was intended -- confirm with the prompt author.
example4 = '''
Caption: "A woman is teasing a kitten with a piece of meat, and the kitten is peeking its head from a chair to look at the meat."
Action json:
{
    "caption": "A woman is teasing a kitten with a piece of meat, and the kitten is peeking its head from a chair to look at the meat.",
    "sequential descriptions": [
        "woman A teasing kitten B with meat C",
        "kitten B peek at meat C from a chair D"
    ],
    "time stamps": {
        "1": {
            "description": [
                "woman A teasing kitten B with meat C"
            ],
            "programmatic": [
                "sitting on(B, D)",
                "name(B, cat)",
                "name(D, chair)",
                "name(A, adult)",
                "name(C, meat)"
            ],
            "duration": "long",
            "video location": "early"
        },
        "2": {
            "description": [
                "kitten B peek at meat C from a chair D"
            ],
            "programmatic": [
                "catching(B, C)",
                "sitting on(B, D)"
            ],
            "duration": "long",
            "video location": "late"
        }
    }
}
'''


# Few-shot example 5: a two-step caption (receive a gift, then sit down).
# Fixes relative to the earlier text: the step key is spelled "description"
# so it agrees with example1 inside the same few-shot prompt; step 1 now says
# holding(A, B) -- the boy holds the gift -- instead of referencing an entity
# D that is never introduced; and the JSON part has no trailing commas so it
# parses with json.loads.
example5 = '''
Caption: "The young boy receives another gift and sits on the floor."
Action json:
{
    "caption": "The young boy receives another gift and sits on the floor.",
    "sequential descriptions": [
        "boy A receives gift B",
        "boy A sits on the floor C"
    ],
    "time stamps": {
        "1": {
            "description": [
                "boy A receives gift B"
            ],
            "programmatic": [
                "holding(A, B)",
                "name(A, boy)",
                "name(B, gift)"
            ],
            "duration": "medium",
            "video location": "early"
        },
        "2": {
            "description": [
                "boy A sits on the floor C"
            ],
            "programmatic": [
                "sitting on(A, C)",
                "name(A, boy)",
                "name(C, floor)"
            ],
            "duration": "medium",
            "video location": "late"
        }
    }
}
'''

# example5 = '''
# Caption: "The young boy (9) walks through the television (5) to pick up a gift (4)."
# Action json:
# {
    
#     "sequential descriptions": [
#         "woman A teasing kitten B with meat C",
#         "kitten B peek at meat C from a chair D", 
#         ],
# }
    
# '''

# Closing instructions of the prompt: predicate arity rules and the requested
# output format. Built from adjacent literals, one per line of prompt text.
query = (
    "\n"
    "Note all the predicates are unary or binary.\n"
    "A unary predicate takes in one argument. For example, close(A) means A is close to the camera.\n"
    "A binary predicate takes in two arguments. For example, above(A, B) means A is above B.\n"
    "Please use as many predicates as possible to precisely describe the action.\n"
    "Please generate the action json programs for the following captions in the following format:\n"
    '{"actions": {caption_id: action json programs}}\n'
)
    
# Examples selected for the few-shot section (example1, example4, example5),
# joined with newlines; `prompt` is the full text: role preamble, vocabulary
# context, few-shot examples and closing instructions, in that order.
all_examples = [
    example1,
    example4,
    example5,
]
few_shot_prompt = "\n".join(all_examples)
prompt = "\n".join((user, context, few_shot_prompt, query))

def wrap_prompt(caption_ls, few_shot=True):
    """Build the prompt text for a batch of captions.

    The result is the module-level ``context``, optionally followed by
    ``few_shot_prompt``, then ``query``, then one line per caption of the
    form ``"<index>. <caption>"`` with indices starting at 0. All parts are
    joined with newlines. The ``user`` preamble is not included.

    Args:
        caption_ls: Iterable of captions to append after the instructions.
        few_shot: When truthy, include the few-shot examples section.

    Returns:
        The assembled prompt as a single string.
    """
    sections = [context, few_shot_prompt, query] if few_shot else [context, query]
    sections.extend(f"{idx}. {text}" for idx, text in enumerate(caption_ls))
    return "\n".join(sections)

if __name__ == "__main__":
    # When run as a script, print the fully assembled prompt for inspection.
    print(prompt)