# Coarse natural-language vocabularies the model may use for where an event
# sits in the video and for how long it lasts.
locations = ["early", "mid", "late"]
durations = ["long", "medium", "short"]

# Role-setting preamble; it is placed first when the standalone prompt is built.
user = "\nYou are a super user in logic programming.\n"

# Task description. Built line by line so that the leading/trailing newlines
# and the trailing space on the first sentence are explicit rather than hidden
# inside a triple-quoted literal.
context = "\n".join([
    "",
    "You will describe the event length and location in both natural language and fraction of the video. ",
    "The natural language description of the locations in the video can be: {}.".format(", ".join(locations)),
    "The natural language description of the durations of the event can be: {}".format(", ".join(durations)),
    "Examples of precise video locations: [1/4, 1/2], [2/3, 1].",
    "Examples of event durations: 1/4, 2/3, 1.",
    "",
])

# Few-shot example 1: a caption split into three sequential events.
# Everything after "Action json:" is kept strictly valid JSON (no missing or
# trailing commas, balanced quotes) so the model is shown well-formed output
# to imitate. Entity D is the second child mentioned in the caption.
example1 = '''
Caption: A man carries a child and walks to the left from behind a woman holding another child.
Action json:
{
    "caption": "A man carries a child and walks to the left from behind a woman holding another child.",
    "sequential descriptions": [
        "man A carry child B, woman C hold child D, man A is behind woman C",
        "man A walk",
        "man A at left"
    ],
    "time stamps": {
        "1": {
            "description": [
                "man A carry child B",
                "woman C hold child D",
                "man A is behind woman C"
            ],
            "programmatic": [
                "carrying(A, B)",
                "name(A, man)",
                "name(B, child)",
                "holding(C, D)",
                "name(C, woman)",
                "name(D, child)",
                "behind(A, C)"
            ],
            "duration": "short",
            "duration precise": "1/4",
            "video location": "early",
            "video location precise": "[0, 1/4]"
        },
        "2": {
            "description": [
                "man A walk"
            ],
            "programmatic": [
                "walk(A)"
            ],
            "duration": "medium",
            "duration precise": "1/2",
            "video location": "mid",
            "video location precise": "[1/4, 3/4]"
        },
        "3": {
            "description": [
                "man A at left"
            ],
            "programmatic": [
                "left(A)"
            ],
            "duration": "short",
            "duration precise": "1/4",
            "video location": "late",
            "video location precise": "[3/4, 1]"
        }
    }
}
'''

# Few-shot example 2: a caption describing a single event spanning the whole
# video. The JSON is kept strictly valid and uses the same "description" key
# as example1. The singers are woman A and woman C (B is the child), and each
# entity is introduced with a name(...) predicate like in the other examples.
example2 = '''
Caption: The woman rocks and holds the child, singing a birthday song together with another woman to celebrate the birthday of the girl.
Action json:
{
    "caption": "The woman rocks and holds the child, singing a birthday song together with another woman to celebrate the birthday of the girl.",
    "sequential descriptions": [
        "woman A rocks and holds the child B, woman A and woman C sing birthday song"
    ],
    "time stamps": {
        "1": {
            "description": [
                "woman A rocks and holds the child B, woman A and woman C sing birthday song"
            ],
            "programmatic": [
                "rock(A, B)",
                "hold(A, B)",
                "sing(A)",
                "sing(C)",
                "name(A, woman)",
                "name(B, child)",
                "name(C, woman)"
            ],
            "duration": "long",
            "duration precise": "1",
            "video location": "mid",
            "video location precise": "[0, 1]"
        }
    }
}
'''

# Few-shot example 3: two consecutive events (adjust the phone, then play).
# The JSON is kept strictly valid and uses the same "description" key as
# example1. The ukulele is entity C; B stays bound to the cellphone.
example3 = '''
Caption: "I adjusted my cellphone and continued playing the ukulele."
Action json:
{
    "caption": "I adjusted my cellphone and continued playing the ukulele.",
    "sequential descriptions": [
        "person A adjust cellphone B",
        "person A play ukulele C"
    ],
    "time stamps": {
        "1": {
            "description": [
                "person A adjust cellphone B"
            ],
            "programmatic": [
                "adjust(A, B)",
                "name(A, person)",
                "name(B, cellphone)"
            ],
            "duration": "short",
            "duration precise": "1/4",
            "video location": "early",
            "video location precise": "[0, 1/4]"
        },
        "2": {
            "description": [
                "person A play ukulele C"
            ],
            "programmatic": [
                "play(A, C)",
                "name(A, person)",
                "name(C, ukulele)"
            ],
            "duration": "long",
            "duration precise": "3/4",
            "video location": "late",
            "video location precise": "[1/4, 1]"
        }
    }
}
'''

# Few-shot example 4: two overlapping events (teasing, peeking from a chair).
# The JSON is kept strictly valid and uses the same "description" key as
# example1. Durations use the vocabulary announced in the task description
# ("medium", not the location word "mid").
example4 = '''
Caption: "A woman is teasing a kitten with a piece of meat, and the kitten is peeking its head from a chair to look at the meat."
Action json:
{
    "caption": "A woman is teasing a kitten with a piece of meat, and the kitten is peeking its head from a chair to look at the meat.",
    "sequential descriptions": [
        "woman A teasing kitten B with meat C",
        "kitten B peek at meat C from a chair D"
    ],
    "time stamps": {
        "1": {
            "description": [
                "woman A teasing kitten B with meat C"
            ],
            "programmatic": [
                "sitting on(B, D)",
                "name(B, cat)",
                "name(D, chair)",
                "name(A, adult)",
                "name(C, meat)"
            ],
            "duration": "medium",
            "duration precise": "1/2",
            "video location": "early",
            "video location precise": "[0, 1/2]"
        },
        "2": {
            "description": [
                "kitten B peek at meat C from a chair D"
            ],
            "programmatic": [
                "catching(B, C)",
                "sitting on(B, D)"
            ],
            "duration": "long",
            "duration precise": "1",
            "video location": "mid",
            "video location precise": "[0, 1]"
        }
    }
}
'''


# Few-shot example 5: two consecutive events (receive a gift, sit down).
# The JSON is kept strictly valid and uses the same "description" key as
# example1. Durations use the announced vocabulary ("medium"), and the first
# event only references entities that are actually introduced (boy A, gift B).
example5 = '''
Caption: "The young boy receives another gift and sits on the floor."
Action json:
{
    "caption": "The young boy receives another gift and sits on the floor.",
    "sequential descriptions": [
        "boy A receives gift B",
        "boy A sits on the floor C"
    ],
    "time stamps": {
        "1": {
            "description": [
                "boy A receives gift B"
            ],
            "programmatic": [
                "holding(A, B)",
                "name(A, boy)",
                "name(B, gift)"
            ],
            "duration": "medium",
            "duration precise": "1/2",
            "video location": "early",
            "video location precise": "[0, 1/2]"
        },
        "2": {
            "description": [
                "boy A sits on the floor C"
            ],
            "programmatic": [
                "sitting on(A, C)",
                "name(A, boy)",
                "name(C, floor)"
            ],
            "duration": "medium",
            "duration precise": "1/2",
            "video location": "late",
            "video location precise": "[1/2, 1]"
        }
    }
}
'''

# '''

# Closing instructions appended after the examples: they constrain predicate
# arity and state the output envelope expected for a batch of captions.
query = (
    "\n"
    "Note all the predicates are unary or binary.\n"
    "A unary predicate takes in one argument. For example, close(A) means A is close to the camera.\n"
    "A binary predicate takes in two arguments. For example, above(A, B) means A is above B.\n"
    "Please use as many predicates as possible to precisely describe the action.\n"
    "Please generate the action json programs for the following captions in the following format:\n"
    '{"actions": {caption_id: action json programs}}\n'
)

# Only examples 1, 4 and 5 are used as few-shot demonstrations.
all_examples = [
    example1,
    example4,
    example5,
]
# Demonstrations joined into one newline-separated block.
few_shot_prompt = "\n".join(all_examples)
# Standalone prompt: role preamble, task description, examples, instructions.
prompt = "\n".join((user, context, few_shot_prompt, query))

def wrap_prompt(caption_ls, few_shot=True):
    """Assemble the prompt text for a batch of captions.

    The result is the module-level ``context``, then (optionally) the
    ``few_shot_prompt`` examples, then the ``query`` instructions, followed
    by one line per caption formatted as ``"index. caption"`` with indices
    starting at 0. All pieces are joined with newlines. The ``user``
    preamble is not part of the returned text.

    Args:
        caption_ls: Iterable of captions to number and append.
        few_shot: When truthy, include the few-shot examples.

    Returns:
        The assembled prompt as a single string.
    """
    header = [context, few_shot_prompt, query] if few_shot else [context, query]
    numbered = [f"{idx}. {text}" for idx, text in enumerate(caption_ls)]
    return '\n'.join(header + numbered)

if __name__ == "__main__":

    # When executed as a script, print the assembled standalone prompt
    # (user preamble, context, few-shot examples, query) for inspection.
    print(prompt)