{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0a8e438-6ea1-477b-ae35-aca95b571006",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import random\n",
    "import string\n",
    "from copy import deepcopy as dc\n",
    "\n",
    "class Node:\n",
    "    def __init__(self, value=None, left=None, right=None, next=None):\n",
    "        self.value = value\n",
    "        self.left = left\n",
    "        self.right = right\n",
    "        self.next = next\n",
    "\n",
    "class Stack:\n",
    "    def __init__(self):\n",
    "        self.head = None\n",
    "\n",
    "    def push(self, node):\n",
    "        if not self.head:\n",
    "            self.head = node\n",
    "        else:\n",
    "            node.next = self.head\n",
    "            self.head = node\n",
    "\n",
    "    def pop(self):\n",
    "        if self.head:\n",
    "            popped = self.head\n",
    "            self.head = self.head.next\n",
    "            return popped\n",
    "        else:\n",
    "            raise Exception(\"Stack is empty\")\n",
    "\n",
    "class ExpressionTree:\n",
    "\n",
    "    def get_subexpressions(self, root):\n",
    "        # print(\"function enter\")\n",
    "        if root is None:\n",
    "            return []\n",
    "\n",
    "        if root.left is None and root.right is None:\n",
    "            return [root.value]\n",
    "\n",
    "        subexpressions = []\n",
    "        left_subs = self.get_subexpressions(root.left)\n",
    "\n",
    "        right_subs = self.get_subexpressions(root.right)\n",
    "        for left_sub in left_subs:\n",
    "            for right_sub in right_subs:\n",
    "                subexpressions.extend([left_sub, right_sub, '('+left_sub+root.value+right_sub+')'])\n",
    "        return subexpressions\n",
    "\n",
    "class Conversion:\n",
    "    # Constructor to initialize the class variables\n",
    "    def __init__(self, capacity):\n",
    "        self.top = -1\n",
    "        self.capacity = capacity\n",
    "\n",
    "        # This array is used a stack\n",
    "        self.array = []\n",
    "\n",
    "        # Precedence setting\n",
    "        self.output = []\n",
    "        self.precedence = {'+': 1, '-': 1, '*': 2, '/': 2, '^': 3}\n",
    "\n",
    "    # Check if the stack is empty\n",
    "    def isEmpty(self):\n",
    "        return True if self.top == -1 else False\n",
    "\n",
    "    # Return the value of the top of the stack\n",
    "    def peek(self):\n",
    "        return self.array[-1]\n",
    "\n",
    "    # Pop the element from the stack\n",
    "    def pop(self):\n",
    "        if not self.isEmpty():\n",
    "            self.top -= 1\n",
    "            return self.array.pop()\n",
    "        else:\n",
    "            return \"$\"\n",
    "\n",
    "    # Push the element to the stack\n",
    "    def push(self, op):\n",
    "        self.top += 1\n",
    "        self.array.append(op)\n",
    "\n",
    "    # A utility function to check is the given character\n",
    "    # is operand\n",
    "    def isOperand(self, ch):\n",
    "        return ch.isalpha()\n",
    "\n",
    "    # Check if the precedence of operator is strictly\n",
    "    # less than top of stack or not\n",
    "    def notGreater(self, i):\n",
    "        try:\n",
    "            a = self.precedence[i]\n",
    "            b = self.precedence[self.peek()]\n",
    "            return True if a <= b else False\n",
    "        except KeyError:\n",
    "            return False\n",
    "\n",
    "    # The main function that\n",
    "    # converts given infix expression\n",
    "    # to postfix expression\n",
    "    def infixToPostfix(self, exp):\n",
    "\n",
    "        # Iterate over the expression for conversion\n",
    "        for i in exp:\n",
    "\n",
    "            # If the character is an operand,\n",
    "            # add it to output\n",
    "            if self.isOperand(i):\n",
    "                self.output.append(i)\n",
    "\n",
    "            # If the character is an '(', push it to stack\n",
    "            elif i == '(':\n",
    "                self.push(i)\n",
    "\n",
    "            # If the scanned character is an ')', pop and\n",
    "            # output from the stack until and '(' is found\n",
    "            elif i == ')':\n",
    "                while((not self.isEmpty()) and\n",
    "                      self.peek() != '('):\n",
    "                    a = self.pop()\n",
    "                    self.output.append(a)\n",
    "                if (not self.isEmpty() and self.peek() != '('):\n",
    "                    return -1\n",
    "                else:\n",
    "                    self.pop()\n",
    "\n",
    "            # An operator is encountered\n",
    "            else:\n",
    "                while(not self.isEmpty() and self.notGreater(i)):\n",
    "                    self.output.append(self.pop())\n",
    "                self.push(i)\n",
    "\n",
    "        # Pop all the operator from the stack\n",
    "        while not self.isEmpty():\n",
    "            self.output.append(self.pop())\n",
    "\n",
    "        return \"\".join(self.output)\n",
    "\n",
    "def labels_To_Expression(label_exp):\n",
    "    # 'Microwave oven' * ('Drill' + 'Non-motorized land vehicle' + 'Breaking')\n",
    "    strings = re.findall(r\"'(.*?)'\", label_exp)\n",
    "    notation = ['A','B','C','D']\n",
    "    replacement_map = dict(zip(strings, notation[:len(strings)]))\n",
    "    reverse_replacement_map = dict(zip(notation[:len(strings)], strings))\n",
    "\n",
    "    def replace_string(match):\n",
    "        string = match.group(1)\n",
    "        return replacement_map.get(string, string)\n",
    "\n",
    "    tokens = re.sub(r\"'(.*?)'\", replace_string, label_exp)\n",
    "    return tokens.replace(' ', ''), reverse_replacement_map\n",
    "\n",
    "def sub_Expression_to_label(sub_Exp, mapping):\n",
    "    translation_table = str.maketrans(mapping)\n",
    "    return sub_Exp.translate(translation_table)\n",
    "\n",
    "def sub_Expressions_to_labels(sub_Exps, mapping):\n",
    "    tokens = []\n",
    "    for sub_Exp in sub_Exps:\n",
    "        tokens.append(sub_Expression_to_label(sub_Exp,mapping))\n",
    "    return tokens\n",
    "\n",
    "def expression_to_text(sub_exps, xx=None, yy=None):\n",
    "    if xx is None or yy is None or len(xx)==0 or len(yy)==0:\n",
    "        xx = [' followed by', ' ends with', ' before', ' and later', ' and then']\n",
    "        yy = [' overlayed by', ' and', ' with', ' accompanied by', ' surrounded by', ' amidst by']\n",
    "    sub_exps_text = []\n",
    "    for sub_exp in sub_exps:\n",
    "        temp = ''\n",
    "        for i in sub_exp:\n",
    "            if i in '()':\n",
    "                continue\n",
    "            elif i=='+':\n",
    "                temp += random.choice(xx)\n",
    "            elif i=='*':\n",
    "                temp += random.choice(yy)\n",
    "            else:\n",
    "                temp += ' ' + i\n",
    "        sub_exps_text.append(temp.strip())\n",
    "    return sub_exps_text\n",
    "\n",
    "def all_possible_texts(exp):\n",
    "    all_possible = []\n",
    "    for xx in [' followed by', ' ends with', ' before', ' and later', ' and then', \" proceeded by\", \" succeeded by\", ]:\n",
    "        for yy in [' overlayed by', ' and', ' with', ' accompanied by', ' surrounded by', ' amidst by', \" coupled with\" ]:\n",
    "            temp = ''\n",
    "            for i in exp:\n",
    "                if i in '()':\n",
    "                    continue\n",
    "                elif i=='+':\n",
    "                    temp += xx\n",
    "                elif i=='*':\n",
    "                    temp += yy\n",
    "                else:\n",
    "                    temp += ' ' + i\n",
    "            all_possible.append([temp.strip(),xx,yy])\n",
    "    # return random.sample(all_possible, 25)\n",
    "    return all_possible\n",
    "\n",
    "def reverse_it(word, ind):\n",
    "\n",
    "    if word[ind+1] not in '()':\n",
    "        right_string = word[ind+1]\n",
    "    else:\n",
    "        right_string = '('\n",
    "        count = 1\n",
    "        temp = ind+2\n",
    "        while(count):\n",
    "            right_string+=word[temp]\n",
    "            if word[temp] not in '()':\n",
    "                temp+=1\n",
    "                continue\n",
    "            if word[temp]=='(':\n",
    "                count+=1\n",
    "            else:\n",
    "                count-=1\n",
    "            temp+=1\n",
    "    if word[ind-1] not in '()':\n",
    "        left_string = word[ind-1]\n",
    "    else:\n",
    "        left_string = ')'\n",
    "        count = 1\n",
    "        temp = ind-2\n",
    "        while(count):\n",
    "            left_string=word[temp] + left_string\n",
    "            if word[temp] not in '()':\n",
    "                temp-=1\n",
    "                continue\n",
    "            if word[temp]==')':\n",
    "                count+=1\n",
    "            else:\n",
    "                count-=1\n",
    "            temp-=1\n",
    "\n",
    "    res1 = ''.join(random.choices(string.digits, k=10))\n",
    "    res2 = ''.join(random.choices(string.digits, k=10))\n",
    "    word = word.replace(left_string, res1)\n",
    "    word = word.replace(right_string, res2)\n",
    "    word = word.replace(res2, left_string)\n",
    "    word = word.replace(res1, right_string)\n",
    "    # print(word)\n",
    "    return word\n",
    "    \n",
    "def findall(a, b):\n",
    "    save = []\n",
    "    for xxx in range(len(a)):\n",
    "        if a[xxx]==b:\n",
    "            save.append(xxx)\n",
    "    return save\n",
    "\n",
    "from copy import deepcopy as dc\n",
    "    \n",
    "def make_negatives(exps):\n",
    "    all_negs = []\n",
    "    for i in range(len(exps)):\n",
    "        findall_temp1 = findall(exps[i], '+')\n",
    "        findall_temp2 = findall(exps[i], '*')\n",
    "\n",
    "        for j in findall_temp1:\n",
    "            temp = dc(exps[i])\n",
    "            temp = temp[:j] + '*' + temp[j+1:]\n",
    "            # temp[j] = '*'\n",
    "            all_negs.append(temp)\n",
    "\n",
    "        for j in findall_temp2:\n",
    "            temp = dc(exps[i])\n",
    "            temp = temp[:j] + '+' + temp[j+1:]\n",
    "            all_negs.append(temp)\n",
    "            \n",
    "        \n",
    "        if '+' in exps[i]:\n",
    "            ind = []\n",
    "            for xx in range(len(exps[i])):\n",
    "                if exps[i][xx]=='+':\n",
    "                    ind.append(xx)\n",
    "            for j in ind:\n",
    "                all_negs.append(reverse_it(exps[i],j))\n",
    "                \n",
    "    return list(set(all_negs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "138b5c60-1f1d-4736-b687-d8367ef00503",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import yaml\n",
    "a = pd.read_csv('final_audio (1).csv')\n",
    "a.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c07ccbfe-1c47-4ca3-933d-b9ba799e8983",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_exp = list(set(a['sub_exp'].values))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5792f1b4-7f83-4cfe-a630-66db995eb410",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(all_exp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "579dd3f5-1564-494c-8646-12b263a25f27",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f11d670-b1ac-4e53-970a-4bdc8b5e2435",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('global_sound (1).yml', 'r') as file:\n",
    "    dict_temp = yaml.safe_load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a46dedf8-bbb4-4926-926a-a0df9b5c48d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_labels = []\n",
    "for i in dict_temp.keys():\n",
    "    all_labels += dict_temp[i]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9dec4ca0-fd21-4f2d-943f-b0e5d1d7c87b",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_labels.sort(key=len, reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c344403a-de80-4618-9d24-9391522ca62b",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48dc3763-4c86-472e-a082-2a8569a3ba7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01767e25-8560-499d-b4e3-fb5c1d4f25b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(all_exp)):\n",
    "    temp = {}\n",
    "    for j in all_labels:\n",
    "        if j in all_exp[i]:\n",
    "            res = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))\n",
    "            temp[res] = j\n",
    "            all_exp[i] = all_exp[i].replace(j, res)\n",
    "\n",
    "    for j in temp.keys():\n",
    "        all_exp[i] = all_exp[i].replace(j, \"'\"+temp[j]+\"'\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "207df513-36c5-48ec-a50d-3c3e6fff035b",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(all_exp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60c9d301-d0c3-40be-9f9a-6890927d2a30",
   "metadata": {},
   "outputs": [],
   "source": [
    "final = {}\n",
    "for i in all_exp:\n",
    "    # print('expression: ', i)\n",
    "    s, reverse_replacement_map = labels_To_Expression(i)\n",
    "    obj = Conversion(len(s))\n",
    "    s = obj.infixToPostfix(s)\n",
    "\n",
    "    stack = Stack()\n",
    "    for c in s:\n",
    "        if c in \"+-*/^\":\n",
    "            z = Node(c)\n",
    "            x = stack.pop()\n",
    "            y = stack.pop()\n",
    "            z.left = y\n",
    "            z.right = x\n",
    "            stack.push(z)\n",
    "        else:\n",
    "            stack.push(Node(c))\n",
    "    tree = ExpressionTree()\n",
    "    sub_exps = list(set(tree.get_subexpressions(stack.pop())))\n",
    "    sub_exps.sort(key=len, reverse=True)\n",
    "\n",
    "    caption = sub_exps[0]\n",
    "    if 'A' not in caption or 'B' not in caption:\n",
    "        continue\n",
    "    positives = sub_exps[1:]\n",
    "    if caption=='(A*B)':\n",
    "        negatives = ['B+A']\n",
    "        positives.append('B*A')\n",
    "    else:\n",
    "        negatives = make_negatives(sub_exps)\n",
    "        negatives = [nn for nn in negatives if nn is not caption]\n",
    "\n",
    "    if len(caption)>5:\n",
    "        positives = [xx for xx in positives if len(xx)!=1]\n",
    "\n",
    "    # print('Caption: ', caption)\n",
    "    # print('Positives: ', positives)\n",
    "    # print('Negatives: ',negatives)\n",
    "    # print()\n",
    "\n",
    "    final[i] = {}\n",
    "    final[i]['captions'] = {}\n",
    "\n",
    "    for c in all_possible_texts(caption):\n",
    "        new_cap = sub_Expression_to_label(c[0], reverse_replacement_map)\n",
    "        final[i]['captions'][new_cap] = {}\n",
    "        final[i]['captions'][new_cap]['positives'] = sub_Expressions_to_labels(expression_to_text(positives, [c[1]],[c[2]]),reverse_replacement_map)\n",
    "        final[i]['captions'][new_cap]['negatives'] = sub_Expressions_to_labels(expression_to_text(negatives, [c[1]],[c[2]]),reverse_replacement_map)\n",
    "\n",
    "    # print(json.dumps(final,sort_keys=True, indent=4))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f88b003-e55a-4a9a-9dc2-22b7cf74dba2",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('final.json', 'w') as outfile:\n",
    "    json.dump(final, outfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efaf6462-cc94-4102-9035-940ab5768a86",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0506169c-7a58-412e-974e-67edfc028f39",
   "metadata": {},
   "outputs": [],
   "source": [
    "saving = []\n",
    "new_exps = list(set(a['sub_exp'].values))\n",
    "for xx in tqdm(range(len(new_exps))):\n",
    "    corresponding = all_exp[xx]\n",
    "    if corresponding in final.keys():\n",
    "        new_a = a[a['sub_exp']==new_exps[xx]].values\n",
    "        test = random.sample(list(range(len(new_a))), min(len(final[corresponding]['captions'].keys()), len(new_a)))\n",
    "        test = new_a[test]\n",
    "        for yy in range(len(test)):\n",
    "            caption = list(final[corresponding]['captions'].keys())[yy]\n",
    "            positives = final[corresponding]['captions'][caption]['positives'][:5]\n",
    "            negatives = final[corresponding]['captions'][caption]['negatives'][:5]\n",
    "            t = ';'.join([caption] + positives + negatives)\n",
    "            saving.append([test[yy][0], t, len(positives), len(negatives), \"synthetic\", \"train\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "069af6da-ddc7-411f-890a-7684ce763603",
   "metadata": {},
   "outputs": [],
   "source": [
    "saving"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ad18d69-8a15-4dad-9924-e2770173a58d",
   "metadata": {},
   "outputs": [],
   "source": [
    "saving_df = pd.DataFrame(saving, columns=['path', 'caption', 'num_of_pos', 'num_of_neg', 'dataset', 'split_name'])\n",
    "saving_df.to_csv('final_new.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91220bbf-807f-4e23-acfa-cd29d67620cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(saving)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ef2bbcf-852a-41ea-9da2-356e238e918d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
