{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from shutil import copy\n",
    "import librosa\n",
    "import numpy as np\n",
    "import json\n",
    "import wget\n",
    "import requests\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "108317\n",
      "[{'id': 'Yb0RFKhbpFJA.wav', 'caption': 'Wind and a man speaking are heard, accompanied by buzzing and ticking.', 'audio': 'wav_path', 'duration': 10.0}, {'id': 'YNQNTnl0zaqU.wav', 'caption': 'Objects are hit repeatedly while a cash register opens and a power tool is used with a train horn in the background.', 'audio': 'wav_path', 'duration': 10.0}, {'id': 'Y4PPmyY_-YrA.wav', 'caption': 'A hair dryer is heard with ticking.', 'audio': 'wav_path', 'duration': 10.0}, {'id': 'YLvNUyQ3xuAQ.wav', 'caption': 'A dial tone is heard.', 'audio': 'wav_path', 'duration': 6.22296875}, {'id': 'YXMl9lI7mKsM.wav', 'caption': 'Men are singing, shouting, and music can be heard.', 'audio': 'wav_path', 'duration': 10.0}]\n"
     ]
    }
   ],
   "source": [
    "# Load the WavCaps AudioSet_SL caption manifest. The records live under the\n",
    "# top-level \"data\" key; the fields used later in this notebook are \"id\"\n",
    "# and \"caption\".\n",
    "audioset_sl_json_file = \"/home/v-xxxx/DiffAudioImg/WavCaps/data/json_files/AudioSet_SL/as_final.json\"\n",
    "# Pin the encoding: open() otherwise falls back to the locale default, so the\n",
    "# captions could decode differently (or fail) on another machine.\n",
    "with open(audioset_sl_json_file, \"r\", encoding=\"utf-8\") as f:\n",
    "    # Index into \"data\" right away so this name only ever refers to the list of\n",
    "    # records, never to the enclosing dict.\n",
    "    audioset_sl_infos = json.load(f)[\"data\"]\n",
    "print(len(audioset_sl_infos))\n",
    "print(audioset_sl_infos[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Full ids exactly as stored in the manifest, used for membership tests.\n",
    "audioset_sl_ids = {entry[\"id\"] for entry in audioset_sl_infos}\n",
    "# Captions keyed by the id with its first character dropped.\n",
    "audioset_sl_captions = {\n",
    "    entry[\"id\"][1:]: entry[\"caption\"] for entry in audioset_sl_infos\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "audioset_path = \"/blob/v-xxxx/speech/audioset\"\n",
    "audioset_sl_paths = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scan every metadata file and remember where each WavCaps clip lives.\n",
    "# Each row is tab-separated and its first field is the wav path.\n",
    "as_meta_path = \"/blob/v-xxxx/speech/audioset_full_split_meta\"\n",
    "audioset_full_split_metas = os.listdir(as_meta_path)\n",
    "for meta in audioset_full_split_metas:\n",
    "    meta_file = os.path.join(as_meta_path, meta)\n",
    "    # os.listdir also returns sub-directories; open() would raise on those.\n",
    "    if not os.path.isfile(meta_file):\n",
    "        continue\n",
    "    with open(meta_file, \"r\") as f:\n",
    "        # Stream the file rather than loading every line with readlines().\n",
    "        for line in f:\n",
    "            # Drop the line terminator before splitting: on a row without a tab\n",
    "            # the first field is the whole line, and a trailing \"\\n\" would\n",
    "            # end up in the id and prevent it from ever matching.\n",
    "            wav_path = line.rstrip(\"\\n\").split(\"\\t\", 1)[0]\n",
    "            if not wav_path:\n",
    "                continue  # blank row, nothing to match\n",
    "            wav_id = wav_path.rsplit(\"/\", 1)[-1]\n",
    "            # Manifest ids are the file name with a \"Y\" in front.\n",
    "            if \"Y\" + wav_id in audioset_sl_ids:\n",
    "                audioset_sl_paths[wav_id] = wav_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "79147\n"
     ]
    }
   ],
   "source": [
    "# How many manifest clips were found in the metadata files.\n",
    "matched_clip_count = len(audioset_sl_paths)\n",
    "print(matched_clip_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8UnzSTvPvE8.wav A helicopter propeller makes a lot of noise.\n",
      "JGHiZqmryk8.wav People laugh, breathe, burp, and speak.\n",
      "kycvGRgwAhU.wav People converse and mechanisms sound.\n",
      "M3GLUcIzQ_o.wav Wind noise, breathing, and vehicle horns can be heard in a soundscape with ticking.\n",
      "6CSiXm1CEAY.wav Music plays with breathing.\n",
      "Hiv3mRqX5Mo.wav Wind noise is heard.\n",
      "pWg2QDjgijc.wav Music is playing.\n",
      "HKxjlSOh7N0.wav A speech synthesizer and video game sounds lead to a boing.\n",
      "AmComh5kkuA.wav A man speaking, music, and more men speaking are heard.\n",
      "MYZPAZ6DqbQ.wav Women speak while whip cracks, wind blows, and birds chirp intermittently.\n"
     ]
    }
   ],
   "source": [
    "# Spot-check: show the caption attached to the first ten matched clips.\n",
    "preview_ids = list(audioset_sl_paths)[:10]\n",
    "for wav_id in preview_ids:\n",
    "    caption = audioset_sl_captions[wav_id]\n",
    "    print(wav_id, caption)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 43/79147 [00:05<3:29:10,  6.30it/s]"
     ]
    }
   ],
   "source": [
    "# Copy every matched clip into the WavCaps wav folder (best-effort).\n",
    "wavcaps_wav_dir = \"/blob/v-xxxx/WavCaps/AudioSet_SL/wav\"\n",
    "# Create the target up front: if it is missing, every single copy fails.\n",
    "os.makedirs(wavcaps_wav_dir, exist_ok=True)\n",
    "\n",
    "failed_copies = {}  # wav_id -> error text, kept for inspection or a retry\n",
    "for wav_id in tqdm(list(audioset_sl_paths)):\n",
    "    src = os.path.join(audioset_path, audioset_sl_paths[wav_id])\n",
    "    dst = os.path.join(wavcaps_wav_dir, wav_id)\n",
    "    try:\n",
    "        copy(src, dst)\n",
    "    except OSError as err:\n",
    "        # A failed file is still skipped, but only filesystem errors are\n",
    "        # caught: a bare except would also swallow KeyboardInterrupt and make\n",
    "        # this hours-long loop impossible to stop.\n",
    "        failed_copies[wav_id] = str(err)\n",
    "print(len(failed_copies), \"of\", len(audioset_sl_paths), \"copies failed\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "control",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
