# --------------------- laion ---------------------

# Download the laion-art images as webdataset shards. This assumes the
# metadata (passed below as --url_list laion-art-en) has already been
# downloaded.
# cd laion-art
#
# Options, in order:
#   input   - parquet metadata; image URL in column "URL", caption in "TEXT"
#   output  - webdataset format, written to laion-high-resolution-en
#   workers - processes_count 16, thread_count 64
#   images  - image_size 224, min_image_size 128, max_aspect_ratio 1.5,
#             resize only if bigger, "keep_ratio" resize mode, skip re-encode
#   extras  - additional metadata columns are saved; wandb is enabled
img2dataset \
  --url_list laion-art-en \
  --input_format "parquet" \
  --url_col "URL" \
  --caption_col "TEXT" \
  --output_format webdataset \
  --output_folder laion-high-resolution-en \
  --processes_count 16 \
  --thread_count 64 \
  --image_size 224 \
  --min_image_size 128 \
  --max_aspect_ratio 1.5 \
  --resize_only_if_bigger=True \
  --resize_mode="keep_ratio" \
  --skip_reencode=True \
  --save_additional_columns '["similarity","hash","punsafe","pwatermark","aesthetic","LANGUAGE"]' \
  --enable_wandb True

# Prior dataloader step: compute CLIP (ViT-L/14) embeddings of the downloaded
# images and write them to clip_emb, with metadata enabled.
# The shard pattern is double-quoted, so the shell does not brace-expand it;
# it reaches clip-retrieval as the literal string "{00000..328}.tar"
# (presumably expanded by the tool itself).
# cd laion-high-resolution-en

clip-retrieval inference \
  --input_dataset "{00000..328}.tar" \
  --output_folder clip_emb \
  --input_format "webdataset" \
  --enable_metadata True \
  --clip_model ViT-L/14 \
  --enable_wandb True


# Decoder dataloader step: the image embeddings must be reordered so that they
# pair up with the raw image data. First run the example-key subcommand on the
# metadata folder to find the key layout.
# cd laion-high-resolution-en/clip_emb/metadata
reorder-embeddings example-key \
  --metadata-folder .
# The output shows that the shard string width is 5 and the index string width is 4.

# Reorder the image and text embeddings into img_emb_reorder and
# text_emb_reorder. The shard width (5) and index width (4) are the values
# reported by the example-key step.
# cd laion-high-resolution-en/clip_emb
for emb_kind in img_emb text_emb; do
  # -p keeps this step re-runnable: a plain mkdir reports an error when the
  # output directory is left over from a previous run.
  mkdir -p "${emb_kind}_reorder"

  reorder-embeddings reorder \
    --embeddings-folder "${emb_kind}" \
    --metadata-folder metadata \
    --output-shard-width 5 \
    --index-width 4 \
    --output-folder "${emb_kind}_reorder" \
    --verbose True \
    --limit 330
done

# --------------------- cc3m_no_watermark ---------------------

# Download the cc3m dataset as webdataset shards.
# Follow the instructions at
# https://github.com/rom1504/img2dataset/blob/main/dataset_examples/cc3m.md
# To fetch the tsv file, copy its link address and paste it into the browser's
# address bar.
# The download takes roughly 2h to 4h.
#
# Options, in order:
#   input   - tsv file; image URL in column "url", caption in "caption"
#   output  - webdataset format, written to cc3m_no_watermark
#   workers - processes_count 16, thread_count 64
#   images  - image_size 224, min_image_size 128, max_aspect_ratio 1.5,
#             resize only if bigger, "keep_ratio" resize mode, skip re-encode
#   extras  - wandb is enabled
img2dataset \
  --url_list cc3m_no_watermark.tsv \
  --input_format "tsv" \
  --url_col "url" \
  --caption_col "caption" \
  --output_format webdataset \
  --output_folder cc3m_no_watermark \
  --processes_count 16 \
  --thread_count 64 \
  --image_size 224 \
  --min_image_size 128 \
  --max_aspect_ratio 1.5 \
  --resize_only_if_bigger=True \
  --resize_mode="keep_ratio" \
  --skip_reencode=True \
  --enable_wandb True

# Prior dataloader step: compute CLIP (ViT-L/14) embeddings of the downloaded
# images and write them to clip_emb, with metadata enabled. Takes about 2h.
# The shard pattern is double-quoted, so the shell does not brace-expand it;
# it reaches clip-retrieval as the literal string "{00000..167}.tar"
# (presumably expanded by the tool itself).
# cd cc3m_no_watermark
clip-retrieval inference \
  --input_dataset "{00000..167}.tar" \
  --output_folder clip_emb \
  --input_format "webdataset" \
  --enable_metadata True \
  --clip_model ViT-L/14 \
  --enable_wandb True

# Reorder the image and text embeddings into img_emb_reorder and
# text_emb_reorder. Takes about 30s.
# cd clip_emb
#
# -p keeps this step re-runnable: with plain `mkdir a && mkdir b`, an existing
# first directory makes mkdir fail and the second directory is never created.
mkdir -p img_emb_reorder text_emb_reorder

# NOTE(review): the inference step reads shards 00000..167 (168 shards) while
# --limit is 167 here; the laion section uses a limit above its shard count.
# Confirm against the reorder-embeddings docs whether this should be 168.
for emb_kind in img_emb text_emb; do
  reorder-embeddings reorder \
    --embeddings-folder "${emb_kind}" \
    --metadata-folder metadata \
    --output-shard-width 5 \
    --index-width 4 \
    --output-folder "${emb_kind}_reorder" \
    --verbose True \
    --limit 167
done
