{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image generation", "api_name": "stabilityai/stable-diffusion-2-base", "api_call": "StableDiffusionPipeline.from_pretrained('stabilityai/stable-diffusion-2-base', scheduler=EulerDiscreteScheduler.from_pretrained('stabilityai/stable-diffusion-2-base', subfolder='scheduler'), torch_dtype=torch.float16)", "performance": {"dataset": "COCO2017 validation set", "accuracy": "Not optimized for FID scores"}, "description": "Stable Diffusion v2-base is a diffusion-based text-to-image generation model trained on a subset of the LAION-5B dataset. It can be used to generate and modify images based on text prompts. The model uses a fixed, pretrained text encoder (OpenCLIP-ViT/H) and is intended for research purposes only.", "model_name": "stabilityai/stable-diffusion-2-base"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221122-014502", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-221122-014502')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.3476, "Mae": 0.2763, "Rmse": 0.4088, "Abs Rel": 0.33080000000000004, "Log Mae": 0.11610000000000001, "Log Rmse": 0.17, "Delta1": 0.5682, "Delta2": 0.8301000000000001, "Delta3": 0.9279000000000001}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset. It achieves depth estimation with various performance metrics.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221122-014502"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "emotion", "api_name": "bhadresh-savani/distilbert-base-uncased-emotion", "api_call": "pipeline('text-classification', model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)", "performance": {"dataset": "Twitter-Sentiment-Analysis", "accuracy": 0.9380000000000001}, "description": "DistilBERT is created with knowledge distillation during the pre-training phase, which reduces the size of a BERT model by 40% while retaining 97% of its language understanding. It is smaller and faster than BERT and any other BERT-based model. This checkpoint is distilbert-base-uncased fine-tuned on the emotion dataset using the Hugging Face Trainer.", "model_name": "bhadresh-savani/distilbert-base-uncased-emotion"}
{"domain": "Audio Text-to-Speech", "framework": "speechbrain", "functionality": "Text-to-Speech", "api_name": "padmalcom/tts-tacotron2-german", "api_call": "Tacotron2.from_hparams(source='padmalcom/tts-tacotron2-german')", "performance": {"dataset": "custom german dataset", "accuracy": "Not provided"}, "description": "Text-to-Speech (TTS) with Tacotron2 trained on a custom German dataset with 12 days of voice using speechbrain. Trained for 39 epochs (English speechbrain models are trained for 750 epochs) so there is room for improvement and the model is most likely to be updated soon. The hifigan vocoder can fortunately be used language-independently.", "model_name": "padmalcom/tts-tacotron2-german"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "MFawad/sd-class-butterflies-32", "api_call": "DDPMPipeline.from_pretrained('MFawad/sd-class-butterflies-32')", "performance": {"dataset": "", "accuracy": ""}, "description": "This model is a diffusion model for unconditional image generation of cute \ud83e\udd8b.", "model_name": "MFawad/sd-class-butterflies-32"}
{"domain": "Audio Text-to-Speech", "framework": "Fairseq", "functionality": "Text-to-Speech", "api_name": "facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10", "api_call": "unit.TTS.from_pretrained('facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10')", "performance": {"dataset": "covost2", "accuracy": null}, "description": "A text-to-speech model trained on multiple datasets including mtedx, covost2, europarl_st, and voxpopuli. Supports English, Spanish, French, and Italian languages.", "model_name": "facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "layoutlm-invoices", "api_call": "AutoModelForDocumentQuestionAnswering.from_pretrained('impira/layoutlm-invoices')", "performance": {"dataset": "proprietary dataset of invoices, SQuAD2.0, and DocVQA", "accuracy": "Not provided"}, "description": "A fine-tuned version of the multi-modal LayoutLM model for the task of question answering on invoices and other documents. It has been fine-tuned on a proprietary dataset of invoices as well as both SQuAD2.0 and DocVQA for general comprehension. Unlike other QA models, which can only extract consecutive tokens, this model can predict longer-range, non-consecutive sequences with an additional classifier head.", "model_name": "impira/layoutlm-invoices"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "azwierzc/vilt-b32-finetuned-vqa-pl", "api_call": "pipeline('visual-question-answering', model='azwierzc/vilt-b32-finetuned-vqa-pl')", "performance": {"dataset": "", "accuracy": ""}, "description": "A Visual Question Answering model fine-tuned on the Polish language.", "model_name": "azwierzc/vilt-b32-finetuned-vqa-pl"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/maskformer-swin-base-ade", "api_call": "MaskFormerForInstanceSegmentation.from_pretrained('facebook/maskformer-swin-base-ade')", "performance": {"dataset": "ADE20k", "accuracy": "Not provided"}, "description": "MaskFormer model trained on ADE20k semantic segmentation (base-sized version, Swin backbone). It was introduced in the paper Per-Pixel Classification is Not All You Need for Semantic Segmentation and first released in this repository. This model addresses instance, semantic and panoptic segmentation with the same paradigm: by predicting a set of masks and corresponding labels. Hence, all 3 tasks are treated as if they were instance segmentation.", "model_name": "facebook/maskformer-swin-base-ade"}
{"domain": "Tabular Tabular Classification", "framework": "Hugging Face Transformers", "functionality": "Tabular Classification", "api_name": "datadmg/autotrain-test-news-44534112235", "api_call": "AutoModel.from_pretrained('datadmg/autotrain-test-news-44534112235')", "performance": {"dataset": "datadmg/autotrain-data-test-news", "accuracy": 0.333}, "description": "This model is trained for Multi-class Classification on CO2 Emissions dataset. It uses the Hugging Face Transformers framework and is based on the extra_trees algorithm. The model is trained with AutoTrain and has a tabular classification functionality.", "model_name": "datadmg/autotrain-test-news-44534112235"}
{"domain": "Audio Audio-to-Audio", "framework": "Fairseq", "functionality": "speech-to-speech-translation", "api_name": "facebook/xm_transformer_sm_all-en", "api_call": "pipeline('translation', model='facebook/xm_transformer_sm_all-en')", "performance": {"dataset": "", "accuracy": ""}, "description": "A speech-to-speech translation model that can be loaded on the Inference API on-demand.", "model_name": "facebook/xm_transformer_sm_all-en"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Text Classification", "api_name": "lvwerra/distilbert-imdb", "api_call": "pipeline('sentiment-analysis', model='lvwerra/distilbert-imdb')", "performance": {"dataset": "imdb", "accuracy": 0.928}, "description": "This model is a fine-tuned version of distilbert-base-uncased on the imdb dataset. It is used for sentiment analysis on movie reviews and achieves an accuracy of 0.928 on the evaluation set.", "model_name": "lvwerra/distilbert-imdb"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face", "functionality": "Zero-Shot Image Classification", "api_name": "laion/CLIP-ViT-B-16-laion2B-s34B-b88K", "api_call": "pipeline('image-classification', model='laion/CLIP-ViT-B-16-laion2B-s34B-b88K')", "performance": {"dataset": "ImageNet-1k", "accuracy": "70.2%"}, "description": "A CLIP ViT-B/16 model trained with the LAION-2B English subset of LAION-5B using OpenCLIP. This model is intended for research purposes and can be used for zero-shot image classification, image and text retrieval, and other related tasks.", "model_name": "laion/CLIP-ViT-B-16-laion2B-s34B-b88K"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Image-to-Image", "api_name": "GreeneryScenery/SheepsControlV3", "api_call": "pipeline('image-to-image', model='GreeneryScenery/SheepsControlV3')", "performance": {"dataset": "GreeneryScenery/SheepsControlV3", "accuracy": "Not provided"}, "description": "GreeneryScenery/SheepsControlV3 is a model for image-to-image tasks. It can be used to generate images based on the input image and optional text guidance. The model has some limitations, such as the conditioning image not affecting the output image much. Improvements can be made by training for more epochs, using better prompts, and preprocessing the data.", "model_name": "GreeneryScenery/SheepsControlV3"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "seungwon12/layoutlmv2-base-uncased_finetuned_docvqa", "api_call": "pipeline('question-answering', model='seungwon12/layoutlmv2-base-uncased_finetuned_docvqa', tokenizer='seungwon12/layoutlmv2-base-uncased_finetuned_docvqa')", "performance": {"dataset": "DocVQA", "accuracy": ""}, "description": "A document question answering model finetuned on the DocVQA dataset using LayoutLMv2-base-uncased.", "model_name": "seungwon12/layoutlmv2-base-uncased_finetuned_docvqa"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "MCG-NJU/videomae-base", "api_call": "VideoMAEForPreTraining.from_pretrained('MCG-NJU/videomae-base')", "performance": {"dataset": "Kinetics-400", "accuracy": "To be provided"}, "description": "VideoMAE is an extension of Masked Autoencoders (MAE) to video. The architecture of the model is very similar to that of a standard Vision Transformer (ViT), with a decoder on top for predicting pixel values for masked patches.", "model_name": "MCG-NJU/videomae-base"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Fill-Mask", "api_name": "distilbert-base-uncased", "api_call": "pipeline('fill-mask', model='distilbert-base-uncased')", "performance": {"dataset": "GLUE", "accuracy": {"MNLI": 82.2, "QQP": 88.5, "QNLI": 89.2, "SST-2": 91.3, "CoLA": 51.3, "STS-B": 85.8, "MRPC": 87.5, "RTE": 59.9}}, "description": "DistilBERT is a transformers model, smaller and faster than BERT, which was pretrained on the same corpus in a self-supervised fashion, using the BERT base model as a teacher. It was pretrained with three objectives: Distillation loss, Masked language modeling (MLM), and Cosine embedding loss. This model is uncased and can be used for masked language modeling or next sentence prediction, but it's mostly intended to be fine-tuned on a downstream task.", "model_name": "distilbert-base-uncased"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "clip-vit-base-patch32-ko", "api_call": "pipeline('zero-shot-image-classification', model='Bingsu/clip-vit-base-patch32-ko')", "performance": {"dataset": "AIHUB", "accuracy": "Not provided"}, "description": "Korean CLIP model trained by Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation. It is a zero-shot image classification model that can be used to classify images without any training data.", "model_name": "Bingsu/clip-vit-base-patch32-ko"}
{"domain": "Natural Language Processing Token Classification", "framework": "Flair", "functionality": "Part-of-Speech Tagging", "api_name": "flair/upos-english", "api_call": "SequenceTagger.load('flair/upos-english')", "performance": {"dataset": "ontonotes", "accuracy": 98.6}, "description": "This is the standard universal part-of-speech tagging model for English that ships with Flair. It predicts universal POS tags such as ADJ, ADP, ADV, AUX, CCONJ, DET, INTJ, NOUN, NUM, PART, PRON, PROPN, PUNCT, SCONJ, SYM, VERB, and X. The model is based on Flair embeddings and LSTM-CRF.", "model_name": "flair/upos-english"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Human Pose Estimation", "api_name": "lllyasviel/sd-controlnet-openpose", "api_call": "ControlNetModel.from_pretrained('lllyasviel/sd-controlnet-openpose')", "performance": {"dataset": "200k pose-image, caption pairs", "accuracy": "Not specified"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on Human Pose Estimation. It can be used in combination with Stable Diffusion.", "model_name": "lllyasviel/sd-controlnet-openpose"}
{"domain": "Audio Text-to-Speech", "framework": "ESPnet", "functionality": "Text-to-Speech", "api_name": "kazusam/kt", "api_call": "./run.sh --skip_data_prep false --skip_train true --download_model mio/amadeus", "performance": {"dataset": "amadeus", "accuracy": "Not provided"}, "description": "An ESPnet2 TTS model trained by mio using amadeus recipe in espnet.", "model_name": "mio/amadeus"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "De-identification", "api_name": "StanfordAIMI/stanford-deidentifier-base", "api_call": "pipeline('ner', model='StanfordAIMI/stanford-deidentifier-base')", "performance": {"dataset": "radreports", "accuracy": {"known_institution_F1": 97.9, "new_institution_F1": 99.6, "i2b2_2006_F1": 99.5, "i2b2_2014_F1": 98.9}}, "description": "Stanford de-identifier was trained on a variety of radiology and biomedical documents with the goal of automatising the de-identification process while reaching satisfactory accuracy for use in production.", "model_name": "StanfordAIMI/stanford-deidentifier-base"}
{"domain": "Reinforcement Learning", "framework": "Stable-Baselines3", "functionality": "LunarLander-v2", "api_name": "araffin/dqn-LunarLander-v2", "api_call": "DQN.load(load_from_hub('araffin/dqn-LunarLander-v2', 'dqn-LunarLander-v2.zip'), **kwargs)", "performance": {"dataset": "LunarLander-v2", "accuracy": "280.22 +/- 13.03"}, "description": "This is a trained model of a DQN agent playing LunarLander-v2 using the stable-baselines3 library.", "model_name": "araffin/dqn-LunarLander-v2"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Multilingual Question Answering", "api_name": "mrm8488/bert-multi-cased-finetuned-xquadv1", "api_call": "pipeline('question-answering', model='mrm8488/bert-multi-cased-finetuned-xquadv1', tokenizer='mrm8488/bert-multi-cased-finetuned-xquadv1')", "performance": {"dataset": "XQuAD", "accuracy": "Not provided"}, "description": "This model is a BERT (base-multilingual-cased) fine-tuned for multilingual Question Answering on 11 different languages using the XQuAD dataset and additional data augmentation techniques.", "model_name": "mrm8488/bert-multi-cased-finetuned-xquadv1"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Named Entity Recognition", "api_name": "flair/ner-english", "api_call": "SequenceTagger.load('flair/ner-english')", "performance": {"dataset": "conll2003", "accuracy": "93.06"}, "description": "This is the standard 4-class NER model for English that ships with Flair. It predicts 4 tags: PER (person name), LOC (location name), ORG (organization name), and MISC (other name). The model is based on Flair embeddings and LSTM-CRF.", "model_name": "flair/ner-english"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "shibing624/text2vec-base-chinese", "api_call": "SentenceModel('shibing624/text2vec-base-chinese')", "performance": {"dataset": [{"name": "ATEC", "accuracy": "31.93"}, {"name": "BQ", "accuracy": "42.67"}, {"name": "LCQMC", "accuracy": "70.16"}, {"name": "PAWSX", "accuracy": "17.21"}, {"name": "STS-B", "accuracy": "79.30"}]}, "description": "This is a CoSENT(Cosine Sentence) model: shibing624/text2vec-base-chinese. It maps sentences to a 768 dimensional dense vector space and can be used for tasks like sentence embeddings, text matching or semantic search.", "model_name": "shibing624/text2vec-base-chinese"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "Linaqruf/anything-v3.0", "api_call": "Text2ImagePipeline(model='Linaqruf/anything-v3.0')", "performance": {"dataset": "", "accuracy": ""}, "description": "A text-to-image model that generates images from text descriptions.", "model_name": "Linaqruf/anything-v3.0"}
{"domain": "Reinforcement Learning", "framework": "Stable-Baselines3", "functionality": "MountainCar-v0", "api_name": "sb3/dqn-MountainCar-v0", "api_call": "load_from_hub(repo_id='sb3/dqn-MountainCar-v0',filename='{MODEL FILENAME}.zip',)", "performance": {"dataset": "MountainCar-v0", "accuracy": "-103.40 +/- 7.49"}, "description": "This is a trained model of a DQN agent playing MountainCar-v0 using the stable-baselines3 library and the RL Zoo. The RL Zoo is a training framework for Stable Baselines3 reinforcement learning agents, with hyperparameter optimization and pre-trained agents included.", "model_name": "sb3/dqn-MountainCar-v0"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Conversational", "api_name": "af1tang/personaGPT", "api_call": "AutoModelForCausalLM.from_pretrained('af1tang/personaGPT')", "performance": {"dataset": "Persona-Chat", "accuracy": "Not provided"}, "description": "PersonaGPT is an open-domain conversational agent designed to do 2 tasks: decoding personalized responses based on input personality facts (the persona profile of the bot) and incorporating turn-level goals into its responses through action codes (e.g., talk about work, ask about favorite music). It builds on the DialoGPT-medium pretrained model based on the GPT-2 architecture.", "model_name": "af1tang/personaGPT"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "lmazzon70/videomae-large-finetuned-kinetics-finetuned-rwf2000-epochs8-batch8-kl-torch2", "api_call": "AutoModelForVideoClassification.from_pretrained('lmazzon70/videomae-large-finetuned-kinetics-finetuned-rwf2000-epochs8-batch8-kl-torch2')", "performance": {"dataset": "unknown", "accuracy": 0.7212000000000001}, "description": "This model is a fine-tuned version of MCG-NJU/videomae-large-finetuned-kinetics on an unknown dataset.", "model_name": "lmazzon70/videomae-large-finetuned-kinetics-finetuned-rwf2000-epochs8-batch8-kl-torch2"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/tapex-base", "api_call": "BartForConditionalGeneration.from_pretrained('microsoft/tapex-base')", "performance": {"dataset": "arxiv:2107.07653", "accuracy": "Not provided"}, "description": "TAPEX (Table Pre-training via Execution) is a conceptually simple and empirically powerful pre-training approach to empower existing models with table reasoning skills. TAPEX realizes table pre-training by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically synthesizing executable SQL queries.", "model_name": "microsoft/tapex-base"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face", "functionality": "Zero-Shot Image Classification", "api_name": "laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup", "api_call": "pipeline('image-classification', model='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup')", "performance": {"dataset": "ImageNet-1k", "accuracy": "79.1-79.4"}, "description": "A series of CLIP ConvNeXt-XXLarge models trained on LAION-2B (English), a subset of LAION-5B, using OpenCLIP. These models achieve between 79.1 and 79.4 top-1 zero-shot accuracy on ImageNet-1k.", "model_name": "laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Automatic Speech Recognition and Speech Translation", "api_name": "openai/whisper-large-v2", "api_call": "WhisperForConditionalGeneration.from_pretrained('openai/whisper-large-v2')", "performance": {"dataset": "LibriSpeech test-clean", "accuracy": 3.000358308}, "description": "Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalize to many datasets and domains without the need for fine-tuning.", "model_name": "openai/whisper-large-v2"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "keremberke/yolov8s-building-segmentation", "api_call": "YOLO('keremberke/yolov8s-building-segmentation')", "performance": {"dataset": "satellite-building-segmentation", "accuracy": {"mAP@0.5(box)": 0.661, "mAP@0.5(mask)": 0.651}}, "description": "A YOLOv8 model for building segmentation in satellite images. Trained on the satellite-building-segmentation dataset, it can detect and segment buildings with high accuracy.", "model_name": "keremberke/yolov8s-building-segmentation"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "google/vit-base-patch16-224", "api_call": "ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')", "performance": {"dataset": "imagenet-1k", "accuracy": "Not provided"}, "description": "Vision Transformer (ViT) model pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224, and fine-tuned on ImageNet 2012 (1 million images, 1,000 classes) at resolution 224x224. It was introduced in the paper An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale by Dosovitskiy et al.", "model_name": "google/vit-base-patch16-224"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "facebook/detr-resnet-50-panoptic", "api_call": "DetrForSegmentation.from_pretrained('facebook/detr-resnet-50-panoptic')", "performance": {"dataset": "COCO 2017 validation", "accuracy": {"box_AP": 38.8, "segmentation_AP": 31.1, "PQ": 43.4}}, "description": "DEtection TRansformer (DETR) model trained end-to-end on COCO 2017 panoptic (118k annotated images). It was introduced in the paper End-to-End Object Detection with Transformers by Carion et al. and first released in this repository.", "model_name": "facebook/detr-resnet-50-panoptic"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Masked Language Modeling, Next Sentence Prediction", "api_name": "bert-base-uncased", "api_call": "pipeline('fill-mask', model='bert-base-uncased')", "performance": {"dataset": "GLUE", "accuracy": 79.6}, "description": "BERT base model (uncased) is a transformer model pretrained on a large corpus of English data using a masked language modeling (MLM) objective. It can be used for masked language modeling, next sentence prediction, and fine-tuning on downstream tasks such as sequence classification, token classification, or question answering.", "model_name": "bert-base-uncased"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "LayoutLMX_pt_question_answer_ocrazure_correct_V18_08_04_2023", "api_call": "AutoModelForDocumentQuestionAnswering.from_pretrained('L-oenai/LayoutLMX_pt_question_answer_ocrazure_correct_V18_08_04_2023')", "performance": {"dataset": "", "accuracy": ""}, "description": "A LayoutLM model for document question answering.", "model_name": "L-oenai/LayoutLMX_pt_question_answer_ocrazure_correct_V18_08_04_2023"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "hf-tiny-model-private/tiny-random-GLPNForDepthEstimation", "api_call": "AutoModel.from_pretrained('hf-tiny-model-private/tiny-random-GLPNForDepthEstimation')", "performance": {"dataset": "", "accuracy": ""}, "description": "A tiny random GLPN model for depth estimation using the Hugging Face Transformers library.", "model_name": "hf-tiny-model-private/tiny-random-GLPNForDepthEstimation"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text-to-Text Transfer Transformer", "api_name": "google/mt5-base", "api_call": "MT5ForConditionalGeneration.from_pretrained('google/mt5-base')", "performance": {"dataset": "mc4", "accuracy": "Not provided"}, "description": "mT5 is a multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. It leverages a unified text-to-text format and scale to attain state-of-the-art results on a wide variety of multilingual NLP tasks.", "model_name": "google/mt5-base"}
{"domain": "Computer Vision Object Detection", "framework": "Transformers", "functionality": "Object Detection", "api_name": "fcakyon/yolov5s-v7.0", "api_call": "yolov5.load('fcakyon/yolov5s-v7.0')", "performance": {"dataset": "detection-datasets/coco", "accuracy": null}, "description": "Yolov5s-v7.0 is an object detection model trained on the COCO dataset. It can detect objects in images and return their bounding boxes, scores, and categories.", "model_name": "fcakyon/yolov5s-v7.0"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text2Text Generation", "api_name": "DialogLED-base-16384", "api_call": "LEDForConditionalGeneration.from_pretrained('MingZhong/DialogLED-base-16384')", "performance": {"dataset": "arxiv:2109.02492", "accuracy": "Not provided"}, "description": "DialogLED is a pre-trained model for long dialogue understanding and summarization. It builds on the Longformer-Encoder-Decoder (LED) architecture and uses window-based denoising as the pre-training task on a large amount of long dialogue data for further training. Here is a base version of DialogLED, the input length is limited to 16,384 in the pre-training phase.", "model_name": "MingZhong/DialogLED-base-16384"}
{"domain": "Audio Text-to-Speech", "framework": "Fairseq", "functionality": "Text-to-Speech", "api_name": "unit_hifigan_HK_layer12.km2500_frame_TAT-TTS", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/unit_hifigan_HK_layer12.km2500_frame_TAT-TTS')", "performance": {"dataset": "TAT-TTS", "accuracy": "Not provided"}, "description": "Hokkien unit HiFiGAN based vocoder from fairseq. Trained with TAT-TTS data with 4 speakers in Taiwanese Hokkien accent.", "model_name": "facebook/unit_hifigan_HK_layer12.km2500_frame_TAT-TTS"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "openai/clip-vit-base-patch32", "api_call": "CLIPModel.from_pretrained('openai/clip-vit-base-patch32')", "performance": {"dataset": ["Food101", "CIFAR10", "CIFAR100", "Birdsnap", "SUN397", "Stanford Cars", "FGVC Aircraft", "VOC2007", "DTD", "Oxford-IIIT Pet dataset", "Caltech101", "Flowers102", "MNIST", "SVHN", "IIIT5K", "Hateful Memes", "SST-2", "UCF101", "Kinetics700", "Country211", "CLEVR Counting", "KITTI Distance", "STL-10", "RareAct", "Flickr30", "MSCOCO", "ImageNet", "ImageNet-A", "ImageNet-R", "ImageNet Sketch", "ObjectNet (ImageNet Overlap)", "Youtube-BB", "ImageNet-Vid"], "accuracy": "varies"}, "description": "The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner.", "model_name": "openai/clip-vit-base-patch32"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "mywateriswet/ShuanBot", "api_call": "pipeline('conversational', model='mywateriswet/ShuanBot')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "ShuanBot is a conversational chatbot model based on the GPT-2 architecture. It can be used for generating human-like responses in a chat context.", "model_name": "mywateriswet/ShuanBot"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Depth Estimation", "api_name": "glpn-kitti", "api_call": "GLPNForDepthEstimation.from_pretrained('vinvino02/glpn-kitti')", "performance": {"dataset": "KITTI", "accuracy": "Not provided"}, "description": "Global-Local Path Networks (GLPN) model trained on KITTI for monocular depth estimation. It was introduced in the paper Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth by Kim et al. and first released in this repository.", "model_name": "vinvino02/glpn-kitti"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Abstractive Text Summarization", "api_name": "plguillou/t5-base-fr-sum-cnndm", "api_call": "T5ForConditionalGeneration.from_pretrained('plguillou/t5-base-fr-sum-cnndm')", "performance": {"dataset": "cnn_dailymail", "ROUGE-1": 44.5252, "ROUGE-2": 22.652, "ROUGE-L": 29.8866}, "description": "This model is a T5 Transformers model (JDBN/t5-base-fr-qg-fquad) that was fine-tuned in French for abstractive text summarization.", "model_name": "plguillou/t5-base-fr-sum-cnndm"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "tiny-random-DPTForDepthEstimation", "api_call": "DPTForDepthEstimation.from_pretrained('hf-tiny-model-private/tiny-random-DPTForDepthEstimation')", "performance": {"dataset": "", "accuracy": ""}, "description": "A tiny random DPT model for depth estimation using Hugging Face Transformers library.", "model_name": "hf-tiny-model-private/tiny-random-DPTForDepthEstimation"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "MCG-NJU/videomae-large-finetuned-kinetics", "api_call": "VideoMAEForVideoClassification.from_pretrained('MCG-NJU/videomae-large-finetuned-kinetics')", "performance": {"dataset": "Kinetics-400", "accuracy": {"top-1": 84.7, "top-5": 96.5}}, "description": "VideoMAE model pre-trained for 1600 epochs in a self-supervised way and fine-tuned in a supervised way on Kinetics-400. It was introduced in the paper VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training by Tong et al. and first released in this repository.", "model_name": "MCG-NJU/videomae-large-finetuned-kinetics"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Translation, Summarization, Question Answering, Sentiment Analysis", "api_name": "t5-3b", "api_call": "T5ForConditionalGeneration.from_pretrained('t5-3b')", "performance": {"dataset": "c4", "accuracy": "See research paper, Table 14"}, "description": "T5-3B is a Text-To-Text Transfer Transformer (T5) model with 3 billion parameters. It is designed for various NLP tasks such as translation, summarization, question answering, and sentiment analysis. The model is pre-trained on the Colossal Clean Crawled Corpus (C4) and fine-tuned on multiple supervised and unsupervised tasks.", "model_name": "t5-3b"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "google/ddpm-church-256", "api_call": "DDPMPipeline.from_pretrained('google/ddpm-church-256')", "performance": {"dataset": "CIFAR10", "accuracy": {"Inception_score": 9.46, "FID_score": 3.17}}, "description": "Denoising Diffusion Probabilistic Models (DDPM) for high-quality image synthesis. Trained on the unconditional CIFAR10 dataset and 256x256 LSUN. Supports different noise schedulers like scheduling_ddpm, scheduling_ddim, and scheduling_pndm for inference.", "model_name": "google/ddpm-church-256"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli", "api_call": "Wav2Vec2ForCTC.from_pretrained('jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli')", "performance": {"dataset": "librispeech validation set", "accuracy": "4.45%"}, "description": "This checkpoint is a wav2vec2-large model that is useful for generating transcriptions with punctuation. It is intended for use in building transcriptions for TTS models, where punctuation is very important for prosody. This model was created by fine-tuning the facebook/wav2vec2-large-robust-ft-libri-960h checkpoint on the libritts and voxpopuli datasets with a new vocabulary that includes punctuation.", "model_name": "jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "ddpm-cifar10-32", "api_call": "DDPMPipeline.from_pretrained('google/ddpm-cifar10-32')", "performance": {"dataset": "CIFAR10", "accuracy": {"Inception score": 9.46, "FID score": 3.17}}, "description": "Denoising Diffusion Probabilistic Models (DDPM) for high quality image synthesis. Trained on the unconditional CIFAR10 dataset. Supports various discrete noise schedulers such as scheduling_ddpm, scheduling_ddim, and scheduling_pndm.", "model_name": "google/ddpm-cifar10-32"}
{"domain": "Natural Language Processing Text Generation", "framework": "Transformers", "functionality": "Text Generation", "api_name": "gpt2-large", "api_call": "pipeline('text-generation', model='gpt2-large')", "performance": {"dataset": {"LAMBADA": {"PPL": 10.87}, "CBT-CN": {"ACC": 93.45}, "CBT-NE": {"ACC": 88.0}, "WikiText2": {"PPL": 19.93}, "PTB": {"PPL": 40.31}, "enwiki8": {"BPB": 0.97}, "text8": {"BPC": 1.02}, "WikiText103": {"PPL": 22.05}, "1BW": {"PPL": 44.575}}}, "description": "GPT-2 Large is the 774M parameter version of GPT-2, a transformer-based language model created and released by OpenAI. The model is a pretrained model on English language using a causal language modeling (CLM) objective.", "model_name": "gpt2-large"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Natural Language Inference", "api_name": "cross-encoder/nli-MiniLM2-L6-H768", "api_call": "AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-MiniLM2-L6-H768')", "performance": {"dataset": "SNLI and MultiNLI", "accuracy": "See SBERT.net - Pretrained Cross-Encoder for evaluation results"}, "description": "This model was trained using SentenceTransformers Cross-Encoder class on the SNLI and MultiNLI datasets. For a given sentence pair, it will output three scores corresponding to the labels: contradiction, entailment, neutral.", "model_name": "cross-encoder/nli-MiniLM2-L6-H768"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Transformers", "functionality": "Zero-Shot Classification", "api_name": "cross-encoder/nli-roberta-base", "api_call": "CrossEncoder('cross-encoder/nli-roberta-base')", "performance": {"dataset": ["SNLI", "MultiNLI"], "accuracy": "See SBERT.net - Pretrained Cross-Encoder"}, "description": "Cross-Encoder for Natural Language Inference trained on the SNLI and MultiNLI datasets. Outputs three scores corresponding to the labels: contradiction, entailment, neutral.", "model_name": "cross-encoder/nli-roberta-base"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Transcription", "api_name": "openai/whisper-tiny.en", "api_call": "WhisperForConditionalGeneration.from_pretrained('openai/whisper-tiny.en')", "performance": {"dataset": "LibriSpeech (clean)", "accuracy": 8.437}, "description": "Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without the need for fine-tuning.", "model_name": "openai/whisper-tiny.en"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/tapex-base-finetuned-wtq", "api_call": "BartForConditionalGeneration.from_pretrained('microsoft/tapex-base-finetuned-wtq')", "performance": {"dataset": "wikitablequestions", "accuracy": "Not provided"}, "description": "TAPEX (Table Pre-training via Execution) is a conceptually simple and empirically powerful pre-training approach to empower existing models with table reasoning skills. TAPEX realizes table pre-training by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically synthesizing executable SQL queries.", "model_name": "microsoft/tapex-base-finetuned-wtq"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Action Recognition", "api_name": "videomae-base-finetuned-ucf101", "api_call": "VideoMAEForVideoClassification.from_pretrained('nateraw/videomae-base-finetuned-ucf101')", "performance": {"dataset": "UCF101", "accuracy": 0.758209765}, "description": "VideoMAE Base model fine tuned on UCF101 for Video Action Recognition", "model_name": "nateraw/videomae-base-finetuned-ucf101"}
{"domain": "Tabular Tabular Regression", "framework": "Scikit-learn", "functionality": "baseline-trainer", "api_name": "merve/tips9y0jvt5q-tip-regression", "api_call": "pipeline('tabular-regression', model='merve/tips9y0jvt5q-tip-regression')", "performance": {"dataset": "tips9y0jvt5q", "accuracy": {"r2": 0.41524000000000005, "neg_mean_squared_error": -1.098792}}, "description": "Baseline Model trained on tips9y0jvt5q to apply regression on tip. The model uses Ridge(alpha=10) and is trained with dabl library as a baseline. For better results, use AutoTrain.", "model_name": "merve/tips9y0jvt5q-tip-regression"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Text Summarization", "api_name": "facebook/bart-large-cnn", "api_call": "pipeline('summarization', model='facebook/bart-large-cnn')", "performance": {"dataset": "cnn_dailymail", "accuracy": {"ROUGE-1": 42.949, "ROUGE-2": 20.815, "ROUGE-L": 30.619, "ROUGE-LSUM": 40.038}}, "description": "BART (large-sized model), fine-tuned on CNN Daily Mail. BART is a transformer encoder-encoder (seq2seq) model with a bidirectional (BERT-like) encoder and an autoregressive (GPT-like) decoder. BART is pre-trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text. BART is particularly effective when fine-tuned for text generation (e.g. summarization, translation) but also works well for comprehension tasks (e.g. text classification, question answering). This particular checkpoint has been fine-tuned on CNN Daily Mail, a large collection of text-summary pairs.", "model_name": "facebook/bart-large-cnn"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1", "api_call": "SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')", "performance": {"dataset": [{"name": "WikiAnswers", "accuracy": "77,427,422"}, {"name": "PAQ", "accuracy": "64,371,441"}, {"name": "Stack Exchange", "accuracy": "25,316,456"}, {"name": "MS MARCO", "accuracy": "17,579,773"}, {"name": "GOOAQ", "accuracy": "3,012,496"}, {"name": "Amazon-QA", "accuracy": "2,448,839"}, {"name": "Yahoo Answers", "accuracy": "1,198,260"}, {"name": "SearchQA", "accuracy": "582,261"}, {"name": "ELI5", "accuracy": "325,475"}, {"name": "Quora", "accuracy": "103,663"}, {"name": "Natural Questions (NQ)", "accuracy": "100,231"}, {"name": "SQuAD2.0", "accuracy": "87,599"}, {"name": "TriviaQA", "accuracy": "73,346"}]}, "description": "This is a sentence-transformers model that maps sentences & paragraphs to a 384-dimensional dense vector space and was designed for semantic search. It has been trained on 215M (question, answer) pairs from diverse sources.", "model_name": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"}
{"domain": "Audio Voice Activity Detection", "framework": "Hugging Face Transformers", "functionality": "Voice Activity Detection", "api_name": "julien-c/voice-activity-detection", "api_call": "Inference('julien-c/voice-activity-detection', device='cuda')", "performance": {"dataset": "dihard", "accuracy": "Not provided"}, "description": "Example pyannote-audio Voice Activity Detection model using PyanNet. Imported from https://github.com/pyannote/pyannote-audio-hub and trained by @hbredin.", "model_name": "julien-c/voice-activity-detection"}
{"domain": "Natural Language Processing Text Classification", "framework": "Transformers", "functionality": "Sentiment Analysis", "api_name": "cardiffnlp/twitter-roberta-base-sentiment-latest", "api_call": "pipeline(sentiment-analysis, model=AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest'), tokenizer=AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest'))", "performance": {"dataset": "tweet_eval", "accuracy": "Not provided"}, "description": "This is a RoBERTa-base model trained on ~124M tweets from January 2018 to December 2021, and finetuned for sentiment analysis with the TweetEval benchmark. The model is suitable for English.", "model_name": "cardiffnlp/twitter-roberta-base-sentiment-latest"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Entity Extraction", "api_name": "904029577", "api_call": "AutoModelForTokenClassification.from_pretrained('ismail-lucifer011/autotrain-name_all-904029577', use_auth_token=True)", "performance": {"dataset": "ismail-lucifer011/autotrain-data-name_all", "accuracy": 0.9989316041}, "description": "This model is trained using AutoTrain for entity extraction. It is based on the DistilBert architecture and has a CO2 Emissions of 0.8375653425894861 grams.", "model_name": "ismail-lucifer011/autotrain-name_all-904029577"}
{"domain": "Natural Language Processing Text Classification", "framework": "Transformers", "functionality": "Text Classification", "api_name": "distilbert-base-uncased-finetuned-sst-2-english", "api_call": "DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')", "performance": {"dataset": "glue", "accuracy": 0.911}, "description": "This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2. It reaches an accuracy of 91.3 on the dev set (for comparison, Bert bert-base-uncased version reaches an accuracy of 92.7). This model can be used for topic classification.", "model_name": "distilbert-base-uncased-finetuned-sst-2-english"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "flax-community/clip-rsicd-v2", "api_call": "CLIPModel.from_pretrained('flax-community/clip-rsicd-v2')", "performance": {"dataset": {"RSICD": {"original CLIP": {"k=1": 0.5720000000000001, "k=3": 0.745, "k=5": 0.837, "k=10": 0.9390000000000001}, "clip-rsicd-v2 (this model)": {"k=1": 0.883, "k=3": 0.968, "k=5": 0.982, "k=10": 0.998}}}}, "description": "This model is a fine-tuned CLIP by OpenAI. It is designed with an aim to improve zero-shot image classification, text-to-image and image-to-image retrieval specifically on remote sensing images.", "model_name": "flax-community/clip-rsicd-v2"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "blip2-flan-t5-xxl", "api_call": "Blip2ForConditionalGeneration.from_pretrained('Salesforce/blip2-flan-t5-xxl')", "performance": {"dataset": "LAION", "accuracy": "Not provided"}, "description": "BLIP-2 model, leveraging Flan T5-xxl (a large language model). It was introduced in the paper BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models by Li et al. and first released in this repository. The model is used for tasks like image captioning, visual question answering (VQA), and chat-like conversations by feeding the image and the previous conversation as prompt to the model.", "model_name": "Salesforce/blip2-flan-t5-xxl"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-base-finetuned-sqa", "api_call": "TapasTokenizer.from_pretrained('google/tapas-base-finetuned-sqa')", "performance": {"dataset": "msr_sqa", "accuracy": 0.6874}, "description": "TAPAS base model fine-tuned on Sequential Question Answering (SQA). It is a BERT-like transformers model pretrained on a large corpus of English data from Wikipedia and fine-tuned on SQA. It can be used for answering questions related to a table in a conversational set-up.", "model_name": "google/tapas-base-finetuned-sqa"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Image Segmentation", "api_name": "lllyasviel/sd-controlnet-seg", "api_call": "ControlNetModel.from_pretrained('lllyasviel/sd-controlnet-seg')", "performance": {"dataset": "ADE20K", "accuracy": "Trained on 164K segmentation-image, caption pairs"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on Image Segmentation. It can be used in combination with Stable Diffusion.", "model_name": "lllyasviel/sd-controlnet-seg"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "tinkoff-ai/ruDialoGPT-medium", "api_call": "AutoModelWithLMHead.from_pretrained('tinkoff-ai/ruDialoGPT-medium')", "performance": {"dataset": "Private Validation Set", "sensibleness": 0.78, "specificity": 0.6900000000000001, "SSA": 0.735}, "description": "This generation model is based on sberbank-ai/rugpt3medium_based_on_gpt2. It's trained on large corpus of dialog data and can be used for buildning generative conversational agents. The model was trained with context size 3.", "model_name": "tinkoff-ai/ruDialoGPT-medium"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Image Captioning", "api_name": "nlpconnect/vit-gpt2-image-captioning", "api_call": "VisionEncoderDecoderModel.from_pretrained('nlpconnect/vit-gpt2-image-captioning')", "performance": {"dataset": "Not provided", "accuracy": "Not provided"}, "description": "An image captioning model that uses transformers to generate captions for input images. The model is based on the Illustrated Image Captioning using transformers approach.", "model_name": "nlpconnect/vit-gpt2-image-captioning"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Masked Language Modeling", "api_name": "roberta-large", "api_call": "pipeline('fill-mask', model='roberta-large')", "performance": {"dataset": "GLUE", "accuracy": {"MNLI": 90.2, "QQP": 92.2, "QNLI": 94.7, "SST-2": 96.4, "CoLA": 68.0, "STS-B": 96.4, "MRPC": 90.9, "RTE": 86.6}}, "description": "RoBERTa is a transformers model pretrained on a large corpus of English data in a self-supervised fashion using the Masked language modeling (MLM) objective. It can be fine-tuned on a downstream task, such as sequence classification, token classification, or question answering.", "model_name": "roberta-large"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "valhalla/distilbart-mnli-12-1", "api_call": "pipeline('zero-shot-classification', model='valhalla/distilbart-mnli-12-1')", "performance": {"dataset": "MNLI", "matched_accuracy": 87.08, "mismatched_accuracy": 87.5}, "description": "distilbart-mnli is the distilled version of bart-large-mnli created using the No Teacher Distillation technique proposed for BART summarisation by Huggingface. It is designed for zero-shot classification tasks.", "model_name": "valhalla/distilbart-mnli-12-1"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Document Question Answering", "api_name": "tiennvcs/layoutlmv2-large-uncased-finetuned-vi-infovqa", "api_call": "pipeline('question-answering', model='tiennvcs/layoutlmv2-large-uncased-finetuned-vi-infovqa')", "performance": {"dataset": "unknown", "accuracy": {"Loss": 8.5806}}, "description": "This model is a fine-tuned version of microsoft/layoutlmv2-large-uncased on an unknown dataset.", "model_name": "tiennvcs/layoutlmv2-large-uncased-finetuned-vi-infovqa"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Image-to-Image", "api_name": "lllyasviel/control_v11p_sd15_canny", "api_call": "ControlNetModel.from_pretrained('lllyasviel/control_v11p_sd15_canny')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "Controlnet v1.1 is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on Canny edges. It can be used in combination with Stable Diffusion, such as runwayml/stable-diffusion-v1-5.", "model_name": "lllyasviel/control_v11p_sd15_canny"}
{"domain": "Multimodal Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "GanjinZero/UMLSBert_ENG", "api_call": "AutoModel.from_pretrained('GanjinZero/UMLSBert_ENG')", "performance": {"dataset": "", "accuracy": ""}, "description": "CODER: Knowledge infused cross-lingual medical term embedding for term normalization. English Version. Old name. This model is not UMLSBert! Github Link: https://github.com/GanjinZero/CODER", "model_name": "GanjinZero/UMLSBert_ENG"}
{"domain": "Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "mazkooleg/0-9up-hubert-base-ls960-ft", "api_call": "pipeline('audio-classification', model='mazkooleg/0-9up-hubert-base-ls960-ft')", "performance": {"dataset": "mazkooleg/0-9up_google_speech_commands_augmented_raw", "accuracy": 0.9973000000000001}, "description": "This model is a fine-tuned version of facebook/hubert-base-ls960 on the None dataset. It achieves an accuracy of 0.9973 on the evaluation set.", "model_name": "mazkooleg/0-9up-hubert-base-ls960-ft"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Image generation and modification based on text prompts", "api_name": "stabilityai/stable-diffusion-x4-upscaler", "api_call": "StableDiffusionUpscalePipeline.from_pretrained('stabilityai/stable-diffusion-x4-upscaler', torch_dtype=torch.float16)", "performance": {"dataset": "COCO2017 validation set", "accuracy": "Not optimized for FID scores"}, "description": "Stable Diffusion x4 upscaler is a latent diffusion model trained on a 10M subset of LAION containing images >2048x2048. It can be used to generate and modify images based on text prompts. The model receives a noise_level as an input parameter, which can be used to add noise to the low-resolution input according to a predefined diffusion schedule. The model is trained with English captions and might not work well with other languages.", "model_name": "stabilityai/stable-diffusion-x4-upscaler"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Visual Question Answering", "api_name": "blip-vqa-base", "api_call": "BlipForQuestionAnswering.from_pretrained('Salesforce/blip-vqa-base')", "performance": {"dataset": "VQA", "accuracy": "+1.6% in VQA score"}, "description": "BLIP is a Vision-Language Pre-training (VLP) framework that transfers flexibly to both vision-language understanding and generation tasks. It effectively utilizes noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. This model is trained on visual question answering with a base architecture (using ViT base backbone).", "model_name": "Salesforce/blip-vqa-base"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "lanwuwei/BERTOverflow_stackoverflow_github", "api_call": "AutoModelForTokenClassification.from_pretrained('lanwuwei/BERTOverflow_stackoverflow_github')", "performance": {"dataset": "StackOverflow's 10 year archive", "accuracy": "Not provided"}, "description": "BERT-base model pre-trained on 152 million sentences from the StackOverflow's 10 year archive. It can be used for code and named entity recognition in StackOverflow.", "model_name": "lanwuwei/BERTOverflow_stackoverflow_github"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/tapex-large-sql-execution", "api_call": "BartForConditionalGeneration.from_pretrained('microsoft/tapex-large-sql-execution')", "performance": {"dataset": "synthetic corpus", "accuracy": "not specified"}, "description": "TAPEX (Table Pre-training via Execution) is a conceptually simple and empirically powerful pre-training approach to empower existing models with table reasoning skills. TAPEX realizes table pre-training by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically synthesizing executable SQL queries. TAPEX is based on the BART architecture, the transformer encoder-encoder (seq2seq) model with a bidirectional (BERT-like) encoder and an autoregressive (GPT-like) decoder.", "model_name": "microsoft/tapex-large-sql-execution"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Image-to-Image", "api_name": "lllyasviel/sd-controlnet-canny", "api_call": "ControlNetModel.from_pretrained('lllyasviel/sd-controlnet-canny')", "performance": {"dataset": "3M edge-image, caption pairs", "accuracy": "600 GPU-hours with Nvidia A100 80G"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on Canny edges. It can be used in combination with Stable Diffusion.", "model_name": "lllyasviel/sd-controlnet-canny"}
{"domain": "Audio Voice Activity Detection", "framework": "Hugging Face", "functionality": "Voice Activity Detection", "api_name": "Eklavya/ZFF_VAD", "api_call": "pipeline('voice-activity-detection', model='Eklavya/ZFF_VAD')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "A Voice Activity Detection model by Eklavya, using the Hugging Face framework.", "model_name": "Eklavya/ZFF_VAD"}
{"domain": "Audio Voice Activity Detection", "framework": "Hugging Face", "functionality": "Voice Activity Detection", "api_name": "FSMN-VAD", "api_call": "pipeline('voice-activity-detection', model='funasr/FSMN-VAD')", "performance": {"dataset": "", "accuracy": ""}, "description": "FSMN-VAD model for Voice Activity Detection using Hugging Face Transformers library.", "model_name": "funasr/FSMN-VAD"}
{"domain": "Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Language Identification", "api_name": "sanchit-gandhi/whisper-medium-fleurs-lang-id", "api_call": "AutoModelForSpeechClassification.from_pretrained('sanchit-gandhi/whisper-medium-fleurs-lang-id')", "performance": {"dataset": "google/xtreme_s", "accuracy": 0.8805000000000001}, "description": "This model is a fine-tuned version of openai/whisper-medium on the FLEURS subset of the google/xtreme_s dataset. It is used for language identification in audio classification tasks.", "model_name": "sanchit-gandhi/whisper-medium-fleurs-lang-id"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Summarization", "api_name": "google/pegasus-xsum", "api_call": "pipeline('summarization', model='google/pegasus-xsum')", "performance": {"dataset": [{"name": "xsum", "accuracy": {"ROUGE-1": 46.862, "ROUGE-2": 24.453, "ROUGE-L": 39.055, "ROUGE-LSUM": 39.099}}, {"name": "cnn_dailymail", "accuracy": {"ROUGE-1": 22.206, "ROUGE-2": 7.67, "ROUGE-L": 15.405, "ROUGE-LSUM": 19.218}}, {"name": "samsum", "accuracy": {"ROUGE-1": 21.81, "ROUGE-2": 4.253, "ROUGE-L": 17.447, "ROUGE-LSUM": 18.891}}]}, "description": "PEGASUS is a pre-trained model for abstractive summarization, developed by Google. It is based on the Transformer architecture and trained on both C4 and HugeNews datasets. The model is designed to extract gap sentences and generate summaries by stochastically sampling important sentences.", "model_name": "google/pegasus-xsum"}
{"domain": "Natural Language Processing Token Classification", "framework": "Transformers", "functionality": "Named Entity Recognition", "api_name": "Davlan/bert-base-multilingual-cased-ner-hrl", "api_call": "AutoModelForTokenClassification.from_pretrained('Davlan/bert-base-multilingual-cased-ner-hrl')", "performance": {"dataset": {"Arabic": "ANERcorp", "German": "conll 2003", "English": "conll 2003", "Spanish": "conll 2002", "French": "Europeana Newspapers", "Italian": "Italian I-CAB", "Latvian": "Latvian NER", "Dutch": "conll 2002", "Portuguese": "Paramopama + Second Harem", "Chinese": "MSRA"}, "accuracy": "Not provided"}, "description": "bert-base-multilingual-cased-ner-hrl is a Named Entity Recognition model for 10 high resourced languages (Arabic, German, English, Spanish, French, Italian, Latvian, Dutch, Portuguese and Chinese) based on a fine-tuned mBERT base model. It has been trained to recognize three types of entities: location (LOC), organizations (ORG), and person (PER).", "model_name": "Davlan/bert-base-multilingual-cased-ner-hrl"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "madhurjindal/autonlp-Gibberish-Detector-492513457", "api_call": "AutoModelForSequenceClassification.from_pretrained('madhurjindal/autonlp-Gibberish-Detector-492513457')", "performance": {"dataset": "madhurjindal/autonlp-data-Gibberish-Detector", "accuracy": 0.9735624587}, "description": "A multi-class text classification model for detecting gibberish text. Trained using AutoNLP and DistilBERT.", "model_name": "madhurjindal/autonlp-Gibberish-Detector-492513457"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Text Summarization", "api_name": "Randeng-Pegasus-238M-Summary-Chinese", "api_call": "PegasusForConditionalGeneration.from_pretrained('IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese')", "performance": {"dataset": "LCSTS", "accuracy": {"rouge-1": 43.46, "rouge-2": 29.59, "rouge-L": 39.76}}, "description": "Randeng-Pegasus-238M-Summary-Chinese is a Chinese text summarization model based on Pegasus. It is fine-tuned on 7 Chinese text summarization datasets including education, new2016zh, nlpcc, shence, sohu, thucnews, and weibo. The model can be used to generate summaries for Chinese text inputs.", "model_name": "IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Monocular Depth Estimation", "api_name": "Intel/dpt-large", "api_call": "DPTForDepthEstimation.from_pretrained('Intel/dpt-large')", "performance": {"dataset": "MIX 6", "accuracy": "10.82"}, "description": "Dense Prediction Transformer (DPT) model trained on 1.4 million images for monocular depth estimation. Introduced in the paper Vision Transformers for Dense Prediction by Ranftl et al. (2021). DPT uses the Vision Transformer (ViT) as backbone and adds a neck + head on top for monocular depth estimation.", "model_name": "Intel/dpt-large"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "naver-clova-ix/donut-base", "api_call": "AutoModel.from_pretrained('naver-clova-ix/donut-base')", "performance": {"dataset": "arxiv:2111.15664", "accuracy": "Not provided"}, "description": "Donut consists of a vision encoder (Swin Transformer) and a text decoder (BART). Given an image, the encoder first encodes the image into a tensor of embeddings (of shape batch_size, seq_len, hidden_size), after which the decoder autoregressively generates text, conditioned on the encoding of the encoder.", "model_name": "naver-clova-ix/donut-base"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Table Extraction", "api_name": "keremberke/yolov8s-table-extraction", "api_call": "YOLO('keremberke/yolov8s-table-extraction')", "performance": {"dataset": "table-extraction", "accuracy": 0.984}, "description": "A YOLOv8 model for table extraction in documents, capable of detecting bordered and borderless tables. Trained on the table-extraction dataset, the model achieves a mAP@0.5 of 0.984 on the validation set.", "model_name": "keremberke/yolov8s-table-extraction"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "microsoft/resnet-18", "api_call": "ResNetForImageClassification.from_pretrained('microsoft/resnet-18')", "performance": {"dataset": "imagenet-1k"}, "description": "ResNet model trained on imagenet-1k. It was introduced in the paper Deep Residual Learning for Image Recognition and first released in this repository. ResNet introduced residual connections, they allow to train networks with an unseen number of layers (up to 1000). ResNet won the 2015 ILSVRC & COCO competition, one important milestone in deep computer vision.", "model_name": "microsoft/resnet-18"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "facebook/opt-350m", "api_call": "pipeline('text-generation', model='facebook/opt-350m')", "performance": {"dataset": "BookCorpus, CC-Stories, The Pile, Pushshift.io Reddit, CCNewsV2", "accuracy": "Roughly matches GPT-3 performance"}, "description": "OPT (Open Pre-trained Transformer) is a suite of decoder-only pre-trained transformers ranging from 125M to 175B parameters, developed by Meta AI. It is designed to enable reproducible and responsible research at scale and bring more voices to the table in studying the impact of large language models. The pretrained-only model can be used for prompting for evaluation of downstream tasks as well as text generation. It can also be fine-tuned on a downstream task using the CLM example.", "model_name": "facebook/opt-350m"}
{"domain": "Reinforcement Learning", "framework": "Stable-Baselines3", "functionality": "Acrobot-v1", "api_name": "sb3/dqn-Acrobot-v1", "api_call": "load_from_hub(repo_id='sb3/dqn-Acrobot-v1',filename='{MODEL FILENAME}.zip',)", "performance": {"dataset": "Acrobot-v1", "accuracy": "-72.10 +/- 6.44"}, "description": "This is a trained model of a DQN agent playing Acrobot-v1 using the stable-baselines3 library and the RL Zoo. The RL Zoo is a training framework for Stable Baselines3 reinforcement learning agents, with hyperparameter optimization and pre-trained agents included.", "model_name": "sb3/dqn-Acrobot-v1"}
{"domain": "Multimodal Text-to-Video", "framework": "Hugging Face", "functionality": "Text-to-Video", "api_name": "chavinlo/TempoFunk", "api_call": "pipeline('text-to-video', model='chavinlo/TempoFunk')", "performance": {"dataset": "", "accuracy": ""}, "description": "A Text-to-Video model using Hugging Face Transformers library. Model is capable of generating video content based on the input text.", "model_name": "chavinlo/TempoFunk"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Depth Estimation", "api_name": "glpn-nyu-finetuned-diode-221116-054332", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-221116-054332')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.6028, "Rmse": "nan"}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221116-054332"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "ruDialoGpt3-medium-finetuned-telegram", "api_call": "AutoModelForCausalLM.from_pretrained('ruDialoGpt3-medium-finetuned-telegram')", "performance": {"dataset": "Russian forums and Telegram chat", "accuracy": "Not available"}, "description": "DialoGPT trained on the Russian language and fine-tuned on my Telegram chat. This model was created by sberbank-ai and trained on Russian forums. It has been fine-tuned on a 30mb json file of exported Telegram chat data.", "model_name": "ruDialoGpt3-medium-finetuned-telegram"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "microsoft/swin-tiny-patch4-window7-224", "api_call": "SwinForImageClassification.from_pretrained('microsoft/swin-tiny-patch4-window7-224')", "performance": {"dataset": "imagenet-1k", "accuracy": "Not specified"}, "description": "Swin Transformer model trained on ImageNet-1k at resolution 224x224. It was introduced in the paper Swin Transformer: Hierarchical Vision Transformer using Shifted Windows by Liu et al. and first released in this repository. The Swin Transformer is a type of Vision Transformer. It builds hierarchical feature maps by merging image patches (shown in gray) in deeper layers and has linear computation complexity to input image size due to computation of self-attention only within each local window (shown in red). It can thus serve as a general-purpose backbone for both image classification and dense recognition tasks.", "model_name": "microsoft/swin-tiny-patch4-window7-224"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "financial-sentiment-analysis", "api_name": "ProsusAI/finbert", "api_call": "AutoModelForSequenceClassification.from_pretrained('ProsusAI/finbert')", "performance": {"dataset": "Financial PhraseBank", "accuracy": "Not provided"}, "description": "FinBERT is a pre-trained NLP model to analyze sentiment of financial text. It is built by further training the BERT language model in the finance domain, using a large financial corpus and thereby fine-tuning it for financial sentiment classification. Financial PhraseBank by Malo et al. (2014) is used for fine-tuning.", "model_name": "ProsusAI/finbert"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "deformable-detr", "api_call": "DeformableDetrForObjectDetection.from_pretrained('SenseTime/deformable-detr')", "performance": {"dataset": "COCO 2017", "accuracy": "Not provided"}, "description": "Deformable DETR model with ResNet-50 backbone trained end-to-end on COCO 2017 object detection (118k annotated images). It was introduced in the paper Deformable DETR: Deformable Transformers for End-to-End Object Detection by Zhu et al. and first released in this repository.", "model_name": "SenseTime/deformable-detr"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "ControlNet", "api_name": "lllyasviel/control_v11p_sd15_lineart", "api_call": "ControlNetModel.from_pretrained('lllyasviel/control_v11p_sd15_lineart')", "performance": {"dataset": "ControlNet-1-1-preview", "accuracy": "Not provided"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on lineart images.", "model_name": "lllyasviel/control_v11p_sd15_lineart"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Conversational", "api_name": "ingen51/DialoGPT-medium-GPT4", "api_call": "pipeline('conversational', model='ingen51/DialoGPT-medium-GPT4')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "A GPT-4 model for generating conversational responses in a dialogue setting.", "model_name": "ingen51/DialoGPT-medium-GPT4"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "swin-tiny-patch4-window7-224-bottom_cleaned_data", "api_call": "AutoModelForImageClassification.from_pretrained('microsoft/swin-tiny-patch4-window7-224-bottom_cleaned_data')", "performance": {"dataset": "imagefolder", "accuracy": 0.9726}, "description": "This model is a fine-tuned version of microsoft/swin-tiny-patch4-window7-224 on the imagefolder dataset.", "model_name": "microsoft/swin-tiny-patch4-window7-224-bottom_cleaned_data"}
{"domain": "Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "mazkooleg/0-9up-data2vec-audio-base-960h-ft", "api_call": "pipeline('audio-classification', model='mazkooleg/0-9up-data2vec-audio-base-960h-ft')", "performance": {"dataset": "None", "accuracy": 0.9967}, "description": "This model is a fine-tuned version of facebook/data2vec-audio-base-960h on the None dataset.", "model_name": "mazkooleg/0-9up-data2vec-audio-base-960h-ft"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "vision-encoder-decoder", "api_name": "naver-clova-ix/donut-base-finetuned-docvqa", "api_call": "pipeline('document-question-answering', model='donut-base-finetuned-docvqa')", "performance": {"dataset": "DocVQA", "accuracy": "Not provided"}, "description": "Donut model fine-tuned on DocVQA. It was introduced in the paper OCR-free Document Understanding Transformer by Geewook et al. and first released in this repository. Donut consists of a vision encoder (Swin Transformer) and a text decoder (BART). Given an image, the encoder first encodes the image into a tensor of embeddings (of shape batch_size, seq_len, hidden_size), after which the decoder autoregressively generates text, conditioned on the encoding of the encoder.", "model_name": "donut-base-finetuned-docvqa"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/LaBSE", "api_call": "SentenceTransformer('sentence-transformers/LaBSE')", "performance": {"dataset": "Sentence Embeddings Benchmark", "accuracy": "https://seb.sbert.net"}, "description": "This is a port of the LaBSE model to PyTorch. It can be used to map 109 languages to a shared vector space.", "model_name": "sentence-transformers/LaBSE"}
{"domain": "Natural Language Processing Summarization", "framework": "Transformers", "functionality": "Code Documentation Generation", "api_name": "code_trans_t5_base_code_documentation_generation_python", "api_call": "AutoModelWithLMHead.from_pretrained('SEBIS/code_trans_t5_base_code_documentation_generation_python')", "performance": {"dataset": "CodeSearchNet Corpus python dataset", "accuracy": "20.26 BLEU score"}, "description": "This CodeTrans model is based on the t5-base model and is trained on tokenized python code functions. It can be used to generate descriptions for python functions or be fine-tuned on other python code tasks. The model works best with tokenized python functions but can also be used on unparsed and untokenized python code.", "model_name": "SEBIS/code_trans_t5_base_code_documentation_generation_python"}
{"domain": "Audio Text-to-Speech", "framework": "ESPnet", "functionality": "Text-to-Speech", "api_name": "lakahaga/novel_reading_tts", "api_call": "AutoModelForTTS.from_pretrained('lakahaga/novel_reading_tts')", "performance": {"dataset": "novelspeech", "accuracy": null}, "description": "This model was trained by lakahaga using novelspeech recipe in espnet. It is designed for Korean text-to-speech tasks.", "model_name": "lakahaga/novel_reading_tts"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Hugging Face Transformers", "functionality": "Table Question Answering", "api_name": "lysandre/tapas-temporary-repo", "api_call": "TapasForQuestionAnswering.from_pretrained('lysandre/tapas-temporary-repo')", "performance": {"dataset": "SQA", "accuracy": "Not provided"}, "description": "TAPAS base model fine-tuned on Sequential Question Answering (SQA). This model is pretrained on a large corpus of English data from Wikipedia in a self-supervised fashion and can be used for answering questions related to a table in a conversational set-up.", "model_name": "lysandre/tapas-temporary-repo"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-small-finetuned-wikisql-supervised", "api_call": "TapasForQuestionAnswering.from_pretrained('google/tapas-small-finetuned-wikisql-supervised')", "performance": {"dataset": "wikisql", "accuracy": "Not specified"}, "description": "TAPAS is a BERT-like transformers model pretrained on a large corpus of English data from Wikipedia in a self-supervised fashion. This model is fine-tuned on WikiSQL and can be used for answering questions related to a table.", "model_name": "google/tapas-small-finetuned-wikisql-supervised"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Depth Estimation", "api_name": "glpn-nyu-finetuned-diode-221116-062619", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-221116-062619')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.548, "Rmse": "nan"}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221116-062619"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Embeddings", "api_name": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "api_call": "SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Automated evaluation"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Hugging Face Transformers", "functionality": "Fill-Mask", "api_name": "microsoft/deberta-base", "api_call": "DebertaModel.from_pretrained('microsoft/deberta-base')", "performance": {"dataset": {"SQuAD 1.1": "93.1/87.2", "SQuAD 2.0": "86.2/83.1", "MNLI-m": "88.8"}}, "description": "DeBERTa improves the BERT and RoBERTa models using disentangled attention and enhanced mask decoder. It outperforms BERT and RoBERTa on majority of NLU tasks with 80GB training data.", "model_name": "microsoft/deberta-base"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "git-large-textcaps", "api_call": "AutoModelForCausalLM.from_pretrained('microsoft/git-large-textcaps')", "performance": {"dataset": "TextCaps", "accuracy": "Refer to the paper"}, "description": "GIT (short for GenerativeImage2Text) model, large-sized version, fine-tuned on TextCaps. It was introduced in the paper GIT: A Generative Image-to-text Transformer for Vision and Language by Wang et al. and first released in this repository. The model is trained using 'teacher forcing' on a lot of (image, text) pairs. The goal for the model is simply to predict the next text token, giving the image tokens and previous text tokens. This allows the model to be used for tasks like image and video captioning, visual question answering (VQA) on images and videos, and even image classification (by simply conditioning the model on the image and asking it to generate a class for it in text).", "model_name": "microsoft/git-large-textcaps"}
{"domain": "Audio Text-to-Speech", "framework": "Fairseq", "functionality": "Text-to-Speech", "api_name": "fastspeech2-en-ljspeech", "api_call": "TTSHubInterface.get_prediction('facebook/fastspeech2-en-ljspeech')", "performance": {"dataset": "LJSpeech", "accuracy": "N/A"}, "description": "FastSpeech 2 text-to-speech model from fairseq S^2. English single-speaker female voice trained on LJSpeech.", "model_name": "facebook/fastspeech2-en-ljspeech"}
{"domain": "Natural Language Processing Text Classification", "framework": "Transformers", "functionality": "Sentiment Analysis", "api_name": "finiteautomata/beto-sentiment-analysis", "api_call": "pipeline('sentiment-analysis', model='finiteautomata/beto-sentiment-analysis')", "performance": {"dataset": "TASS 2020 corpus", "accuracy": ""}, "description": "Model trained with TASS 2020 corpus (around 5k tweets) of several dialects of Spanish. Base model is BETO, a BERT model trained in Spanish. Uses POS, NEG, NEU labels.", "model_name": "finiteautomata/beto-sentiment-analysis"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Audio Classification", "api_name": "ast-finetuned-audioset-10-10-0.4593", "api_call": "pipeline('audio-classification', model='MIT/ast-finetuned-audioset-10-10-0.4593')", "performance": {"dataset": "AudioSet", "accuracy": ""}, "description": "Audio Spectrogram Transformer (AST) model fine-tuned on AudioSet. It was introduced in the paper AST: Audio Spectrogram Transformer by Gong et al. and first released in this repository. The Audio Spectrogram Transformer is equivalent to ViT, but applied on audio. Audio is first turned into an image (as a spectrogram), after which a Vision Transformer is applied. The model gets state-of-the-art results on several audio classification benchmarks.", "model_name": "MIT/ast-finetuned-audioset-10-10-0.4593"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Grammar Synthesis", "api_name": "pszemraj/flan-t5-large-grammar-synthesis", "api_call": "pipeline('text2text-generation', 'pszemraj/flan-t5-large-grammar-synthesis')", "performance": {"dataset": "jfleg", "accuracy": "Not provided"}, "description": "A fine-tuned version of google/flan-t5-large for grammar correction on an expanded version of the JFLEG dataset.", "model_name": "pszemraj/flan-t5-large-grammar-synthesis"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "prithivida/parrot_adequacy_model", "api_call": "pipeline('text-classification', model='prithivida/parrot_adequacy_model')", "performance": {"dataset": "", "accuracy": ""}, "description": "Parrot is a paraphrase-based utterance augmentation framework purpose-built to accelerate training NLU models. This model is an ancillary model for Parrot paraphraser.", "model_name": "prithivida/parrot_adequacy_model"}
{"domain": "Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Language Identification", "api_name": "lang-id-voxlingua107-ecapa", "api_call": "EncoderClassifier.from_hparams(source='speechbrain/lang-id-voxlingua107-ecapa', savedir='/tmp')", "performance": {"dataset": "VoxLingua107 development dataset", "accuracy": "93.3%"}, "description": "This is a spoken language recognition model trained on the VoxLingua107 dataset using SpeechBrain. The model uses the ECAPA-TDNN architecture that has previously been used for speaker recognition. It covers 107 different languages.", "model_name": "speechbrain/lang-id-voxlingua107-ecapa"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "Recognai/bert-base-spanish-wwm-cased-xnli", "api_call": "AutoModelForSequenceClassification.from_pretrained('Recognai/bert-base-spanish-wwm-cased-xnli')", "performance": {"dataset": "XNLI-es", "accuracy": "79.9%"}, "description": "This model is a fine-tuned version of the spanish BERT model with the Spanish portion of the XNLI dataset. You can have a look at the training script for details of the training.", "model_name": "Recognai/bert-base-spanish-wwm-cased-xnli"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Text-to-Text Generation", "api_name": "philschmid/bart-large-cnn-samsum", "api_call": "pipeline('summarization', model='philschmid/bart-large-cnn-samsum')", "performance": {"dataset": "samsum", "accuracy": {"eval_rouge1": 42.621, "eval_rouge2": 21.9825, "eval_rougeL": 33.034, "eval_rougeLsum": 39.6783, "test_rouge1": 41.3174, "test_rouge2": 20.8716, "test_rougeL": 32.1337, "test_rougeLsum": 38.4149}}, "description": "philschmid/bart-large-cnn-samsum is a BART-based model trained for text summarization on the SAMSum dataset. It can be used to generate abstractive summaries of conversations.", "model_name": "philschmid/bart-large-cnn-samsum"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "keremberke/yolov8m-hard-hat-detection", "api_call": "YOLO('keremberke/yolov8m-hard-hat-detection')", "performance": {"dataset": "hard-hat-detection", "accuracy": 0.811}, "description": "A YOLOv8 model for detecting hard hats in images. The model can distinguish between 'Hardhat' and 'NO-Hardhat' classes. It can be used to ensure safety compliance in construction sites or other industrial environments where hard hats are required.", "model_name": "keremberke/yolov8m-hard-hat-detection"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "facebook/convnext-tiny-224", "api_call": "ConvNextForImageClassification.from_pretrained('facebook/convnext-tiny-224')", "performance": {"dataset": "imagenet-1k", "accuracy": "Not specified"}, "description": "ConvNeXT is a pure convolutional model (ConvNet), inspired by the design of Vision Transformers, that claims to outperform them. It is trained on ImageNet-1k at resolution 224x224 and can be used for image classification.", "model_name": "facebook/convnext-tiny-224"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "git-large-textvqa", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('microsoft/git-large-textvqa')", "performance": {"dataset": "TextVQA", "accuracy": "See table 11 in the paper for more details."}, "description": "GIT (short for GenerativeImage2Text) model, large-sized version, fine-tuned on TextVQA. It was introduced in the paper GIT: A Generative Image-to-text Transformer for Vision and Language by Wang et al. and first released in this repository. The model is trained using 'teacher forcing' on a lot of (image, text) pairs. The goal for the model is simply to predict the next text token, giving the image tokens and previous text tokens. This allows the model to be used for tasks like: image and video captioning, visual question answering (VQA) on images and videos, and even image classification (by simply conditioning the model on the image and asking it to generate a class for it in text).", "model_name": "microsoft/git-large-textvqa"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "MCG-NJU/videomae-base-finetuned-ssv2", "api_call": "VideoMAEForVideoClassification.from_pretrained('MCG-NJU/videomae-base-finetuned-ssv2')", "performance": {"dataset": "Something-Something-v2", "accuracy": {"top-1": 70.6, "top-5": 92.6}}, "description": "VideoMAE model pre-trained for 2400 epochs in a self-supervised way and fine-tuned in a supervised way on Something-Something-v2. It was introduced in the paper VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training by Tong et al. and first released in this repository.", "model_name": "MCG-NJU/videomae-base-finetuned-ssv2"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "opus-mt-en-ROMANCE", "api_call": "pipeline('translation_en_to_ROMANCE', model='Helsinki-NLP/opus-mt-en-ROMANCE')", "performance": {"dataset": "opus", "accuracy": {"BLEU": 50.1, "chr-F": 0.6930000000000001}}, "description": "A translation model trained on the OPUS dataset that supports translation between English and various Romance languages. It uses a transformer architecture and requires a sentence initial language token in the form of >>id<< (id = valid target language ID).", "model_name": "Helsinki-NLP/opus-mt-en-ROMANCE"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Diffusion-based text-to-image generation", "api_name": "lllyasviel/control_v11p_sd15_seg", "api_call": "ControlNetModel.from_pretrained('lllyasviel/control_v11p_sd15_seg')", "performance": {"dataset": "COCO", "accuracy": "Not specified"}, "description": "ControlNet v1.1 is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on seg images. It can be used in combination with Stable Diffusion, such as runwayml/stable-diffusion-v1-5.", "model_name": "lllyasviel/control_v11p_sd15_seg"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "michellejieli/emotion_text_classifier", "api_call": "pipeline('sentiment-analysis', model='michellejieli/emotion_text_classifier')", "performance": {"dataset": ["Crowdflower (2016)", "Emotion Dataset, Elvis et al. (2018)", "GoEmotions, Demszky et al. (2020)", "ISEAR, Vikash (2018)", "MELD, Poria et al. (2019)", "SemEval-2018, EI-reg, Mohammad et al. (2018)", "Emotion Lines (Friends)"], "accuracy": "Not provided"}, "description": "DistilRoBERTa-base is a transformer model that performs sentiment analysis. I fine-tuned the model on transcripts from the Friends show with the goal of classifying emotions from text data, specifically dialogue from Netflix shows or movies. The model predicts 6 Ekman emotions and a neutral class. These emotions include anger, disgust, fear, joy, neutrality, sadness, and surprise.", "model_name": "michellejieli/emotion_text_classifier"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Depth Estimation", "api_name": "glpn-nyu-finetuned-diode", "api_call": "pipeline('depth-estimation', model='sayakpaul/glpn-nyu-finetuned-diode')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.4359, "Rmse": 0.42760000000000004}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode"}
{"domain": "Multimodal Graph Machine Learning", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "graphormer-base-pcqm4mv1", "api_call": "AutoModel.from_pretrained('graphormer-base-pcqm4mv1')", "performance": {"dataset": "PCQM4M-LSC", "accuracy": "1st place on the KDD CUP 2021 (quantum prediction track)"}, "description": "The Graphormer is a graph Transformer model, pretrained on PCQM4M-LSC, and which got 1st place on the KDD CUP 2021 (quantum prediction track). Developed by Microsoft, this model should be used for graph classification tasks or graph representation tasks; the most likely associated task is molecule modeling. It can either be used as such, or finetuned on downstream tasks.", "model_name": "graphormer-base-pcqm4mv1"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Summarization", "api_name": "Einmalumdiewelt/T5-Base_GNAD", "api_call": "pipeline('summarization', model='Einmalumdiewelt/T5-Base_GNAD')", "performance": {"dataset": "unknown", "accuracy": {"Loss": 2.1025, "Rouge1": 27.5357, "Rouge2": 8.5623, "Rougel": 19.1508, "Rougelsum": 23.9029, "Gen Len": 52.7253}}, "description": "This model is a fine-tuned version of Einmalumdiewelt/T5-Base_GNAD on an unknown dataset. It is intended for German text summarization.", "model_name": "Einmalumdiewelt/T5-Base_GNAD"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "lmazzon70/videomae-base-finetuned-kinetics-finetuned-rwf2000-epochs8-batch8-kb", "api_call": "AutoModelForVideoClassification.from_pretrained('lmazzon70/videomae-base-finetuned-kinetics-finetuned-rwf2000-epochs8-batch8-kb')", "performance": {"dataset": "unknown", "accuracy": 0.7298}, "description": "This model is a fine-tuned version of MCG-NJU/videomae-base-finetuned-kinetics on an unknown dataset. It achieves the following results on the evaluation set: Loss: 0.5482, Accuracy: 0.7298.", "model_name": "lmazzon70/videomae-base-finetuned-kinetics-finetuned-rwf2000-epochs8-batch8-kb"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face", "functionality": "Translation", "api_name": "Helsinki-NLP/opus-mt-en-fr", "api_call": "translate('input_text', model='Helsinki-NLP/opus-mt-en-fr')", "performance": {"dataset": "opus", "accuracy": {"BLEU": {"newsdiscussdev2015-enfr.en.fr": 33.8, "newsdiscusstest2015-enfr.en.fr": 40.0, "newssyscomb2009.en.fr": 29.8, "news-test2008.en.fr": 27.5, "newstest2009.en.fr": 29.4, "newstest2010.en.fr": 32.7, "newstest2011.en.fr": 34.3, "newstest2012.en.fr": 31.8, "newstest2013.en.fr": 33.2, "Tatoeba.en.fr": 50.5}}}, "description": "Helsinki-NLP/opus-mt-en-fr is a translation model that translates English text to French using the Hugging Face Transformers library. It is based on the OPUS dataset and uses a transformer-align architecture with normalization and SentencePiece pre-processing.", "model_name": "Helsinki-NLP/opus-mt-en-fr"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Speech Recognition", "api_name": "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese", "api_call": "SpeechRecognitionModel('jonatasgrosman/wav2vec2-large-xlsr-53-portuguese')", "performance": {"dataset": "mozilla-foundation/common_voice_6_0", "accuracy": {"Test WER": 11.31, "Test CER": 3.74, "Test WER (+LM)": 9.01, "Test CER (+LM)": 3.21}}, "description": "Fine-tuned facebook/wav2vec2-large-xlsr-53 on Portuguese using the train and validation splits of Common Voice 6.1. When using this model, make sure that your speech input is sampled at 16kHz.", "model_name": "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "keremberke/yolov8s-pcb-defect-segmentation", "api_call": "YOLO('keremberke/yolov8s-pcb-defect-segmentation')", "performance": {"dataset": "pcb-defect-segmentation", "accuracy": {"mAP@0.5(box)": 0.515, "mAP@0.5(mask)": 0.491}}, "description": "YOLOv8s model for PCB defect segmentation. The model is trained to detect and segment PCB defects such as Dry_joint, Incorrect_installation, PCB_damage, and Short_circuit.", "model_name": "keremberke/yolov8s-pcb-defect-segmentation"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Detect Bordered and Borderless tables in documents", "api_name": "TahaDouaji/detr-doc-table-detection", "api_call": "DetrForObjectDetection.from_pretrained('TahaDouaji/detr-doc-table-detection')", "performance": {"dataset": "ICDAR2019 Table Dataset", "accuracy": "Not provided"}, "description": "detr-doc-table-detection is a model trained to detect both Bordered and Borderless tables in documents, based on facebook/detr-resnet-50.", "model_name": "TahaDouaji/detr-doc-table-detection"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/blenderbot-90M", "api_call": "AutoModelForCausalLM.from_pretrained('facebook/blenderbot-90M')", "performance": {"dataset": "blended_skill_talk", "accuracy": "Not provided"}, "description": "BlenderBot-90M is a conversational AI model developed by Facebook AI. It is trained on the Blended Skill Talk dataset and aims to provide engaging and human-like responses in a multi-turn dialogue setting. The model is deprecated, and it is recommended to use the identical model https://huggingface.co/facebook/blenderbot_small-90M instead.", "model_name": "facebook/blenderbot-90M"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Program Synthesis", "api_name": "Salesforce/codegen-2B-multi", "api_call": "AutoModelForCausalLM.from_pretrained('Salesforce/codegen-2B-multi')", "performance": {"dataset": "HumanEval, MTPB"}, "description": "CodeGen is a family of autoregressive language models for program synthesis. The models are originally released in this repository, under 3 pre-training data variants (NL, Multi, Mono) and 4 model size variants (350M, 2B, 6B, 16B). The checkpoint included in this repository is denoted as CodeGen-Multi 2B, where Multi means the model is initialized with CodeGen-NL 2B and further pre-trained on a dataset of multiple programming languages, and 2B refers to the number of trainable parameters.", "model_name": "Salesforce/codegen-2B-multi"}
{"domain": "Reinforcement Learning", "framework": "Unity ML-Agents Library", "functionality": "Train and play SoccerTwos", "api_name": "poca-SoccerTwosv2", "api_call": "mlagents-load-from-hf --repo-id='Raiden-1001/poca-SoccerTwosv2' --local-dir='./downloads'", "performance": {"dataset": "SoccerTwos", "accuracy": "Not provided"}, "description": "A trained model of a poca agent playing SoccerTwos using the Unity ML-Agents Library.", "model_name": "Raiden-1001/poca-SoccerTwosv2"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "LayoutLMX_pt_question_answer_ocrazure_correct_V15_30_03_2023", "api_call": "AutoModelForDocumentQuestionAnswering.from_pretrained('L-oenai/LayoutLMX_pt_question_answer_ocrazure_correct_V15_30_03_2023')", "performance": {"dataset": {}, "accuracy": {}}, "description": "A document question answering model based on LayoutLMv2, which can be used to extract answers from images with text and layout information.", "model_name": "L-oenai/LayoutLMX_pt_question_answer_ocrazure_correct_V15_30_03_2023"}
{"domain": "Audio Text-to-Speech", "framework": "ESPnet", "functionality": "Text-to-Speech", "api_name": "kan-bayashi_ljspeech_joint_finetune_conformer_fastspeech2_hifigan", "api_call": "Text2Speech.from_pretrained('espnet/kan-bayashi_ljspeech_joint_finetune_conformer_fastspeech2_hifigan')", "performance": {"dataset": "LJSpeech", "accuracy": ""}, "description": "A pretrained Text-to-Speech model based on the ESPnet framework, fine-tuned on the LJSpeech dataset. This model is capable of converting text input into synthesized speech.", "model_name": "espnet/kan-bayashi_ljspeech_joint_finetune_conformer_fastspeech2_hifigan"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Denoising Diffusion Probabilistic Models (DDPM)", "api_name": "google/ddpm-ema-cat-256", "api_call": "DDPMPipeline.from_pretrained('google/ddpm-ema-cat-256')", "performance": {"dataset": "CIFAR10", "accuracy": {"Inception_score": 9.46, "FID_score": 3.17}}, "description": "Denoising Diffusion Probabilistic Models (DDPM) is a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. It can generate high-quality images, and supports different noise schedulers such as scheduling_ddpm, scheduling_ddim, and scheduling_pndm. On the unconditional CIFAR10 dataset, it achieves an Inception score of 9.46 and a state-of-the-art FID score of 3.17.", "model_name": "google/ddpm-ema-cat-256"}
{"domain": "Audio Automatic Speech Recognition", "framework": "PyTorch Transformers", "functionality": "Automatic Speech Recognition", "api_name": "data2vec-audio-base-960h", "api_call": "Data2VecForCTC.from_pretrained('facebook/data2vec-audio-base-960h')", "performance": {"dataset": "librispeech_asr", "accuracy": {"clean": 2.77, "other": 7.08}}, "description": "Facebook's Data2Vec-Audio-Base-960h model is an Automatic Speech Recognition model pretrained and fine-tuned on 960 hours of Librispeech on 16kHz sampled speech audio. It can be used for transcribing audio files and achieves competitive performance on major benchmarks of speech recognition. The model is based on the Data2Vec framework which uses the same learning method for either speech, NLP, or computer vision.", "model_name": "facebook/data2vec-audio-base-960h"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "keremberke/yolov8n-csgo-player-detection", "api_call": "YOLO('keremberke/yolov8n-csgo-player-detection')", "performance": {"dataset": "csgo-object-detection", "accuracy": 0.844}, "description": "A YOLOv8 model for detecting Counter-Strike: Global Offensive (CS:GO) players with supported labels: ['ct', 'cthead', 't', 'thead'].", "model_name": "keremberke/yolov8n-csgo-player-detection"}
{"domain": "Natural Language Processing Token Classification", "framework": "Transformers", "functionality": "Named Entity Recognition", "api_name": "dslim/bert-base-NER", "api_call": "AutoModelForTokenClassification.from_pretrained('dslim/bert-base-NER')", "performance": {"dataset": "conll2003", "accuracy": {"f1": 91.3, "precision": 90.7, "recall": 91.9}}, "description": "bert-base-NER is a fine-tuned BERT model that is ready to use for Named Entity Recognition and achieves state-of-the-art performance for the NER task. It has been trained to recognize four types of entities: location (LOC), organizations (ORG), person (PER) and Miscellaneous (MISC). Specifically, this model is a bert-base-cased model that was fine-tuned on the English version of the standard CoNLL-2003 Named Entity Recognition dataset.", "model_name": "dslim/bert-base-NER"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "keremberke/yolov8n-pcb-defect-segmentation", "api_call": "YOLO('keremberke/yolov8n-pcb-defect-segmentation')", "performance": {"dataset": "pcb-defect-segmentation", "accuracy": {"mAP@0.5(box)": 0.512, "mAP@0.5(mask)": 0.517}}, "description": "A YOLOv8 model for detecting and segmenting PCB defects such as Dry_joint, Incorrect_installation, PCB_damage, and Short_circuit.", "model_name": "keremberke/yolov8n-pcb-defect-segmentation"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "google/ncsnpp-celebahq-256", "api_call": "DiffusionPipeline.from_pretrained('google/ncsnpp-celebahq-256')", "performance": {"dataset": "CIFAR-10", "accuracy": {"Inception_score": 9.89, "FID": 2.2, "likelihood": 2.99}}, "description": "Score-Based Generative Modeling through Stochastic Differential Equations (SDE) for unconditional image generation. This model achieves record-breaking performance on CIFAR-10 and demonstrates high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model.", "model_name": "google/ncsnpp-celebahq-256"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "typeform/mobilebert-uncased-mnli", "api_call": "AutoModelForSequenceClassification.from_pretrained('typeform/mobilebert-uncased-mnli')", "performance": {"dataset": "multi_nli", "accuracy": "More information needed"}, "description": "This model is the Multi-Genre Natural Language Inference (MNLI) fine-tuned version of the uncased MobileBERT model. It can be used for the task of zero-shot classification.", "model_name": "typeform/mobilebert-uncased-mnli"}
{"domain": "Natural Language Processing Text Classification", "framework": "Transformers", "functionality": "Emotion Classification", "api_name": "j-hartmann/emotion-english-distilroberta-base", "api_call": "pipeline('text-classification', model='j-hartmann/emotion-english-distilroberta-base', return_all_scores=True)", "performance": {"dataset": "Balanced subset from 6 diverse datasets", "accuracy": "66%"}, "description": "This model classifies emotions in English text data. It predicts Ekman's 6 basic emotions, plus a neutral class: anger, disgust, fear, joy, neutral, sadness, and surprise. The model is a fine-tuned checkpoint of DistilRoBERTa-base.", "model_name": "j-hartmann/emotion-english-distilroberta-base"}
{"domain": "Tabular Tabular Classification", "framework": "Hugging Face", "functionality": "Binary Classification", "api_name": "harithapliyal/autotrain-tatanic-survival-51030121311", "api_call": "AutoModel.from_pretrained('harithapliyal/autotrain-tatanic-survival-51030121311')", "performance": {"dataset": "harithapliyal/autotrain-data-tatanic-survival", "accuracy": 0.872}, "description": "A tabular classification model trained on the Titanic survival dataset using Hugging Face AutoTrain. The model predicts whether a passenger survived or not based on features such as age, gender, and passenger class.", "model_name": "harithapliyal/autotrain-tatanic-survival-51030121311"}
{"domain": "Audio Audio Classification", "framework": "SpeechBrain", "functionality": "Emotion Recognition", "api_name": "speechbrain/emotion-recognition-wav2vec2-IEMOCAP", "api_call": "foreign_class(source='speechbrain/emotion-recognition-wav2vec2-IEMOCAP', pymodule_file='custom_interface.py', classname='CustomEncoderWav2vec2Classifier')", "performance": {"dataset": "IEMOCAP", "accuracy": "78.7%"}, "description": "This repository provides all the necessary tools to perform emotion recognition with a fine-tuned wav2vec2 (base) model using SpeechBrain. It is trained on IEMOCAP training data.", "model_name": "speechbrain/emotion-recognition-wav2vec2-IEMOCAP"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Sentiment Analysis", "api_name": "finiteautomata/bertweet-base-sentiment-analysis", "api_call": "pipeline('text-classification', model='finiteautomata/bertweet-base-sentiment-analysis')", "performance": {"dataset": "SemEval 2017", "accuracy": null}, "description": "Model trained with SemEval 2017 corpus (around ~40k tweets). Base model is BERTweet, a RoBERTa model trained on English tweets. Uses POS, NEG, NEU labels.", "model_name": "finiteautomata/bertweet-base-sentiment-analysis"}
{"domain": "Natural Language Processing Text Generation", "framework": "Transformers", "functionality": "Text Generation", "api_name": "EleutherAI/gpt-j-6B", "api_call": "AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-j-6B')", "performance": {"dataset": "the_pile", "accuracy": {"LAMBADA_PPL": 3.99, "LAMBADA_Acc": "69.7%", "Winogrande": "65.3%", "Hellaswag": "66.1%", "PIQA": "76.5%"}}, "description": "GPT-J 6B is a transformer model trained using Ben Wang's Mesh Transformer JAX. It consists of 28 layers with a model dimension of 4096, and a feedforward dimension of 16384. The model dimension is split into 16 heads, each with a dimension of 256. Rotary Position Embedding (RoPE) is applied to 64 dimensions of each head. The model is trained with a tokenization vocabulary of 50257, using the same set of BPEs as GPT-2/GPT-3. GPT-J 6B was trained on the Pile, a large-scale curated dataset created by EleutherAI.", "model_name": "EleutherAI/gpt-j-6B"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Embeddings", "api_name": "sentence-transformers/paraphrase-MiniLM-L3-v2", "api_call": "SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L3-v2')", "performance": {"dataset": "snli, multi_nli, ms_marco", "accuracy": "Not provided"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/paraphrase-MiniLM-L3-v2"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "keremberke/yolov8n-pothole-segmentation", "api_call": "YOLO('keremberke/yolov8n-pothole-segmentation')", "performance": {"dataset": "pothole-segmentation", "accuracy": {"mAP@0.5(box)": 0.995, "mAP@0.5(mask)": 0.995}}, "description": "A YOLOv8 model for pothole segmentation in images. The model is trained on the pothole-segmentation dataset and achieves high accuracy in detecting potholes.", "model_name": "keremberke/yolov8n-pothole-segmentation"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Depth Estimation", "api_name": "glpn-nyu-finetuned-diode-230103-091356", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-230103-091356')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.436, "Mae": 0.42510000000000003, "Rmse": 0.6169, "Abs Rel": 0.45, "Log Mae": 0.1721, "Log Rmse": 0.22690000000000002, "Delta1": 0.38280000000000003, "Delta2": 0.6326, "Delta3": 0.8051}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset. It is used for depth estimation in computer vision tasks.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-230103-091356"}
{"domain": "Reinforcement Learning", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "edbeeching/decision-transformer-gym-walker2d-expert", "api_call": "AutoModel.from_pretrained('edbeeching/decision-transformer-gym-walker2d-expert')", "performance": {"dataset": "Gym Walker2d environment", "accuracy": "Not provided"}, "description": "Decision Transformer model trained on expert trajectories sampled from the Gym Walker2d environment.", "model_name": "edbeeching/decision-transformer-gym-walker2d-expert"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "MCG-NJU/videomae-base-short", "api_call": "VideoMAEForPreTraining.from_pretrained('MCG-NJU/videomae-base-short')", "performance": {"dataset": "Kinetics-400", "accuracy": "Not provided"}, "description": "VideoMAE is an extension of Masked Autoencoders (MAE) to video. The architecture of the model is very similar to that of a standard Vision Transformer (ViT), with a decoder on top for predicting pixel values for masked patches. Videos are presented to the model as a sequence of fixed-size patches (resolution 16x16), which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds fixed sine/cosine position embeddings before feeding the sequence to the layers of the Transformer encoder. By pre-training the model, it learns an inner representation of videos that can then be used to extract features useful for downstream tasks.", "model_name": "MCG-NJU/videomae-base-short"}
{"domain": "Tabular Tabular Classification", "framework": "Scikit-learn", "functionality": "Wine Quality classification", "api_name": "julien-c/wine-quality", "api_call": "joblib.load(cached_download(hf_hub_url('julien-c/wine-quality', 'winequality-red.csv')))", "performance": {"dataset": "julien-c/wine-quality", "accuracy": 0.6616635397}, "description": "A Simple Example of Scikit-learn Pipeline for Wine Quality classification. Inspired by https://towardsdatascience.com/a-simple-example-of-pipeline-in-machine-learning-with-scikit-learn-e726ffbb6976 by Saptashwa Bhattacharyya.", "model_name": "julien-c/wine-quality"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Sentiment Analysis", "api_name": "bert-base-multilingual-uncased-sentiment", "api_call": "pipeline('sentiment-analysis', model='nlptown/bert-base-multilingual-uncased-sentiment')", "performance": {"dataset": [{"language": "English", "accuracy": {"exact": "67%", "off-by-1": "95%"}}, {"language": "Dutch", "accuracy": {"exact": "57%", "off-by-1": "93%"}}, {"language": "German", "accuracy": {"exact": "61%", "off-by-1": "94%"}}, {"language": "French", "accuracy": {"exact": "59%", "off-by-1": "94%"}}, {"language": "Italian", "accuracy": {"exact": "59%", "off-by-1": "95%"}}, {"language": "Spanish", "accuracy": {"exact": "58%", "off-by-1": "95%"}}]}, "description": "This is a bert-base-multilingual-uncased model finetuned for sentiment analysis on product reviews in six languages: English, Dutch, German, French, Spanish and Italian. It predicts the sentiment of the review as a number of stars (between 1 and 5).", "model_name": "nlptown/bert-base-multilingual-uncased-sentiment"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "facebook/convnext-large-224", "api_call": "ConvNextForImageClassification.from_pretrained('facebook/convnext-large-224')", "performance": {"dataset": "imagenet-1k", "accuracy": "Not specified"}, "description": "ConvNeXT is a pure convolutional model (ConvNet), inspired by the design of Vision Transformers, that claims to outperform them. The authors started from a ResNet and 'modernized' its design by taking the Swin Transformer as inspiration.", "model_name": "facebook/convnext-large-224"}
{"domain": "Audio Voice Activity Detection", "framework": "Hugging Face Transformers", "functionality": "Voice Activity Detection", "api_name": "popcornell/pyannote-segmentation-chime6-mixer6", "api_call": "Model.from_pretrained('popcornell/pyannote-segmentation-chime6-mixer6')", "performance": {"dataset": "ami", "accuracy": "N/A"}, "description": "Pyannote Segmentation model fine-tuned on data from CHiME-7 DASR Challenge. Used to perform diarization in the CHiME-7 DASR diarization baseline.", "model_name": "popcornell/pyannote-segmentation-chime6-mixer6"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/table-transformer-structure-recognition", "api_call": "pipeline('object-detection', model='microsoft/table-transformer-structure-recognition')", "performance": {"dataset": "PubTables1M", "accuracy": ""}, "description": "Table Transformer (DETR) model trained on PubTables1M for detecting the structure (like rows, columns) in tables.", "model_name": "microsoft/table-transformer-structure-recognition"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Part-of-Speech Tagging", "api_name": "flair/pos-english", "api_call": "SequenceTagger.load('flair/pos-english')", "performance": {"dataset": "Ontonotes", "accuracy": "98.19"}, "description": "This is the standard part-of-speech tagging model for English that ships with Flair. It predicts fine-grained POS tags based on Flair embeddings and LSTM-CRF.", "model_name": "flair/pos-english"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "bigscience/test-bloomd-6b3", "api_call": "pipeline('text-generation', model='bigscience/test-bloomd-6b3')", "performance": {"dataset": "Not specified", "accuracy": "Not specified"}, "description": "A text generation model from Hugging Face, using the bigscience/test-bloomd-6b3 architecture. It can be used for generating text based on a given input.", "model_name": "bigscience/test-bloomd-6b3"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "chinese-clip-vit-large-patch14", "api_call": "ChineseCLIPModel.from_pretrained('OFA-Sys/chinese-clip-vit-large-patch14')", "performance": {"dataset": "MUGE Text-to-Image Retrieval, Flickr30K-CN Retrieval, COCO-CN Retrieval, CIFAR10, CIFAR100, DTD, EuroSAT, FER, FGV, KITTI, MNIST, PASCAL VOC", "accuracy": "Varies depending on the dataset"}, "description": "Chinese-CLIP-ViT-Large-Patch14 is a large version of the Chinese CLIP model, with ViT-L/14 as the image encoder and RoBERTa-wwm-base as the text encoder. Chinese CLIP is a simple implementation of CLIP on a large-scale dataset of around 200 million Chinese image-text pairs. It is designed for zero-shot image classification tasks.", "model_name": "OFA-Sys/chinese-clip-vit-large-patch14"}
{"domain": "Natural Language Processing Text Classification", "framework": "Transformers", "functionality": "Detect GPT-2 generated text", "api_name": "roberta-base-openai-detector", "api_call": "pipeline('text-classification', model='roberta-base-openai-detector')", "performance": {"dataset": "WebText", "accuracy": "95%"}, "description": "RoBERTa base OpenAI Detector is the GPT-2 output detector model, obtained by fine-tuning a RoBERTa base model with the outputs of the 1.5B-parameter GPT-2 model. The model can be used to predict if text was generated by a GPT-2 model.", "model_name": "roberta-base-openai-detector"}
{"domain": "Natural Language Processing Text Generation", "framework": "Transformers", "functionality": "Text Generation", "api_name": "facebook/opt-6.7b", "api_call": "AutoModelForCausalLM.from_pretrained('facebook/opt-6.7b', torch_dtype=torch.float16)", "performance": {"dataset": {"BookCorpus": "unknown", "CC-Stories": "unknown", "The Pile": "unknown", "Pushshift.io Reddit": "unknown", "CCNewsV2": "unknown"}, "accuracy": "unknown"}, "description": "OPT (Open Pre-trained Transformer Language Models) is a suite of decoder-only pre-trained transformers ranging from 125M to 175B parameters. It was trained on a large corpus of text, predominantly in English, using a causal language modeling (CLM) objective. The model can be used for prompting for evaluation of downstream tasks, text generation, and fine-tuning on a downstream task using the CLM example.", "model_name": "facebook/opt-6.7b"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Hugging Face Transformers", "functionality": "Masked Language Modeling", "api_name": "roberta-base", "api_call": "pipeline('fill-mask', model='roberta-base')", "performance": {"dataset": [{"name": "MNLI", "accuracy": 87.6}, {"name": "QQP", "accuracy": 91.9}, {"name": "QNLI", "accuracy": 92.8}, {"name": "SST-2", "accuracy": 94.8}, {"name": "CoLA", "accuracy": 63.6}, {"name": "STS-B", "accuracy": 91.2}, {"name": "MRPC", "accuracy": 90.2}, {"name": "RTE", "accuracy": 78.7}]}, "description": "RoBERTa is a transformers model pretrained on a large corpus of English data in a self-supervised fashion using the Masked language modeling (MLM) objective. This model is case-sensitive and can be fine-tuned on a downstream task.", "model_name": "roberta-base"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "sentence-transformers/distilbert-base-nli-mean-tokens", "api_call": "SentenceTransformer('sentence-transformers/distilbert-base-nli-mean-tokens')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Not provided"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/distilbert-base-nli-mean-tokens"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "git-large-coco", "api_call": "GenerativeImage2TextModel.from_pretrained('microsoft/git-large-coco')", "performance": {"dataset": "COCO", "accuracy": "See table 11 in the paper for more details."}, "description": "GIT (short for GenerativeImage2Text) model, large-sized version, fine-tuned on COCO. It was introduced in the paper GIT: A Generative Image-to-text Transformer for Vision and Language by Wang et al. and first released in this repository.", "model_name": "microsoft/git-large-coco"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Diffusers", "api_name": "myunus1/diffmodels_galaxies_scratchbook", "api_call": "DDPMPipeline.from_pretrained('myunus1/diffmodels_galaxies_scratchbook')", "performance": {"dataset": "Not provided", "accuracy": "Not provided"}, "description": "This model is a diffusion model for unconditional image generation of cute \ud83e\udd8b.", "model_name": "myunus1/diffmodels_galaxies_scratchbook"}
{"domain": "Natural Language Processing Text Generation", "framework": "Transformers", "functionality": "Text Generation", "api_name": "distilgpt2", "api_call": "pipeline('text-generation', model='distilgpt2')", "performance": {"dataset": "WikiText-103", "accuracy": "21.100"}, "description": "DistilGPT2 is an English-language model pre-trained with the supervision of the 124 million parameter version of GPT-2. With 82 million parameters, it was developed using knowledge distillation and designed to be a faster, lighter version of GPT-2. It can be used for text generation, writing assistance, creative writing, entertainment, and more.", "model_name": "distilgpt2"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Fill-Mask", "api_name": "microsoft/deberta-v2-xlarge", "api_call": "DebertaModel.from_pretrained('microsoft/deberta-v2-xlarge')", "performance": {"dataset": [{"name": "SQuAD 1.1", "accuracy": "95.8/90.8"}, {"name": "SQuAD 2.0", "accuracy": "91.4/88.9"}, {"name": "MNLI-m/mm", "accuracy": "91.7/91.6"}, {"name": "SST-2", "accuracy": "97.5"}, {"name": "QNLI", "accuracy": "95.8"}, {"name": "CoLA", "accuracy": "71.1"}, {"name": "RTE", "accuracy": "93.9"}, {"name": "MRPC", "accuracy": "92.0/94.2"}, {"name": "QQP", "accuracy": "92.3/89.8"}, {"name": "STS-B", "accuracy": "92.9/92.9"}]}, "description": "DeBERTa improves the BERT and RoBERTa models using disentangled attention and enhanced mask decoder. It outperforms BERT and RoBERTa on majority of NLU tasks with 80GB training data. This is the DeBERTa V2 xlarge model with 24 layers, 1536 hidden size. The total parameters are 900M and it is trained with 160GB raw data.", "model_name": "microsoft/deberta-v2-xlarge"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "google/vit-base-patch16-384", "api_call": "ViTForImageClassification.from_pretrained('google/vit-base-patch16-384')", "performance": {"dataset": "ImageNet", "accuracy": "Refer to tables 2 and 5 of the original paper"}, "description": "Vision Transformer (ViT) model pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224, and fine-tuned on ImageNet 2012 (1 million images, 1,000 classes) at resolution 384x384. It was introduced in the paper An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale by Dosovitskiy et al. Images are presented to the model as a sequence of fixed-size patches (resolution 16x16), which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds absolute position embeddings before feeding the sequence to the layers of the Transformer encoder.", "model_name": "google/vit-base-patch16-384"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "hf-tiny-model-private/tiny-random-ViltForQuestionAnswering", "api_call": "ViltForQuestionAnswering.from_pretrained('hf-tiny-model-private/tiny-random-ViltForQuestionAnswering')", "performance": {"dataset": "", "accuracy": ""}, "description": "A tiny random model for Visual Question Answering using the VILT framework.", "model_name": "hf-tiny-model-private/tiny-random-ViltForQuestionAnswering"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Document Question Answering", "api_name": "layoutlmv2-base-uncased-finetuned-infovqa", "api_call": "AutoModelForDocumentQuestionAnswering.from_pretrained('tiennvcs/layoutlmv2-base-uncased-finetuned-infovqa')", "performance": {"dataset": "unknown", "accuracy": {"Loss": 2.087}}, "description": "This model is a fine-tuned version of microsoft/layoutlmv2-base-uncased on an unknown dataset.", "model_name": "tiennvcs/layoutlmv2-base-uncased-finetuned-infovqa"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "google/pix2struct-base", "api_call": "Pix2StructForConditionalGeneration.from_pretrained('google/pix2struct-base')", "performance": {"dataset": [{"name": "Documents", "accuracy": "N/A"}, {"name": "Illustrations", "accuracy": "N/A"}, {"name": "User Interfaces", "accuracy": "N/A"}, {"name": "Natural Images", "accuracy": "N/A"}]}, "description": "Pix2Struct is an image encoder - text decoder model that is trained on image-text pairs for various tasks, including image captioning and visual question answering. The model is pretrained by learning to parse masked screenshots of web pages into simplified HTML. It can achieve state-of-the-art results in six out of nine tasks across four domains: documents, illustrations, user interfaces, and natural images.", "model_name": "google/pix2struct-base"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "kredor/punctuate-all", "api_call": "pipeline('token-classification', model='kredor/punctuate-all')", "performance": {"dataset": "multilingual", "accuracy": 0.98}, "description": "A finetuned xlm-roberta-base model for punctuation prediction on twelve languages: English, German, French, Spanish, Bulgarian, Italian, Polish, Dutch, Czech, Portuguese, Slovak, Slovenian.", "model_name": "kredor/punctuate-all"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/multi-qa-mpnet-base-dot-v1", "api_call": "SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')", "performance": {"dataset": [{"name": "WikiAnswers", "accuracy": 77427422}, {"name": "PAQ", "accuracy": 64371441}, {"name": "Stack Exchange", "accuracy": 25316456}]}, "description": "This is a sentence-transformers model that maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M (question, answer) pairs from diverse sources.", "model_name": "sentence-transformers/multi-qa-mpnet-base-dot-v1"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "facebook/dino-vitb16", "api_call": "ViTModel.from_pretrained('facebook/dino-vitb16')", "performance": {"dataset": "imagenet-1k", "accuracy": "Not provided"}, "description": "Vision Transformer (ViT) model trained using the DINO method. The model is pretrained on a large collection of images in a self-supervised fashion, namely ImageNet-1k, at a resolution of 224x224 pixels. Images are presented to the model as a sequence of fixed-size patches (resolution 16x16), which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds absolute position embeddings before feeding the sequence to the layers of the Transformer encoder. Note that this model does not include any fine-tuned heads.", "model_name": "facebook/dino-vitb16"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Feature Extraction", "api_name": "facebook/dpr-question_encoder-single-nq-base", "api_call": "DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')", "performance": {"dataset": [{"name": "NQ", "accuracy": {"top_20": 78.4, "top_100": 85.4}}, {"name": "TriviaQA", "accuracy": {"top_20": 79.4, "top_100": 85.0}}, {"name": "WQ", "accuracy": {"top_20": 73.2, "top_100": 81.4}}, {"name": "TREC", "accuracy": {"top_20": 79.8, "top_100": 89.1}}, {"name": "SQuAD", "accuracy": {"top_20": 63.2, "top_100": 77.2}}]}, "description": "Dense Passage Retrieval (DPR) is a set of tools and models for state-of-the-art open-domain Q&A research. dpr-question_encoder-single-nq-base is the question encoder trained using the Natural Questions (NQ) dataset (Lee et al., 2019; Kwiatkowski et al., 2019).", "model_name": "facebook/dpr-question_encoder-single-nq-base"}
{"domain": "Natural Language Processing Text Classification", "framework": "Transformers", "functionality": "Language Detection", "api_name": "papluca/xlm-roberta-base-language-detection", "api_call": "pipeline('text-classification', model='papluca/xlm-roberta-base-language-detection')", "performance": {"dataset": "Language Identification", "accuracy": 0.996}, "description": "This model is a fine-tuned version of xlm-roberta-base on the Language Identification dataset. It is an XLM-RoBERTa transformer model with a classification head on top, and can be used as a language detector for sequence classification tasks. It supports 20 languages including Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Italian, Japanese, Dutch, Polish, Portuguese, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, and Chinese.", "model_name": "papluca/xlm-roberta-base-language-detection"}
{"domain": "Reinforcement Learning Robotics", "framework": "Hugging Face", "functionality": "Inference API", "api_name": "Antheia/Hanna", "api_call": "pipeline('robotics', model='Antheia/Hanna')", "performance": {"dataset": "openai/webgpt_comparisons", "accuracy": ""}, "description": "Antheia/Hanna is a reinforcement learning model for robotics tasks, trained on the openai/webgpt_comparisons dataset.", "model_name": "Antheia/Hanna"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "princeton-nlp/unsup-simcse-roberta-base", "api_call": "AutoModel.from_pretrained('princeton-nlp/unsup-simcse-roberta-base')", "performance": {"dataset": null, "accuracy": null}, "description": "An unsupervised sentence embedding model trained using the SimCSE approach with a Roberta base architecture.", "model_name": "princeton-nlp/unsup-simcse-roberta-base"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Embeddings", "api_name": "sentence-transformers/paraphrase-mpnet-base-v2", "api_call": "SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Automated evaluation"}, "description": "This is a sentence-transformers model that maps sentences and paragraphs to a 768-dimensional dense vector space. It can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/paraphrase-mpnet-base-v2"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Image generation and modification based on text prompts", "api_name": "stabilityai/stable-diffusion-2-inpainting", "api_call": "StableDiffusionInpaintPipeline.from_pretrained('stabilityai/stable-diffusion-2-inpainting', torch_dtype=torch.float16)", "performance": {"dataset": "COCO2017 validation set", "accuracy": "Not optimized for FID scores"}, "description": "A Latent Diffusion Model that uses a fixed, pretrained text encoder (OpenCLIP-ViT/H) to generate and modify images based on text prompts.", "model_name": "stabilityai/stable-diffusion-2-inpainting"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "opus-mt-sv-en", "api_call": "AutoModel.from_pretrained('Helsinki-NLP/opus-mt-sv-en')", "performance": {"dataset": "Tatoeba.sv.en", "accuracy": "BLEU: 64.5, chr-F: 0.763"}, "description": "A Swedish to English translation model trained on the OPUS dataset using the transformer-align architecture. The model is pre-processed with normalization and SentencePiece.", "model_name": "Helsinki-NLP/opus-mt-sv-en"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "ocariz/butterfly_200", "api_call": "DDPMPipeline.from_pretrained('ocariz/butterfly_200')", "performance": {"dataset": "", "accuracy": ""}, "description": "This model is a diffusion model for unconditional image generation of cute butterflies trained for 200 epochs.", "model_name": "ocariz/butterfly_200"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/multi-qa-mpnet-base-cos-v1", "api_call": "SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-cos-v1')", "performance": {"dataset": "215M (question, answer) pairs from diverse sources", "accuracy": "Not provided"}, "description": "This is a sentence-transformers model that maps sentences and paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M (question, answer) pairs from diverse sources.", "model_name": "sentence-transformers/multi-qa-mpnet-base-cos-v1"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Denoising Diffusion Probabilistic Models (DDPM)", "api_name": "google/ddpm-ema-bedroom-256", "api_call": "DDPMPipeline.from_pretrained('google/ddpm-ema-bedroom-256')", "performance": {"dataset": "CIFAR10", "accuracy": {"Inception_score": 9.46, "FID_score": 3.17}}, "description": "Denoising Diffusion Probabilistic Models (DDPM) is a class of latent variable models inspired by nonequilibrium thermodynamics, capable of producing high-quality image synthesis results. The model can use discrete noise schedulers such as scheduling_ddpm, scheduling_ddim, and scheduling_pndm for inference. It obtains an Inception score of 9.46 and a state-of-the-art FID score of 3.17 on the unconditional CIFAR10 dataset.", "model_name": "google/ddpm-ema-bedroom-256"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221121-063504", "api_call": "AutoModelForImageClassification.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-221121-063504')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.3533, "Mae": 0.26680000000000004, "Rmse": 0.37160000000000004, "Abs Rel": 0.3427, "Log Mae": 0.11670000000000001, "Log Rmse": 0.1703, "Delta1": 0.5522, "Delta2": 0.8362, "Delta3": 0.9382}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset for depth estimation.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221121-063504"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "dandelin/vilt-b32-finetuned-vqa", "api_call": "ViltForQuestionAnswering.from_pretrained('dandelin/vilt-b32-finetuned-vqa')", "performance": {"dataset": "VQAv2", "accuracy": "to do"}, "description": "Vision-and-Language Transformer (ViLT) model fine-tuned on VQAv2. It was introduced in the paper ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision by Kim et al. and first released in this repository.", "model_name": "dandelin/vilt-b32-finetuned-vqa"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Blood Cell Detection", "api_name": "keremberke/yolov8n-blood-cell-detection", "api_call": "YOLO('keremberke/yolov8n-blood-cell-detection')", "performance": {"dataset": "blood-cell-object-detection", "accuracy": 0.893}, "description": "This model detects blood cells in images, specifically Platelets, RBC, and WBC. It is based on the YOLOv8 architecture and trained on the blood-cell-object-detection dataset.", "model_name": "keremberke/yolov8n-blood-cell-detection"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "EleutherAI/gpt-neo-2.7B", "api_call": "pipeline('text-generation', model='EleutherAI/gpt-neo-2.7B')", "performance": {"dataset": "the_pile", "accuracy": {"Lambada_Acc": "62.22%", "Winogrande": "56.50%", "Hellaswag": "42.73%"}}, "description": "GPT-Neo 2.7B is a transformer model designed using EleutherAI's replication of the GPT-3 architecture. It was trained on the Pile, a large scale curated dataset created by EleutherAI for the purpose of training this model. This model is best suited for generating texts from a prompt and can be used directly with a pipeline for text generation.", "model_name": "EleutherAI/gpt-neo-2.7B"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Speech Enhancement", "api_name": "speechbrain/sepformer-whamr-enhancement", "api_call": "separator.from_hparams(source='speechbrain/sepformer-whamr-enhancement', savedir='pretrained_models/sepformer-whamr-enhancement')", "performance": {"dataset": "WHAMR!", "accuracy": "10.59 dB SI-SNR"}, "description": "This repository provides all the necessary tools to perform speech enhancement (denoising + dereverberation) with a SepFormer model, implemented with SpeechBrain, and pretrained on WHAMR! dataset with 8k sampling frequency, which is basically a version of WSJ0-Mix dataset with environmental noise and reverberation in 8k.", "model_name": "speechbrain/sepformer-whamr-enhancement"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "moussaKam/barthez-orangesum-abstract", "api_call": "BarthezModel.from_pretrained('moussaKam/barthez-orangesum-abstract')", "performance": {"dataset": "orangeSum", "accuracy": ""}, "description": "Barthez model finetuned on orangeSum for abstract generation in French language", "model_name": "moussaKam/barthez-orangesum-abstract"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221122-030603", "api_call": "pipeline('depth-estimation', model='sayakpaul/glpn-nyu-finetuned-diode-221122-030603')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.3597, "Mae": 0.3054, "Rmse": 0.4481, "Abs Rel": 0.3462, "Log Mae": 0.12560000000000002, "Log Rmse": 0.17980000000000002, "Delta1": 0.5278, "Delta2": 0.8055, "Delta3": 0.9191}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221122-030603"}
{"domain": "Audio Audio-to-Audio", "framework": "Fairseq", "functionality": "speech-to-speech-translation", "api_name": "facebook/textless_sm_en_fr", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/textless_sm_en_fr')", "performance": {"dataset": "", "accuracy": ""}, "description": "This model is a speech-to-speech translation model trained by Facebook. It is designed for translating English speech to French speech.", "model_name": "facebook/textless_sm_en_fr"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text2Text Generation", "api_name": "google/byt5-small", "api_call": "T5ForConditionalGeneration.from_pretrained('google/byt5-small')", "performance": {"dataset": "mc4", "accuracy": "Not provided"}, "description": "ByT5 is a tokenizer-free version of Google's T5 and generally follows the architecture of MT5. ByT5 was only pre-trained on mC4 excluding any supervised training with an average span-mask of 20 UTF-8 characters. Therefore, this model has to be fine-tuned before it is usable on a downstream task. ByT5 works especially well on noisy text data, e.g., google/byt5-small significantly outperforms mt5-small on TweetQA.", "model_name": "google/byt5-small"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "AICVTG_What_if_a_machine_could_create_captions_automatically", "api_call": "VisionEncoderDecoderModel.from_pretrained('facebook/mmt-en-de')", "performance": {"dataset": "Not specified", "accuracy": "Not specified"}, "description": "This is an image captioning model trained by Zayn", "model_name": "facebook/mmt-en-de"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "keremberke/yolov8m-building-segmentation", "api_call": "YOLO('keremberke/yolov8m-building-segmentation')", "performance": {"dataset": "satellite-building-segmentation", "accuracy": {"mAP@0.5(box)": 0.623, "mAP@0.5(mask)": 0.613}}, "description": "A YOLOv8 model for building segmentation in satellite images. It can detect and segment buildings in the input images.", "model_name": "keremberke/yolov8m-building-segmentation"}
{"domain": "Reinforcement Learning", "framework": "Stable-Baselines3", "functionality": "CartPole-v1", "api_name": "dqn-CartPole-v1", "api_call": "load_from_hub(repo_id='sb3/dqn-CartPole-v1',filename='{MODEL FILENAME}.zip',)", "performance": {"dataset": "CartPole-v1", "accuracy": "500.00 +/- 0.00"}, "description": "This is a trained model of a DQN agent playing CartPole-v1 using the stable-baselines3 library and the RL Zoo. The RL Zoo is a training framework for Stable Baselines3 reinforcement learning agents, with hyperparameter optimization and pre-trained agents included.", "model_name": "sb3/dqn-CartPole-v1"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "johnowhitaker/sd-class-wikiart-from-bedrooms", "api_call": "DDPMPipeline.from_pretrained('johnowhitaker/sd-class-wikiart-from-bedrooms')", "performance": {"dataset": "https://huggingface.co/datasets/huggan/wikiart", "accuracy": "Not provided"}, "description": "This model is a diffusion model initialized from https://huggingface.co/google/ddpm-bedroom-256 and trained for 5000 steps on https://huggingface.co/datasets/huggan/wikiart.", "model_name": "johnowhitaker/sd-class-wikiart-from-bedrooms"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/maskformer-swin-tiny-coco", "api_call": "MaskFormerForInstanceSegmentation.from_pretrained('facebook/maskformer-swin-tiny-coco')", "performance": {"dataset": "COCO panoptic segmentation", "accuracy": "Not provided"}, "description": "MaskFormer model trained on COCO panoptic segmentation (tiny-sized version, Swin backbone). It was introduced in the paper Per-Pixel Classification is Not All You Need for Semantic Segmentation and first released in this repository.", "model_name": "facebook/maskformer-swin-tiny-coco"}
{"domain": "Tabular Tabular Regression", "framework": "Scikit-learn", "functionality": "skops", "api_name": "rajistics/MAPIE-TS-Electricity", "api_call": "RandomForestRegressor(max_depth=10, n_estimators=50, random_state=59)", "performance": {"dataset": "", "accuracy": ""}, "description": "A RandomForestRegressor model for electricity consumption prediction.", "model_name": "RandomForestRegressor(max_depth=10, n_estimators=50, random_state=59)"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "prompthero/openjourney", "api_call": "StableDiffusionPipeline.from_pretrained('prompthero/openjourney', torch_dtype=torch.float16)", "performance": {"dataset": "Midjourney images", "accuracy": "Not specified"}, "description": "Openjourney is an open source Stable Diffusion fine-tuned model on Midjourney images, by PromptHero. It can be used for generating AI art based on text prompts.", "model_name": "prompthero/openjourney"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "facebook/bart-large", "api_call": "BartModel.from_pretrained('facebook/bart-large')", "performance": {"dataset": "arxiv", "accuracy": "Not provided"}, "description": "BART is a transformer encoder-decoder (seq2seq) model with a bidirectional (BERT-like) encoder and an autoregressive (GPT-like) decoder. BART is pre-trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text. BART is particularly effective when fine-tuned for text generation (e.g. summarization, translation) but also works well for comprehension tasks (e.g. text classification, question answering).", "model_name": "facebook/bart-large"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "convnextv2_huge.fcmae_ft_in1k", "api_call": "timm.create_model('convnextv2_huge.fcmae_ft_in1k', pretrained=True)", "performance": {"dataset": "imagenet-1k", "accuracy": 86.256}, "description": "A ConvNeXt-V2 image classification model. Pretrained with a fully convolutional masked autoencoder framework (FCMAE) and fine-tuned on ImageNet-1k.", "model_name": "timm/convnextv2_huge.fcmae_ft_in1k"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Transformers", "functionality": "Summarization", "api_name": "pszemraj/long-t5-tglobal-base-16384-book-summary", "api_call": "T5ForConditionalGeneration.from_pretrained('pszemraj/long-t5-tglobal-base-16384-book-summary')", "performance": {"dataset": "kmfoda/booksum", "accuracy": {"ROUGE-1": 36.408, "ROUGE-2": 6.065, "ROUGE-L": 16.721, "ROUGE-LSUM": 33.34}}, "description": "A fine-tuned version of google/long-t5-tglobal-base on the kmfoda/booksum dataset, which can be used to summarize long text and generate SparkNotes-esque summaries of arbitrary topics. The model generalizes reasonably well to academic and narrative text.", "model_name": "pszemraj/long-t5-tglobal-base-16384-book-summary"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "dreamlike-art/dreamlike-diffusion-1.0", "api_call": "StableDiffusionPipeline.from_pretrained('dreamlike-art/dreamlike-diffusion-1.0', torch_dtype=torch.float16)", "performance": {"dataset": "high quality art", "accuracy": "not provided"}, "description": "Dreamlike Diffusion 1.0 is SD 1.5 fine tuned on high quality art, made by dreamlike.art.", "model_name": "dreamlike-art/dreamlike-diffusion-1.0"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face Transformers", "functionality": "Image Super-Resolution", "api_name": "caidas/swin2SR-classical-sr-x2-64", "api_call": "Swin2SRForImageSuperResolution.from_pretrained('caidas/swin2sr-classical-sr-x2-64')", "performance": {"dataset": "arxiv: 2209.11345", "accuracy": "Not provided"}, "description": "Swin2SR model that upscales images x2. It was introduced in the paper Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration by Conde et al. and first released in this repository.", "model_name": "caidas/swin2sr-classical-sr-x2-64"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "videomae-small-finetuned-kinetics", "api_call": "VideoMAEForVideoClassification.from_pretrained('MCG-NJU/videomae-small-finetuned-kinetics')", "performance": {"dataset": "Kinetics-400", "accuracy": {"top-1": 79.0, "top-5": 93.8}}, "description": "VideoMAE model pre-trained for 1600 epochs in a self-supervised way and fine-tuned in a supervised way on Kinetics-400. It was introduced in the paper VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training by Tong et al. and first released in this repository.", "model_name": "MCG-NJU/videomae-small-finetuned-kinetics"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "lmazzon70/videomae-base-finetuned-kinetics-finetuned-rwf2000mp4-epochs8-batch8-kb", "api_call": "AutoModelForVideoClassification.from_pretrained('lmazzon70/videomae-base-finetuned-kinetics-finetuned-rwf2000mp4-epochs8-batch8-kb')", "performance": {"dataset": "unknown", "accuracy": 0.7453000000000001}, "description": "This model is a fine-tuned version of MCG-NJU/videomae-base-finetuned-kinetics on an unknown dataset.", "model_name": "lmazzon70/videomae-base-finetuned-kinetics-finetuned-rwf2000mp4-epochs8-batch8-kb"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/paraphrase-distilroberta-base-v2", "api_call": "SentenceTransformer('sentence-transformers/paraphrase-distilroberta-base-v2')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Automated evaluation"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/paraphrase-distilroberta-base-v2"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Named Entity Recognition", "api_name": "flair/ner-english-ontonotes-fast", "api_call": "SequenceTagger.load('flair/ner-english-ontonotes-fast')", "performance": {"dataset": "Ontonotes", "accuracy": "F1-Score: 89.3"}, "description": "This is the fast version of the 18-class NER model for English that ships with Flair. It predicts 18 tags such as cardinal value, date value, event name, building name, geo-political entity, language name, law name, location name, money name, affiliation, ordinal value, organization name, percent value, person name, product name, quantity value, time value, and name of work of art. The model is based on Flair embeddings and LSTM-CRF.", "model_name": "flair/ner-english-ontonotes-fast"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "google/mobilenet_v2_1.0_224", "api_call": "AutoModelForImageClassification.from_pretrained('google/mobilenet_v2_1.0_224')", "performance": {"dataset": "imagenet-1k", "accuracy": "Not specified"}, "description": "MobileNet V2 model pre-trained on ImageNet-1k at resolution 224x224. It was introduced in MobileNetV2: Inverted Residuals and Linear Bottlenecks by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. MobileNets are small, low-latency, low-power models parameterized to meet the resource constraints of a variety of use cases. They can be built upon for classification, detection, embeddings and segmentation similar to how other popular large scale models, such as Inception, are used. MobileNets can be run efficiently on mobile devices.", "model_name": "google/mobilenet_v2_1.0_224"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Masked Language Modeling Prediction", "api_name": "CodeBERTa-small-v1", "api_call": "pipeline('fill-mask', model='huggingface/CodeBERTa-small-v1')", "performance": {"dataset": "code_search_net", "accuracy": null}, "description": "CodeBERTa is a RoBERTa-like model trained on the CodeSearchNet dataset from GitHub. It supports languages like Go, Java, JavaScript, PHP, Python, and Ruby. The tokenizer is a Byte-level BPE tokenizer trained on the corpus using Hugging Face tokenizers. The small model is a 6-layer, 84M parameters, RoBERTa-like Transformer model.", "model_name": "huggingface/CodeBERTa-small-v1"}
{"domain": "Reinforcement Learning", "framework": "Stable-Baselines3", "functionality": "deep-reinforcement-learning", "api_name": "td3-Ant-v3", "api_call": "load_from_hub(repo_id='sb3/td3-Ant-v3',filename='{MODEL FILENAME}.zip',)", "performance": {"dataset": "Ant-v3", "accuracy": "5822.96 +/- 93.33"}, "description": "This is a trained model of a TD3 agent playing Ant-v3 using the stable-baselines3 library and the RL Zoo. The RL Zoo is a training framework for Stable Baselines3 reinforcement learning agents, with hyperparameter optimization and pre-trained agents included.", "model_name": "sb3/td3-Ant-v3"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "sshleifer/tiny-marian-en-de", "api_call": "pipeline('translation_en_to_de', model='sshleifer/tiny-marian-en-de')", "performance": {"dataset": "", "accuracy": ""}, "description": "A tiny English to German translation model using the Marian framework in Hugging Face Transformers.", "model_name": "sshleifer/tiny-marian-en-de"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "Helsinki-NLP/opus-mt-zh-en", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-zh-en')", "performance": {"dataset": "opus", "accuracy": {"BLEU": 36.1, "chr-F": 0.548}}, "description": "A Chinese to English translation model developed by the Language Technology Research Group at the University of Helsinki. It is based on the Marian NMT framework and trained on the OPUS dataset.", "model_name": "Helsinki-NLP/opus-mt-zh-en"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "tiny-random-VideoMAEForVideoClassification", "api_call": "VideoClassificationPipeline(model='hf-tiny-model-private/tiny-random-VideoMAEForVideoClassification')", "performance": {"dataset": "", "accuracy": ""}, "description": "A tiny random VideoMAE model for video classification.", "model_name": "hf-tiny-model-private/tiny-random-VideoMAEForVideoClassification"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "MCG-NJU/videomae-base-short-finetuned-kinetics", "api_call": "VideoMAEForVideoClassification.from_pretrained('MCG-NJU/videomae-base-short-finetuned-kinetics')", "performance": {"dataset": "Kinetics-400", "accuracy": {"top-1": 79.4, "top-5": 94.1}}, "description": "VideoMAE model pre-trained for 800 epochs in a self-supervised way and fine-tuned in a supervised way on Kinetics-400. It was introduced in the paper VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training by Tong et al. and first released in this repository.", "model_name": "MCG-NJU/videomae-base-short-finetuned-kinetics"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Diffusers", "api_name": "google/ddpm-ema-church-256", "api_call": "DDPMPipeline.from_pretrained('google/ddpm-ema-church-256')", "performance": {"dataset": "CIFAR10", "accuracy": {"Inception score": 9.46, "FID score": 3.17}}, "description": "Denoising Diffusion Probabilistic Models (DDPM) is a class of latent variable models inspired by nonequilibrium thermodynamics. It is used for high-quality image synthesis. DDPM models can use discrete noise schedulers such as scheduling_ddpm, scheduling_ddim, and scheduling_pndm for inference. The model can be used with different pipelines for faster inference and better trade-off between quality and speed.", "model_name": "google/ddpm-ema-church-256"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/tapex-large", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('microsoft/tapex-large')", "performance": {"dataset": "", "accuracy": ""}, "description": "TAPEX (Table Pre-training via Execution) is a conceptually simple and empirically powerful pre-training approach to empower existing models with table reasoning skills. TAPEX realizes table pre-training by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically synthesizing executable SQL queries. TAPEX is based on the BART architecture, the transformer encoder-decoder (seq2seq) model with a bidirectional (BERT-like) encoder and an autoregressive (GPT-like) decoder.", "model_name": "microsoft/tapex-large"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Named Entity Recognition", "api_name": "flair/ner-english-ontonotes", "api_call": "SequenceTagger.load('flair/ner-english-ontonotes')", "performance": {"dataset": "Ontonotes", "accuracy": "89.27"}, "description": "This is the 18-class NER model for English that ships with Flair. It predicts 18 tags such as cardinal value, date value, event name, building name, geo-political entity, language name, law name, location name, money name, affiliation, ordinal value, organization name, percent value, person name, product name, quantity value, time value, and name of work of art. Based on Flair embeddings and LSTM-CRF.", "model_name": "flair/ner-english-ontonotes"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "OFA-Sys/chinese-clip-vit-large-patch14-336px", "api_call": "ChineseCLIPModel.from_pretrained('OFA-Sys/chinese-clip-vit-large-patch14-336px')", "performance": {"dataset": {"CIFAR10": 96.0, "CIFAR100": 79.75, "DTD": 51.2, "EuroSAT": 52.0, "FER": 55.1, "FGVC": 26.2, "KITTI": 49.9, "MNIST": 79.4, "PC": 63.5, "VOC": 84.9}, "accuracy": "various"}, "description": "Chinese CLIP is a simple implementation of CLIP on a large-scale dataset of around 200 million Chinese image-text pairs. It uses ViT-L/14@336px as the image encoder and RoBERTa-wwm-base as the text encoder.", "model_name": "OFA-Sys/chinese-clip-vit-large-patch14-336px"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221122-044810", "api_call": "pipeline('depth-estimation', model='sayakpaul/glpn-nyu-finetuned-diode-221122-044810')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.369, "Mae": 0.2909, "Rmse": 0.4208, "Abs Rel": 0.3635, "Log Mae": 0.12240000000000001, "Log Rmse": 0.17930000000000001, "Delta1": 0.5323, "Delta2": 0.8179000000000001, "Delta3": 0.9258000000000001}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221122-044810"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Grammar Correction", "api_name": "vennify/t5-base-grammar-correction", "api_call": "HappyTextToText('T5', 'vennify/t5-base-grammar-correction')", "performance": {"dataset": "jfleg", "accuracy": "Not provided"}, "description": "This model generates a revised version of inputted text with the goal of containing fewer grammatical errors. It was trained with Happy Transformer using a dataset called JFLEG.", "model_name": "vennify/t5-base-grammar-correction"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Visual Question Answering", "api_name": "temp_vilt_vqa", "api_call": "pipeline('visual-question-answering', model='Bingsu/temp_vilt_vqa', tokenizer='Bingsu/temp_vilt_vqa')", "performance": {"dataset": "", "accuracy": ""}, "description": "A visual question answering model for answering questions related to images using the Hugging Face Transformers library.", "model_name": "Bingsu/temp_vilt_vqa"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-large-finetuned-wtq", "api_call": "pipeline('table-question-answering', model='google/tapas-large-finetuned-wtq')", "performance": {"dataset": "wikitablequestions", "accuracy": 0.5097}, "description": "TAPAS large model fine-tuned on WikiTable Questions (WTQ). This model was pre-trained on MLM and an additional step which the authors call intermediate pre-training, and then fine-tuned in a chain on SQA, WikiSQL and finally WTQ. It uses relative position embeddings (i.e. resetting the position index at every cell of the table).", "model_name": "google/tapas-large-finetuned-wtq"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "mattmdjaga/segformer_b2_clothes", "api_call": "SegformerForSemanticSegmentation.from_pretrained('mattmdjaga/segformer_b2_clothes')", "performance": {"dataset": "mattmdjaga/human_parsing_dataset", "accuracy": "Not provided"}, "description": "SegFormer model fine-tuned on ATR dataset for clothes segmentation.", "model_name": "mattmdjaga/segformer_b2_clothes"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "Kirili4ik/mbart_ruDialogSum", "api_call": "MBartForConditionalGeneration.from_pretrained('Kirili4ik/mbart_ruDialogSum')", "performance": {"dataset": [{"name": "SAMSum Corpus (translated to Russian)", "accuracy": {"Validation ROGUE-1": 34.5, "Validation ROGUE-L": 33, "Test ROGUE-1": 31, "Test ROGUE-L": 28}}]}, "description": "MBart for Russian summarization fine-tuned for dialogue summarization. This model was first fine-tuned by Ilya Gusev on the Gazeta dataset. We then fine-tuned that model on the SamSum dataset translated to Russian using GoogleTranslateAPI. Moreover, we have implemented a Telegram bot, @summarization_bot, with the inference of this model. Add it to the chat and get summaries instead of dozens of spam messages!", "model_name": "Kirili4ik/mbart_ruDialogSum"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/mask2former-swin-base-coco-panoptic", "api_call": "Mask2FormerForUniversalSegmentation.from_pretrained('facebook/mask2former-swin-base-coco-panoptic')", "performance": {"dataset": "COCO panoptic segmentation", "accuracy": null}, "description": "Mask2Former model trained on COCO panoptic segmentation (base-sized version, Swin backbone). It was introduced in the paper Masked-attention Mask Transformer for Universal Image Segmentation and first released in this repository. Mask2Former addresses instance, semantic and panoptic segmentation with the same paradigm: by predicting a set of masks and corresponding labels. Hence, all 3 tasks are treated as if they were instance segmentation. Mask2Former outperforms the previous SOTA, MaskFormer both in terms of performance and efficiency.", "model_name": "facebook/mask2former-swin-base-coco-panoptic"}
{"domain": "Multimodal Text-to-Video", "framework": "Hugging Face", "functionality": "Text-to-Video", "api_name": "duncan93/video", "api_call": "BaseModel.from_pretrained('duncan93/video')", "performance": {"dataset": "OpenAssistant/oasst1", "accuracy": ""}, "description": "A text-to-video model trained on OpenAssistant/oasst1 dataset.", "model_name": "duncan93/video"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "xlnet-base-cased", "api_call": "XLNetModel.from_pretrained('xlnet-base-cased')", "performance": {"dataset": "bookcorpus, wikipedia", "accuracy": "state-of-the-art (SOTA) results on various downstream language tasks"}, "description": "XLNet model pre-trained on English language. It was introduced in the paper XLNet: Generalized Autoregressive Pretraining for Language Understanding by Yang et al. and first released in this repository. XLNet is a new unsupervised language representation learning method based on a novel generalized permutation language modeling objective. Additionally, XLNet employs Transformer-XL as the backbone model, exhibiting excellent performance for language tasks involving long context.", "model_name": "xlnet-base-cased"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image Diffusion Models", "api_name": "lllyasviel/control_v11p_sd15_mlsd", "api_call": "ControlNetModel.from_pretrained('lllyasviel/control_v11p_sd15_mlsd')", "performance": {"dataset": "MLSD", "accuracy": "Not provided"}, "description": "Controlnet v1.1 is a neural network structure to control diffusion models by adding extra conditions. It can be used in combination with Stable Diffusion, such as runwayml/stable-diffusion-v1-5. This checkpoint corresponds to the ControlNet conditioned on MLSD images.", "model_name": "lllyasviel/control_v11p_sd15_mlsd"}
{"domain": "Natural Language Processing Text Classification", "framework": "Transformers", "functionality": "Sentiment Analysis", "api_name": "michellejieli/NSFW_text_classifier", "api_call": "pipeline('sentiment-analysis', model='michellejieli/NSFW_text_classification')", "performance": {"dataset": "Reddit posts", "accuracy": "Not specified"}, "description": "DistilBERT is a transformer model that performs sentiment analysis. I fine-tuned the model on Reddit posts with the purpose of classifying not safe for work (NSFW) content, specifically text that is considered inappropriate and unprofessional. The model predicts 2 classes, which are NSFW or safe for work (SFW). The model is a fine-tuned version of DistilBERT. It was fine-tuned on 14317 Reddit posts pulled from the Reddit API.", "model_name": "michellejieli/NSFW_text_classification"}
{"domain": "Natural Language Processing Token Classification", "framework": "Transformers", "functionality": "Named Entity Recognition", "api_name": "dslim/bert-base-NER-uncased", "api_call": "pipeline('ner', model='dslim/bert-base-NER-uncased')", "performance": {"dataset": "", "accuracy": ""}, "description": "A pretrained BERT model for Named Entity Recognition (NER) on uncased text. It can be used to extract entities such as person names, locations, and organizations from text.", "model_name": "dslim/bert-base-NER-uncased"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "bigwiz83/sapbert-from-pubmedbert-squad2", "api_call": "pipeline('question-answering', model='bigwiz83/sapbert-from-pubmedbert-squad2')", "performance": {"dataset": "squad_v2", "accuracy": "1.2582"}, "description": "This model is a fine-tuned version of cambridgeltl/SapBERT-from-PubMedBERT-fulltext on the squad_v2 dataset.", "model_name": "bigwiz83/sapbert-from-pubmedbert-squad2"}
{"domain": "Reinforcement Learning", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "edbeeching/decision-transformer-gym-halfcheetah-expert", "api_call": "AutoModel.from_pretrained('edbeeching/decision-transformer-gym-halfcheetah-expert')", "performance": {"dataset": "Gym HalfCheetah environment", "accuracy": "Not specified"}, "description": "Decision Transformer model trained on expert trajectories sampled from the Gym HalfCheetah environment", "model_name": "edbeeching/decision-transformer-gym-halfcheetah-expert"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/git-base-textvqa", "api_call": "AutoModel.from_pretrained('microsoft/git-base-textvqa')", "performance": {"dataset": "TextVQA", "accuracy": "Refer to the paper"}, "description": "GIT (GenerativeImage2Text), base-sized, fine-tuned on TextVQA. It is a Transformer decoder conditioned on both CLIP image tokens and text tokens. The model is trained using 'teacher forcing' on a lot of (image, text) pairs. The goal for the model is to predict the next text token, giving the image tokens and previous text tokens. It can be used for tasks like image and video captioning, visual question answering (VQA) on images and videos, and even image classification.", "model_name": "microsoft/git-base-textvqa"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Normal Map Estimation", "api_name": "lllyasviel/sd-controlnet-normal", "api_call": "ControlNetModel.from_pretrained('lllyasviel/sd-controlnet-normal')", "performance": {"dataset": "DIODE", "accuracy": "Not provided"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on Normal Map Estimation. It can be used in combination with Stable Diffusion.", "model_name": "lllyasviel/sd-controlnet-normal"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Diffusers", "api_name": "google/ddpm-celebahq-256", "api_call": "DDPMPipeline.from_pretrained('ddpm-celebahq-256')", "performance": {"dataset": "CIFAR10", "accuracy": {"Inception_score": 9.46, "FID_score": 3.17}}, "description": "Denoising Diffusion Probabilistic Models (DDPM) for high quality image synthesis. Trained on the unconditional CIFAR10 dataset and 256x256 LSUN, obtaining state-of-the-art FID score of 3.17 and Inception score of 9.46.", "model_name": "ddpm-celebahq-256"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "lysandre/tiny-tapas-random-sqa", "api_call": "TapasForCovid.from_pretrained('lysandre/tiny-tapas-random-sqa')", "performance": {"dataset": null, "accuracy": null}, "description": "A tiny TAPAS model for table question answering tasks.", "model_name": "lysandre/tiny-tapas-random-sqa"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "nvidia/segformer-b5-finetuned-ade-640-640", "api_call": "SegformerForSemanticSegmentation.from_pretrained('nvidia/segformer-b5-finetuned-ade-640-640')", "performance": {"dataset": "ADE20K", "accuracy": "Not provided"}, "description": "SegFormer model fine-tuned on ADE20k at resolution 640x640. It was introduced in the paper SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers by Xie et al. and first released in this repository.", "model_name": "nvidia/segformer-b5-finetuned-ade-640-640"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "distilbert-base-uncased-distilled-squad", "api_call": "pipeline('question-answering', model='distilbert-base-uncased-distilled-squad')", "performance": {"dataset": "SQuAD v1.1", "accuracy": "86.9 F1 score"}, "description": "DistilBERT base uncased distilled SQuAD is a fine-tuned version of DistilBERT-base-uncased for the task of question answering. It has 40% less parameters than bert-base-uncased, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language understanding benchmark.", "model_name": "distilbert-base-uncased-distilled-squad"}
{"domain": "Tabular Tabular Regression", "framework": "Keras", "functionality": "anomaly-detection", "api_name": "keras-io/timeseries-anomaly-detection", "api_call": "TFAutoModelForSequenceClassification.from_pretrained('keras-io/timeseries-anomaly-detection')", "performance": {"dataset": "Numenta Anomaly Benchmark(NAB)", "accuracy": {"Train Loss": 0.006, "Validation Loss": 0.008}}, "description": "This script demonstrates how you can use a reconstruction convolutional autoencoder model to detect anomalies in timeseries data. We will use the Numenta Anomaly Benchmark(NAB) dataset. It provides artificial timeseries data containing labeled anomalous periods of behavior. Data are ordered, timestamped, single-valued metrics.", "model_name": "keras-io/timeseries-anomaly-detection"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Hugging Face Transformers", "functionality": "Fill-Mask", "api_name": "cl-tohoku/bert-base-japanese-whole-word-masking", "api_call": "AutoModelForMaskedLM.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')", "performance": {"dataset": "Japanese Wikipedia", "accuracy": "Not provided"}, "description": "This is a BERT model pretrained on texts in the Japanese language. This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective.", "model_name": "cl-tohoku/bert-base-japanese-whole-word-masking"}
{"domain": "Multimodal Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "DeepPavlov/rubert-base-cased", "api_call": "AutoModel.from_pretrained('DeepPavlov/rubert-base-cased')", "performance": {"dataset": "Russian part of Wikipedia and news data", "accuracy": ""}, "description": "RuBERT (Russian, cased, 12\u2011layer, 768\u2011hidden, 12\u2011heads, 180M parameters) was trained on the Russian part of Wikipedia and news data. We used this training data to build a vocabulary of Russian subtokens and took a multilingual version of BERT\u2011base as an initialization for RuBERT[1].", "model_name": "DeepPavlov/rubert-base-cased"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Many-to-Many multilingual translation", "api_name": "facebook/m2m100_1.2B", "api_call": "M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_1.2B')", "performance": {"dataset": "M2M100", "accuracy": "Not specified"}, "description": "M2M100 is a multilingual encoder-decoder (seq-to-seq) model trained for Many-to-Many multilingual translation. It can directly translate between the 9,900 directions of 100 languages. To translate into a target language, the target language id is forced as the first generated token.", "model_name": "facebook/m2m100_1.2B"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "superb/hubert-base-superb-ks", "api_call": "pipeline('audio-classification', model='superb/hubert-base-superb-ks')", "performance": {"dataset": "Speech Commands dataset v1.0", "accuracy": 0.9672000000000001}, "description": "This is a ported version of S3PRL's Hubert for the SUPERB Keyword Spotting task. The base model is hubert-base-ls960, which is pretrained on 16kHz sampled speech audio. When using the model make sure that your speech input is also sampled at 16kHz. For more information refer to SUPERB: Speech processing Universal PERformance Benchmark.", "model_name": "superb/hubert-base-superb-ks"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "andite/anything-v4.0", "api_call": "StableDiffusionPipeline.from_pretrained('andite/anything-v4.0', torch_dtype=torch.float16)", "performance": {"dataset": "Not specified", "accuracy": "Not specified"}, "description": "Anything V4 is a latent diffusion model for generating high-quality, highly detailed anime-style images with just a few prompts. It supports danbooru tags to generate images and can be used just like any other Stable Diffusion model.", "model_name": "andite/anything-v4.0"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "hakurei/waifu-diffusion", "api_call": "StableDiffusionPipeline.from_pretrained('hakurei/waifu-diffusion', torch_dtype=torch.float32)", "performance": {"dataset": "high-quality anime images", "accuracy": "not available"}, "description": "waifu-diffusion is a latent text-to-image diffusion model that has been conditioned on high-quality anime images through fine-tuning.", "model_name": "hakurei/waifu-diffusion"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Hugging Face Transformers", "functionality": "Fill-Mask", "api_name": "camembert-base", "api_call": "pipeline('fill-mask', model='camembert-base', tokenizer='camembert-base')", "performance": {"dataset": "oscar", "accuracy": "N/A"}, "description": "CamemBERT is a state-of-the-art language model for French based on the RoBERTa model. It is available on Hugging Face in 6 different versions with varying number of parameters, amount of pretraining data, and pretraining data source domains. It can be used for Fill-Mask tasks.", "model_name": "camembert-base"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Speech Enhancement", "api_name": "sepformer-wham-enhancement", "api_call": "separator.from_hparams(source='speechbrain/sepformer-wham-enhancement', savedir='pretrained_models/sepformer-wham-enhancement')", "performance": {"dataset": "WHAM!", "accuracy": "14.35 dB SI-SNR"}, "description": "This repository provides all the necessary tools to perform speech enhancement (denoising) with a SepFormer model, implemented with SpeechBrain, and pretrained on WHAM! dataset with 8k sampling frequency, which is basically a version of WSJ0-Mix dataset with environmental noise and reverberation in 8k.", "model_name": "speechbrain/sepformer-wham-enhancement"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Sentiment Inferencing for stock-related comments", "api_name": "zhayunduo/roberta-base-stocktwits-finetuned", "api_call": "RobertaForSequenceClassification.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')", "performance": {"dataset": "stocktwits", "accuracy": 0.9343}, "description": "This model is fine-tuned with roberta-base model on 3,200,000 comments from stocktwits, with the user-labeled tags 'Bullish' or 'Bearish'.", "model_name": "zhayunduo/roberta-base-stocktwits-finetuned"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "LayoutLMX_pt_question_answer_ocrazure_correct_V16_07_04_2023", "api_call": "AutoModelForDocumentQuestionAnswering.from_pretrained('L-oenai/LayoutLMX_pt_question_answer_ocrazure_correct_V15_30_03_2023')", "performance": {"dataset": "", "accuracy": ""}, "description": "A LayoutLMv2 model for document question answering.", "model_name": "L-oenai/LayoutLMX_pt_question_answer_ocrazure_correct_V15_30_03_2023"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "superb/hubert-large-superb-sid", "api_call": "pipeline('audio-classification', model='superb/hubert-large-superb-sid')", "performance": {"dataset": "VoxCeleb1", "accuracy": 0.9035000000000001}, "description": "Hubert-Large for Speaker Identification. This model is pretrained on 16kHz sampled speech audio and should be used with speech input also sampled at 16kHz. It is used for the SUPERB Speaker Identification task and can classify each utterance for its speaker identity as a multi-class classification.", "model_name": "superb/hubert-large-superb-sid"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "microsoft/DialoGPT-medium", "api_call": "AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-medium')", "performance": {"dataset": "Reddit", "accuracy": "Comparable to human response quality under a single-turn conversation Turing test"}, "description": "DialoGPT is a SOTA large-scale pretrained dialogue response generation model for multiturn conversations. The model is trained on 147M multi-turn dialogue from Reddit discussion thread.", "model_name": "microsoft/DialoGPT-medium"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/maskformer-swin-large-ade", "api_call": "MaskFormerForInstanceSegmentation.from_pretrained('facebook/maskformer-swin-large-ade')", "performance": {"dataset": "ADE20k", "accuracy": "Not provided"}, "description": "MaskFormer model trained on ADE20k semantic segmentation (large-sized version, Swin backbone). It was introduced in the paper Per-Pixel Classification is Not All You Need for Semantic Segmentation and first released in this repository. This model addresses instance, semantic and panoptic segmentation with the same paradigm: by predicting a set of masks and corresponding labels. Hence, all 3 tasks are treated as if they were instance segmentation.", "model_name": "facebook/maskformer-swin-large-ade"}
{"domain": "Tabular Tabular Classification", "framework": "Scikit-learn", "functionality": "Binary Classification", "api_name": "danupurnomo/dummy-titanic", "api_call": "load_model(cached_download(hf_hub_url('danupurnomo/dummy-titanic', 'titanic_model.h5')))", "performance": {"dataset": "Titanic", "accuracy": "Not provided"}, "description": "This model is a binary classifier for predicting whether a passenger on the Titanic survived or not, based on features such as passenger class, age, sex, fare, and more.", "model_name": "danupurnomo/dummy-titanic"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "openai/clip-vit-base-patch16", "api_call": "CLIPModel.from_pretrained('openai/clip-vit-base-patch16')", "performance": {"dataset": ["Food101", "CIFAR10", "CIFAR100", "Birdsnap", "SUN397", "Stanford Cars", "FGVC Aircraft", "VOC2007", "DTD", "Oxford-IIIT Pet dataset", "Caltech101", "Flowers102", "MNIST", "SVHN", "IIIT5K", "Hateful Memes", "SST-2", "UCF101", "Kinetics700", "Country211", "CLEVR Counting", "KITTI Distance", "STL-10", "RareAct", "Flickr30", "MSCOCO", "ImageNet", "ImageNet-A", "ImageNet-R", "ImageNet Sketch", "ObjectNet (ImageNet Overlap)", "Youtube-BB", "ImageNet-Vid"], "accuracy": "varies depending on the dataset"}, "description": "The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner.", "model_name": "openai/clip-vit-base-patch16"}
{"domain": "Natural Language Processing Text Generation", "framework": "Transformers", "functionality": "Text Generation", "api_name": "facebook/opt-1.3b", "api_call": "pipeline('text-generation', model='facebook/opt-1.3b')", "performance": {"dataset": "BookCorpus, CC-Stories, The Pile, Pushshift.io Reddit, CCNewsV2", "accuracy": "Not provided"}, "description": "OPT (Open Pre-trained Transformers) is a suite of decoder-only pre-trained transformers ranging from 125M to 175B parameters, trained to roughly match the performance and sizes of the GPT-3 class of models. It can be used for prompting for evaluation of downstream tasks as well as text generation.", "model_name": "facebook/opt-1.3b"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "mazkooleg/0-9up-wavlm-base-plus-ft", "api_call": "pipeline('audio-classification', model='mazkooleg/0-9up-wavlm-base-plus-ft')", "performance": {"dataset": "mazkooleg/0-9up_google_speech_commands_augmented_raw", "accuracy": 0.9973000000000001}, "description": "This model is a fine-tuned version of microsoft/wavlm-base-plus on the None dataset. It achieves the following results on the evaluation set: Loss: 0.0093, Accuracy: 0.9973.", "model_name": "mazkooleg/0-9up-wavlm-base-plus-ft"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Speech Recognition", "api_name": "jonatasgrosman/wav2vec2-large-xlsr-53-dutch", "api_call": "SpeechRecognitionModel('jonatasgrosman/wav2vec2-large-xlsr-53-dutch')", "performance": {"dataset": "Common Voice nl", "accuracy": {"Test WER": 15.72, "Test CER": 5.35, "Test WER (+LM)": 12.84, "Test CER (+LM)": 4.64}}, "description": "Fine-tuned XLSR-53 large model for speech recognition in Dutch. Fine-tuned on Dutch using the train and validation splits of Common Voice 6.1 and CSS10.", "model_name": "jonatasgrosman/wav2vec2-large-xlsr-53-dutch"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-medium-finetuned-wtq", "api_call": "pipeline('table-question-answering', model='google/tapas-medium-finetuned-wtq')", "performance": {"dataset": "wikitablequestions", "accuracy": 0.4324}, "description": "TAPAS medium model fine-tuned on WikiTable Questions (WTQ). This model is pretrained on a large corpus of English data from Wikipedia and is used for answering questions related to a table.", "model_name": "google/tapas-medium-finetuned-wtq"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "hyunwoongko/blenderbot-9B", "api_call": "pipeline('conversational', model='hyunwoongko/blenderbot-9B')", "performance": {"dataset": "blended_skill_talk", "accuracy": "Not provided"}, "description": "Building open-domain chatbots is a challenging area for machine learning research. While prior work has shown that scaling neural models in the number of parameters and the size of the data they are trained on gives improved results, we show that other ingredients are important for a high-performing chatbot. Good conversation requires a number of skills that an expert conversationalist blends in a seamless way: providing engaging talking points and listening to their partners, both asking and answering questions, and displaying knowledge, empathy and personality appropriately, depending on the situation. We show that large scale models can learn these skills when given appropriate training data and choice of generation strategy. We build variants of these recipes with 90M, 2.7B and 9.4B parameter neural models, and make our models and code publicly available. Human evaluations show our best models are superior to existing approaches in multi-turn dialogue in terms of engagingness and humanness measurements. We then discuss the limitations of this work by analyzing failure cases of our models.", "model_name": "hyunwoongko/blenderbot-9B"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "CompVis/ldm-celebahq-256", "api_call": "DiffusionPipeline.from_pretrained('CompVis/ldm-celebahq-256')", "performance": {"dataset": "CelebA-HQ", "accuracy": "N/A"}, "description": "Latent Diffusion Models (LDMs) achieve state-of-the-art synthesis results on image data and beyond by decomposing the image formation process into a sequential application of denoising autoencoders. LDMs enable high-resolution synthesis, semantic scene synthesis, super-resolution, and image inpainting while significantly reducing computational requirements compared to pixel-based DMs.", "model_name": "CompVis/ldm-celebahq-256"}
{"domain": "Audio Audio-to-Audio", "framework": "Fairseq", "functionality": "speech-to-speech-translation", "api_name": "xm_transformer_s2ut_en-hk", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/xm_transformer_s2ut_en-hk')", "performance": {"dataset": "MuST-C", "accuracy": "Not specified"}, "description": "Speech-to-speech translation model with single-pass decoder (S2UT) from fairseq: English-Hokkien. Trained with supervised data in TED domain, and weakly supervised data in TED and Audiobook domain.", "model_name": "facebook/xm_transformer_s2ut_en-hk"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image Generation", "api_name": "stabilityai/stable-diffusion-2-1-base", "api_call": "StableDiffusionPipeline.from_pretrained('stabilityai/stable-diffusion-2-1-base', scheduler=EulerDiscreteScheduler.from_pretrained(stabilityai/stable-diffusion-2-1-base, subfolder=scheduler), torch_dtype=torch.float16)", "performance": {"dataset": "COCO2017 validation set", "accuracy": "Not optimized for FID scores"}, "description": "Stable Diffusion v2-1-base is a diffusion-based text-to-image generation model that can generate and modify images based on text prompts. It is a Latent Diffusion Model that uses a fixed, pretrained text encoder (OpenCLIP-ViT/H). It is intended for research purposes only and can be used in areas such as safe deployment of models, understanding limitations and biases of generative models, generation of artworks, and research on generative models.", "model_name": "stabilityai/stable-diffusion-2-1-base"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "Narsil/deberta-large-mnli-zero-cls", "api_call": "DebertaModel.from_pretrained('Narsil/deberta-large-mnli-zero-cls')", "performance": {"dataset": {"SQuAD 1.1": {"F1": 95.5, "EM": 90.1}, "SQuAD 2.0": {"F1": 90.7, "EM": 88.0}, "MNLI-m/mm": {"Accuracy": 91.3}, "SST-2": {"Accuracy": 96.5}, "QNLI": {"Accuracy": 95.3}, "CoLA": {"MCC": 69.5}, "RTE": {"Accuracy": 91.0}, "MRPC": {"Accuracy": 92.6}, "QQP": {}, "STS-B": {"P/S": 92.8}}}, "description": "DeBERTa improves the BERT and RoBERTa models using disentangled attention and enhanced mask decoder. It outperforms BERT and RoBERTa on the majority of NLU tasks with 80GB training data. This is the DeBERTa large model fine-tuned with MNLI task.", "model_name": "Narsil/deberta-large-mnli-zero-cls"}
{"domain": "Natural Language Processing Feature Extraction", "framework": "PyTorch Transformers", "functionality": "Feature Extraction", "api_name": "kobart-base-v2", "api_call": "BartModel.from_pretrained('gogamza/kobart-base-v2')", "performance": {"dataset": "NSMC", "accuracy": 0.901}, "description": "KoBART is a Korean encoder-decoder language model trained on over 40GB of Korean text using the BART architecture. It can be used for feature extraction and has been trained on a variety of data sources, including Korean Wiki, news, books, and more.", "model_name": "gogamza/kobart-base-v2"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "facebook/timesformer-hr-finetuned-k600", "api_call": "TimesformerForVideoClassification.from_pretrained('facebook/timesformer-hr-finetuned-k600')", "performance": {"dataset": "Kinetics-600", "accuracy": "Not provided"}, "description": "TimeSformer model pre-trained on Kinetics-600. It was introduced in the paper TimeSformer: Is Space-Time Attention All You Need for Video Understanding? by Tong et al. and first released in this repository. The model can be used for video classification into one of the 600 possible Kinetics-600 labels.", "model_name": "facebook/timesformer-hr-finetuned-k600"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face", "functionality": "Zero-Shot Image Classification", "api_name": "laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft", "api_call": "pipeline('image-classification', model='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft')", "performance": {"dataset": "ImageNet-1k", "accuracy": "75.9-76.9%"}, "description": "A series of CLIP ConvNeXt-Large models trained on the LAION-2B (english) subset of LAION-5B using OpenCLIP. The models achieve between 75.9 and 76.9 top-1 zero-shot accuracy on ImageNet-1k.", "model_name": "laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "facebook/opt-13b", "api_call": "AutoModelForCausalLM.from_pretrained('facebook/opt-13b')", "performance": {"dataset": "GPT-3", "accuracy": "roughly match the performance and sizes of the GPT-3 class of models"}, "description": "OPT (Open Pre-trained Transformers) is a suite of decoder-only pre-trained transformers ranging from 125M to 175B parameters. The models are trained to match the performance and sizes of the GPT-3 class of models. The primary goal is to enable reproducible and responsible research at scale and to bring more voices to the table in studying the impact of large language models. OPT-13B is a 13-billion-parameter model trained predominantly with English text, but a small amount of non-English data is still present within the training corpus via CommonCrawl. The model was pretrained using a causal language modeling (CLM) objective.", "model_name": "facebook/opt-13b"}
{"domain": "Audio Text-to-Speech", "framework": "Fairseq", "functionality": "Text-to-Speech", "api_name": "facebook/tts_transformer-zh-cv7_css10", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/tts_transformer-zh-cv7_css10', arg_overrides={'vocoder': 'hifigan', 'fp16': False})", "performance": {"dataset": "common_voice", "accuracy": "Not provided"}, "description": "Transformer text-to-speech model from fairseq S^2. Simplified Chinese, Single-speaker female voice, Pre-trained on Common Voice v7, fine-tuned on CSS10.", "model_name": "facebook/tts_transformer-zh-cv7_css10"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Summarization", "api_name": "it5-base-news-summarization", "api_call": "pipeline('summarization', model='it5/it5-base-news-summarization')", "performance": {"dataset": "NewsSum-IT", "accuracy": {"Rouge1": 0.339, "Rouge2": 0.16, "RougeL": 0.263}}, "description": "IT5 Base model fine-tuned on news summarization on the Fanpage and Il Post corpora for Italian Language Understanding and Generation.", "model_name": "it5/it5-base-news-summarization"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "keremberke/yolov8s-csgo-player-detection", "api_call": "YOLO('keremberke/yolov8s-csgo-player-detection')", "performance": {"dataset": "csgo-object-detection", "accuracy": 0.886}, "description": "A YOLOv8 model for detecting Counter-Strike: Global Offensive (CS:GO) players. Supports the labels ['ct', 'cthead', 't', 'thead'].", "model_name": "keremberke/yolov8s-csgo-player-detection"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Inference", "api_name": "google/ncsnpp-ffhq-1024", "api_call": "DiffusionPipeline.from_pretrained('google/ncsnpp-ffhq-1024')", "performance": {"dataset": "CIFAR-10", "accuracy": {"Inception_score": 9.89, "FID": 2.2, "likelihood": 2.99}}, "description": "Score-Based Generative Modeling through Stochastic Differential Equations (SDE) for unconditional image generation. Achieves record-breaking performance on CIFAR-10 and demonstrates high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model.", "model_name": "google/ncsnpp-ffhq-1024"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "mgp-str", "api_call": "MgpstrForSceneTextRecognition.from_pretrained('alibaba-damo/mgp-str-base')", "performance": {"dataset": "MJSynth and SynthText", "accuracy": null}, "description": "MGP-STR is a pure vision Scene Text Recognition (STR) model, consisting of ViT and specially designed A^3 modules. It is trained on MJSynth and SynthText datasets and can be used for optical character recognition (OCR) on text images.", "model_name": "alibaba-damo/mgp-str-base"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "mazkooleg/0-9up-ast-ft", "api_call": "pipeline('audio-classification', model= 'MIT/ast-finetuned-speech-commands-v2')", "performance": {"dataset": "mazkooleg/0-9up_google_speech_commands_augmented_raw", "accuracy": 0.9979}, "description": "This model is a fine-tuned version of MIT/ast-finetuned-speech-commands-v2 on the None dataset. It achieves the following results on the evaluation set: Loss: 0.0210, Accuracy: 0.9979", "model_name": "MIT/ast-finetuned-speech-commands-v2"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Diffusers", "api_name": "WiNE-iNEFF/Minecraft-Skin-Diffusion-V2", "api_call": "DDPMPipeline.from_pretrained('WiNE-iNEFF/Minecraft-Skin-Diffusion-V2')", "performance": {"dataset": null, "accuracy": null}, "description": "An unconditional image generation model for generating Minecraft skin images using the diffusion model.", "model_name": "WiNE-iNEFF/Minecraft-Skin-Diffusion-V2"}
{"domain": "Tabular Tabular Classification", "framework": "Scikit-learn", "functionality": "Joblib", "api_name": "julien-c/skops-digits", "api_call": "load('path_to_folder/sklearn_model.joblib')", "performance": {"dataset": null, "accuracy": null}, "description": "A tabular classification model using the Scikit-learn framework and Joblib functionality. The model is trained with various hyperparameters and can be used for classification tasks.", "model_name": "path_to_folder/sklearn_model.joblib"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "keremberke/yolov8m-plane-detection", "api_call": "YOLO('keremberke/yolov8m-plane-detection')", "performance": {"dataset": "plane-detection", "accuracy": "0.995"}, "description": "A YOLOv8 model for plane detection trained on the keremberke/plane-detection dataset. The model is capable of detecting planes in images with high accuracy.", "model_name": "keremberke/yolov8m-plane-detection"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "openai-gpt", "api_call": "pipeline('text-generation', model='openai-gpt')", "performance": {"dataset": [{"name": "SNLI", "accuracy": 89.9}, {"name": "MNLI Matched", "accuracy": 82.1}, {"name": "MNLI Mismatched", "accuracy": 81.4}, {"name": "SciTail", "accuracy": 88.3}, {"name": "QNLI", "accuracy": 88.1}, {"name": "RTE", "accuracy": 56.0}, {"name": "STS-B", "accuracy": 82.0}, {"name": "QQP", "accuracy": 70.3}, {"name": "MPRC", "accuracy": 82.3}, {"name": "RACE", "accuracy": 59.0}, {"name": "ROCStories", "accuracy": 86.5}, {"name": "COPA", "accuracy": 78.6}, {"name": "SST-2", "accuracy": 91.3}, {"name": "CoLA", "accuracy": 45.4}, {"name": "GLUE", "accuracy": 72.8}]}, "description": "openai-gpt is a transformer-based language model created and released by OpenAI. The model is a causal (unidirectional) transformer pre-trained using language modeling on a large corpus with long-range dependencies.", "model_name": "openai-gpt"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "openai/clip-vit-large-patch14", "api_call": "CLIPModel.from_pretrained('openai/clip-vit-large-patch14')", "performance": {"dataset": ["Food101", "CIFAR10", "CIFAR100", "Birdsnap", "SUN397", "Stanford Cars", "FGVC Aircraft", "VOC2007", "DTD", "Oxford-IIIT Pet dataset", "Caltech101", "Flowers102", "MNIST", "SVHN", "IIIT5K", "Hateful Memes", "SST-2", "UCF101", "Kinetics700", "Country211", "CLEVR Counting", "KITTI Distance", "STL-10", "RareAct", "Flickr30", "MSCOCO", "ImageNet", "ImageNet-A", "ImageNet-R", "ImageNet Sketch", "ObjectNet (ImageNet Overlap)", "Youtube-BB", "ImageNet-Vid"], "accuracy": "varies depending on the dataset"}, "description": "The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner.", "model_name": "openai/clip-vit-large-patch14"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Age Classification", "api_name": "nateraw/vit-age-classifier", "api_call": "ViTForImageClassification.from_pretrained('nateraw/vit-age-classifier')", "performance": {"dataset": "fairface", "accuracy": null}, "description": "A vision transformer finetuned to classify the age of a given person's face.", "model_name": "nateraw/vit-age-classifier"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Automatic Speech Recognition and Speech Translation", "api_name": "openai/whisper-large", "api_call": "WhisperForConditionalGeneration.from_pretrained('openai/whisper-large')", "performance": {"dataset": [{"name": "LibriSpeech (clean)", "accuracy": 3.0}, {"name": "LibriSpeech (other)", "accuracy": 5.4}, {"name": "Common Voice 11.0", "accuracy": 54.8}]}, "description": "Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without the need for fine-tuning.", "model_name": "openai/whisper-large"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face", "functionality": "Visual Question Answering", "api_name": "JosephusCheung/GuanacoVQA", "api_call": "pipeline('visual-question-answering', model='GuanacoVQA').", "performance": {"dataset": "JosephusCheung/GuanacoVQADataset", "accuracy": "N/A"}, "description": "A multilingual Visual Question Answering model supporting English, Chinese, Japanese, and German languages. It requires the combined use of the Guanaco 7B LLM model and is based on the implementation of MiniGPT-4.", "model_name": "GuanacoVQA"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Image Variations", "api_name": "lambdalabs/sd-image-variations-diffusers", "api_call": "StableDiffusionImageVariationPipeline.from_pretrained('lambdalabs/sd-image-variations-diffusers', revision='v2.0')", "performance": {"dataset": "ChristophSchuhmann/improved_aesthetics_6plus", "accuracy": "N/A"}, "description": "This version of Stable Diffusion has been fine tuned from CompVis/stable-diffusion-v1-4-original to accept CLIP image embedding rather than text embeddings. This allows the creation of image variations similar to DALLE-2 using Stable Diffusion.", "model_name": "lambdalabs/sd-image-variations-diffusers"}
{"domain": "Natural Language Processing Token Classification", "framework": "Transformers", "functionality": "Token Classification", "api_name": "vblagoje/bert-english-uncased-finetuned-pos", "api_call": "AutoModelForTokenClassification.from_pretrained('vblagoje/bert-english-uncased-finetuned-pos')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "A BERT model fine-tuned for Part-of-Speech (POS) tagging in English text.", "model_name": "vblagoje/bert-english-uncased-finetuned-pos"}
{"domain": "Multimodal Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/dragon-plus-context-encoder", "api_call": "AutoModel.from_pretrained('facebook/dragon-plus-context-encoder')", "performance": {"dataset": "MS MARCO", "accuracy": 39.0}, "description": "DRAGON+ is a BERT-base sized dense retriever initialized from RetroMAE and further trained on the data augmented from MS MARCO corpus, following the approach described in How to Train Your DRAGON: Diverse Augmentation Towards Generalizable Dense Retrieval. The associated GitHub repository is available here https://github.com/facebookresearch/dpr-scale/tree/main/dragon. We use asymmetric dual encoder, with two distinctly parameterized encoders.", "model_name": "facebook/dragon-plus-context-encoder"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Paraphrase-based utterance augmentation", "api_name": "prithivida/parrot_fluency_model", "api_call": "pipeline('text-classification', model='prithivida/parrot_fluency_model')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "Parrot is a paraphrase-based utterance augmentation framework purpose-built to accelerate training NLU models. A paraphrase framework is more than just a paraphrasing model.", "model_name": "prithivida/parrot_fluency_model"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221121-113853", "api_call": "pipeline('depth-estimation', model='sayakpaul/glpn-nyu-finetuned-diode-221121-113853')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.33840000000000003, "Mae": 0.27390000000000003, "Rmse": 0.39590000000000003, "Abs Rel": 0.323, "Log Mae": 0.1148, "Log Rmse": 0.1651, "Delta1": 0.5576, "Delta2": 0.8345, "Delta3": 0.9398000000000001}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221121-113853"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Conversational", "api_name": "DialoGPT-medium-PALPATINE2", "api_call": "pipeline('text-generation', model='Filosofas/DialoGPT-medium-PALPATINE2')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "A DialoGPT model trained for generating human-like conversational responses.", "model_name": "Filosofas/DialoGPT-medium-PALPATINE2"}
{"domain": "Multimodal Text-to-Video", "framework": "Hugging Face", "functionality": "Text-to-Video", "api_name": "ImRma/Brucelee", "api_call": "pipeline('text-to-video', model='ImRma/Brucelee')", "performance": {"dataset": "", "accuracy": ""}, "description": "A Hugging Face model for converting Persian and English text into video.", "model_name": "ImRma/Brucelee"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/xclip-base-patch32", "api_call": "XClipModel.from_pretrained('microsoft/xclip-base-patch32')", "performance": {"dataset": "Kinetics 400", "accuracy": {"top-1": 80.4, "top-5": 95.0}}, "description": "X-CLIP is a minimal extension of CLIP for general video-language understanding. The model is trained in a contrastive way on (video, text) pairs. This allows the model to be used for tasks like zero-shot, few-shot or fully supervised video classification and video-text retrieval.", "model_name": "microsoft/xclip-base-patch32"}
{"domain": "Reinforcement Learning Robotics", "framework": "Hugging Face", "functionality": "6D grasping", "api_name": "camusean/grasp_diffusion", "api_call": "AutoModel.from_pretrained('camusean/grasp_diffusion')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "Trained Models for Grasp SE(3) DiffusionFields. Check SE(3)-DiffusionFields: Learning smooth cost functions for joint grasp and motion optimization through diffusion for additional details.", "model_name": "camusean/grasp_diffusion"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "gsdf/Counterfeit-V2.5", "api_call": "pipeline('text-to-image', model='gsdf/Counterfeit-V2.5')", "performance": {"dataset": "EasyNegative", "accuracy": "Not provided"}, "description": "Counterfeit-V2.5 is a text-to-image model that generates anime-style images based on text prompts. It has been updated for ease of use and can be used with negative prompts to create high-quality images.", "model_name": "gsdf/Counterfeit-V2.5"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "dreamlike-art/dreamlike-anime-1.0", "api_call": "StableDiffusionPipeline.from_pretrained('dreamlike-art/dreamlike-anime-1.0', torch_dtype=torch.float16)(prompt, negative_prompt=negative_prompt)", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "Dreamlike Anime 1.0 is a high quality anime model, made by dreamlike.art. It can be used to generate anime-style images based on text prompts. The model is trained on 768x768px images and works best with prompts that include 'photo anime, masterpiece, high quality, absurdres'. It can be used with the Stable Diffusion Pipeline from the diffusers library.", "model_name": "dreamlike-art/dreamlike-anime-1.0"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset", "api_call": "AutoModelForVideoClassification.from_pretrained('sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset')", "performance": {"dataset": "unknown", "accuracy": 1.0}, "description": "This model is a fine-tuned version of MCG-NJU/videomae-base-finetuned-kinetics on an unknown dataset.", "model_name": "sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Transformers", "functionality": "Conversational", "api_name": "facebook/blenderbot-400M-distill", "api_call": "BlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')", "performance": {"dataset": "blended_skill_talk", "accuracy": "Not specified"}, "description": "BlenderBot-400M-distill is a distilled version of the BlenderBot model, trained on the Blended Skill Talk dataset. It is designed for open-domain chatbot tasks and can generate text-to-text responses in a conversational manner. The model is based on the Transformers library and can be used with PyTorch, TensorFlow, and JAX.", "model_name": "facebook/blenderbot-400M-distill"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Image-to-Text", "api_name": "ydshieh/vit-gpt2-coco-en", "api_call": "VisionEncoderDecoderModel.from_pretrained('ydshieh/vit-gpt2-coco-en')", "performance": {"dataset": "COCO", "accuracy": "Not specified"}, "description": "A proof-of-concept model for the Hugging Face FlaxVisionEncoderDecoder Framework that produces reasonable image captioning results.", "model_name": "ydshieh/vit-gpt2-coco-en"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "dreamlike-art/dreamlike-photoreal-2.0", "api_call": "StableDiffusionPipeline.from_pretrained('dreamlike-art/dreamlike-photoreal-2.0', torch_dtype=torch.float16)(prompt).images[0]", "performance": {"dataset": "Stable Diffusion 1.5", "accuracy": "Not specified"}, "description": "Dreamlike Photoreal 2.0 is a photorealistic model based on Stable Diffusion 1.5, made by dreamlike.art. It can be used to generate photorealistic images from text prompts.", "model_name": "dreamlike-art/dreamlike-photoreal-2.0"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "Lykon/DreamShaper", "api_call": "pipeline('text-to-image', model=Lykon/DreamShaper)", "performance": {"dataset": "", "accuracy": ""}, "description": "Dream Shaper is a text-to-image model that generates artistic images based on the given input text. Read more about this model here: https://civitai.com/models/4384/dreamshaper", "model_name": "Lykon/DreamShaper"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "bert-large-uncased-whole-word-masking-squad2", "api_call": "pipeline('question-answering', model=AutoModel.from_pretrained('deepset/bert-large-uncased-whole-word-masking-squad2'), tokenizer=AutoTokenizer.from_pretrained('deepset/bert-large-uncased-whole-word-masking-squad2'))", "performance": {"dataset": "squad_v2", "accuracy": {"Exact Match": 80.885, "F1": 83.876}}, "description": "This is a bert-large model, fine-tuned using the SQuAD2.0 dataset for the task of question answering. It is designed for extractive question answering and supports English language.", "model_name": "deepset/bert-large-uncased-whole-word-masking-squad2"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Asteroid", "api_name": "DCUNet_Libri1Mix_enhsingle_16k", "api_call": "BaseModel.from_pretrained('JorisCos/DCUNet_Libri1Mix_enhsingle_16k')", "performance": {"dataset": "Libri1Mix", "accuracy": {"si_sdr": 13.1540353916, "si_sdr_imp": 9.7042540858, "sdr": 13.5680588731, "sdr_imp": 10.0653960739, "sar": 13.5680588731, "sar_imp": 10.0653960739, "stoi": 0.9199373340000001, "stoi_imp": 0.12401751050000001}}, "description": "This model was trained by Joris Cosentino using the librimix recipe in Asteroid. It was trained on the enh_single task of the Libri1Mix dataset.", "model_name": "JorisCos/DCUNet_Libri1Mix_enhsingle_16k"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Summarization", "api_name": "lidiya/bart-large-xsum-samsum", "api_call": "pipeline('summarization', model='lidiya/bart-large-xsum-samsum')", "performance": {"dataset": "SAMSum Corpus: A Human-annotated Dialogue Dataset for Abstractive Summarization", "accuracy": {"rouge1": 53.306, "rouge2": 28.355, "rougeL": 44.095}}, "description": "This model was obtained by fine-tuning facebook/bart-large-xsum on Samsum dataset.", "model_name": "lidiya/bart-large-xsum-samsum"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "videomae-base-short-ssv2", "api_call": "VideoMAEForPreTraining.from_pretrained('MCG-NJU/videomae-base-short-ssv2')", "performance": {"dataset": "Something-Something-v2", "accuracy": "N/A"}, "description": "VideoMAE is an extension of Masked Autoencoders (MAE) to video. The architecture of the model is very similar to that of a standard Vision Transformer (ViT), with a decoder on top for predicting pixel values for masked patches. Videos are presented to the model as a sequence of fixed-size patches (resolution 16x16), which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds fixed sinus/cosinus position embeddings before feeding the sequence to the layers of the Transformer encoder. By pre-training the model, it learns an inner representation of videos that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled videos for instance, you can train a standard classifier by placing a linear layer on top of the pre-trained encoder. One typically places a linear layer on top of the [CLS] token, as the last hidden state of this token can be seen as a representation of an entire video.", "model_name": "MCG-NJU/videomae-base-short-ssv2"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Speech to Text", "api_name": "facebook/s2t-medium-librispeech-asr", "api_call": "Speech2TextForConditionalGeneration.from_pretrained('facebook/s2t-medium-librispeech-asr')", "performance": {"dataset": "LibriSpeech", "accuracy": {"clean": 3.5, "other": 7.8}}, "description": "s2t-medium-librispeech-asr is a Speech to Text Transformer (S2T) model trained for automatic speech recognition (ASR). The S2T model was proposed in this paper and released in this repository.", "model_name": "facebook/s2t-medium-librispeech-asr"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "kykim/bertshared-kor-base", "api_call": "EncoderDecoderModel.from_pretrained('kykim/bertshared-kor-base')", "performance": {"dataset": "70GB Korean text dataset", "accuracy": "42000 lower-cased subwords"}, "description": "Bert base model for Korean, trained on a 70GB Korean text dataset and 42000 lower-cased subwords. Can be used for Text2Text Generation tasks.", "model_name": "kykim/bertshared-kor-base"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "deepset/deberta-v3-large-squad2", "api_call": "AutoModelForQuestionAnswering.from_pretrained('deepset/deberta-v3-large-squad2')", "performance": {"dataset": "squad_v2", "accuracy": {"exact": 87.6105449339, "f1": 90.7530700887}}, "description": "This is the deberta-v3-large model, fine-tuned using the SQuAD2.0 dataset. It's been trained on question-answer pairs, including unanswerable questions, for the task of Question Answering.", "model_name": "deepset/deberta-v3-large-squad2"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Document Question Answering", "api_name": "layoutlmv2-base-uncased_finetuned_docvqa", "api_call": "pipeline('question-answering', model='Sayantan1993/layoutlmv2-base-uncased_finetuned_docvqa')", "performance": {"dataset": "", "accuracy": ""}, "description": "A model for document question answering, fine-tuned on the DocVQA dataset using LayoutLMv2-base-uncased.", "model_name": "Sayantan1993/layoutlmv2-base-uncased_finetuned_docvqa"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "google/ncsnpp-church-256", "api_call": "DiffusionPipeline.from_pretrained('google/ncsnpp-church-256')", "performance": {"dataset": "CIFAR-10", "accuracy": {"Inception_score": 9.89, "FID": 2.2, "likelihood": 2.99}}, "description": "Score-Based Generative Modeling through Stochastic Differential Equations (SDE) for unconditional image generation. This model achieves record-breaking performance on CIFAR-10 and can generate high fidelity images of size 1024 x 1024.", "model_name": "google/ncsnpp-church-256"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "TehVenom/PPO_Pygway-V8p4_Dev-6b", "api_call": "pipeline('text-generation', model='TehVenom/PPO_Pygway-V8p4_Dev-6b')", "performance": {"dataset": "", "accuracy": ""}, "description": "TODO card. Mix of (GPT-J-6B-Janeway + PPO_HH_GPT-J) + Pygmalion-6b-DEV (V8 / Part 4). At a ratio of GPT-J-6B-Janeway - 20%, PPO_HH_GPT-J - 20%, Pygmalion-6b DEV (V8 / Part 4) - 60%.", "model_name": "TehVenom/PPO_Pygway-V8p4_Dev-6b"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Transformers", "functionality": "Transcription", "api_name": "facebook/wav2vec2-base-960h", "api_call": "Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h')", "performance": {"dataset": "LibriSpeech", "accuracy": {"clean": 3.4, "other": 8.6}}, "description": "Facebook's Wav2Vec2 base model pretrained and fine-tuned on 960 hours of Librispeech on 16kHz sampled speech audio. It is designed for automatic speech recognition and can transcribe audio files.", "model_name": "facebook/wav2vec2-base-960h"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "typeform/squeezebert-mnli", "api_call": "AutoModel.from_pretrained('typeform/squeezebert-mnli')", "performance": {"dataset": "mulit_nli", "accuracy": "not provided"}, "description": "SqueezeBERT is a transformer model designed for efficient inference on edge devices. This specific model, typeform/squeezebert-mnli, is fine-tuned on the MultiNLI dataset for zero-shot classification tasks.", "model_name": "typeform/squeezebert-mnli"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Text Classification", "api_name": "shahrukhx01/question-vs-statement-classifier", "api_call": "AutoModelForSequenceClassification.from_pretrained('shahrukhx01/question-vs-statement-classifier')", "performance": {"dataset": "Haystack", "accuracy": "Not provided"}, "description": "Trained to add the feature for classifying queries between Question Query vs Statement Query using classification in Haystack", "model_name": "shahrukhx01/question-vs-statement-classifier"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "bigscience/bloomz-560m", "api_call": "AutoModelForCausalLM.from_pretrained('bigscience/bloomz-560m')", "performance": {"dataset": "bigscience/xP3", "accuracy": {"Winogrande XL (xl) validation set": 52.41, "XWinograd (en) test set": 51.01, "XWinograd (fr) test set": 51.81, "XWinograd (jp) test set": 52.03, "XWinograd (pt) test set": 53.99, "XWinograd (ru) test set": 53.97, "XWinograd (zh) test set": 54.76, "ANLI (r1) validation set": 33.4, "ANLI (r2) validation set": 33.4, "ANLI (r3) validation set": 33.5}}, "description": "BLOOMZ & mT0 are a family of models capable of following human instructions in dozens of languages zero-shot. Finetuned on the crosslingual task mixture (xP3), these models can generalize to unseen tasks & languages. Useful for tasks expressed in natural language, such as translation, summarization, and question answering.", "model_name": "bigscience/bloomz-560m"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "swin2SR-classical-sr-x4-64", "api_call": "pipeline('image-super-resolution', model='caidas/swin2SR-classical-sr-x4-64')", "performance": {"dataset": "", "accuracy": ""}, "description": "Swin2SR model that upscales images x4. It was introduced in the paper Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration by Conde et al. and first released in this repository. This model is intended for image super resolution.", "model_name": "caidas/swin2SR-classical-sr-x4-64"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "ocariz/universe_1400", "api_call": "DDPMPipeline.from_pretrained('ocariz/universe_1400')", "performance": {"dataset": "", "accuracy": ""}, "description": "This model is a diffusion model for unconditional image generation of the universe trained for 1400 epochs.", "model_name": "ocariz/universe_1400"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "DataIntelligenceTeam/eurocorpV4", "api_call": "AutoModelForTokenClassification.from_pretrained('DataIntelligenceTeam/eurocorpV4')", "performance": {"dataset": "sroie", "accuracy": 0.982}, "description": "This model is a fine-tuned version of microsoft/layoutlmv3-large on the sroie dataset. It achieves the following results on the evaluation set: Loss: 0.1239, Precision: 0.9548, Recall: 0.9602, F1: 0.9575, Accuracy: 0.9819", "model_name": "DataIntelligenceTeam/eurocorpV4"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "dsba-lab/koreapas-finetuned-korwikitq", "api_call": "pipeline('table-question-answering', model='dsba-lab/koreapas-finetuned-korwikitq')", "performance": {"dataset": "korwikitq", "accuracy": null}, "description": "A Korean Table Question Answering model finetuned on the korwikitq dataset.", "model_name": "dsba-lab/koreapas-finetuned-korwikitq"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Audio Classification", "api_name": "distil-ast-audioset", "api_call": "AutoModelForSequenceClassification.from_pretrained('bookbot/distil-ast-audioset')", "performance": {"dataset": "AudioSet", "accuracy": 0.0714}, "description": "Distil Audio Spectrogram Transformer AudioSet is an audio classification model based on the Audio Spectrogram Transformer architecture. This model is a distilled version of MIT/ast-finetuned-audioset-10-10-0.4593 on the AudioSet dataset.", "model_name": "bookbot/distil-ast-audioset"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "decapoda-research/llama-13b-hf", "api_call": "pipeline('text-generation', model='decapoda-research/llama-13b-hf')", "performance": {"dataset": [{"name": "BoolQ", "accuracy": "85.3"}, {"name": "PIQA", "accuracy": "82.8"}, {"name": "SIQA", "accuracy": "52.3"}, {"name": "HellaSwag", "accuracy": "84.2"}, {"name": "WinoGrande", "accuracy": "77"}, {"name": "ARC-e", "accuracy": "81.5"}, {"name": "ARC-c", "accuracy": "56"}, {"name": "OBQACOPA", "accuracy": "60.2"}]}, "description": "LLaMA-13B is an auto-regressive language model based on the transformer architecture developed by the FAIR team of Meta AI. It is designed for research purposes, such as question answering, natural language understanding, and reading comprehension. The model has been trained on a variety of sources, including web data, GitHub, Wikipedia, and books in 20 languages. It has been evaluated on several benchmarks, including BoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC, and OpenBookQA.", "model_name": "decapoda-research/llama-13b-hf"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Semantic Segmentation", "api_name": "nvidia/segformer-b5-finetuned-cityscapes-1024-1024", "api_call": "SegformerForSemanticSegmentation.from_pretrained('nvidia/segformer-b5-finetuned-cityscapes-1024-1024')", "performance": {"dataset": "CityScapes", "accuracy": "Not provided"}, "description": "SegFormer model fine-tuned on CityScapes at resolution 1024x1024. It was introduced in the paper SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers by Xie et al. and first released in this repository.", "model_name": "nvidia/segformer-b5-finetuned-cityscapes-1024-1024"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Denoising Diffusion Probabilistic Models (DDPM)", "api_name": "google/ddpm-ema-celebahq-256", "api_call": "DDPMPipeline.from_pretrained('google/ddpm-ema-celebahq-256')", "performance": {"dataset": {"CIFAR10": {"Inception_score": 9.46, "FID_score": 3.17}, "LSUN": {"sample_quality": "similar to ProgressiveGAN"}}}, "description": "High quality image synthesis using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics.", "model_name": "google/ddpm-ema-celebahq-256"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "openmmlab/upernet-convnext-small", "api_call": "UperNetModel.from_pretrained('openmmlab/upernet-convnext-small')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "UperNet framework for semantic segmentation, leveraging a ConvNeXt backbone. UperNet was introduced in the paper Unified Perceptual Parsing for Scene Understanding by Xiao et al. Combining UperNet with a ConvNeXt backbone was introduced in the paper A ConvNet for the 2020s.", "model_name": "openmmlab/upernet-convnext-small"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Entity Extraction", "api_name": "903929564", "api_call": "AutoModelForTokenClassification.from_pretrained('ismail-lucifer011/autotrain-job_all-903929564', use_auth_token=True)", "performance": {"dataset": "ismail-lucifer011/autotrain-data-job_all", "accuracy": 0.9989412010000001}, "description": "A Token Classification model trained using AutoTrain for Entity Extraction. The model is based on distilbert and achieves high accuracy, precision, recall, and F1 score.", "model_name": "ismail-lucifer011/autotrain-job_all-903929564"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "Zixtrauce/BDBot4Epoch", "api_call": "pipeline('text-generation', model='Zixtrauce/BDBot4Epoch')", "performance": {"dataset": "unknown", "accuracy": "unknown"}, "description": "BrandonBot4Epochs is a conversational model trained on the GPT-2 architecture for text generation. It can be used to generate responses in a chatbot-like interface.", "model_name": "Zixtrauce/BDBot4Epoch"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221215-095508", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-221215-095508')", "performance": {"dataset": "DIODE", "accuracy": null}, "description": "A depth estimation model fine-tuned on the DIODE dataset using the GLPN model architecture.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221215-095508"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "tiny-random-CLIPSegModel", "api_call": "pipeline('zero-shot-image-classification', model='hf-tiny-model-private/tiny-random-CLIPSegModel')", "performance": {"dataset": "", "accuracy": ""}, "description": "A tiny random CLIPSegModel for zero-shot image classification.", "model_name": "hf-tiny-model-private/tiny-random-CLIPSegModel"}
{"domain": "Reinforcement Learning", "framework": "ML-Agents", "functionality": "SoccerTwos", "api_name": "0xid/poca-SoccerTwos", "api_call": "mlagents-load-from-hf --repo-id='0xid/poca-SoccerTwos' --local-dir='./downloads'", "performance": {"dataset": "SoccerTwos", "accuracy": "N/A"}, "description": "A trained model of a poca agent playing SoccerTwos using the Unity ML-Agents Library.", "model_name": "0xid/poca-SoccerTwos"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Sentiment Classification", "api_name": "hackathon-pln-es/wav2vec2-base-finetuned-sentiment-classification-MESD", "api_call": "Wav2Vec2ForSequenceClassification.from_pretrained('hackathon-pln-es/wav2vec2-base-finetuned-sentiment-classification-MESD')", "performance": {"dataset": "MESD", "accuracy": 0.9308000000000001}, "description": "This model is a fine-tuned version of facebook/wav2vec2-base on the MESD dataset. It is trained to classify underlying sentiment of Spanish audio/speech.", "model_name": "hackathon-pln-es/wav2vec2-base-finetuned-sentiment-classification-MESD"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Conversational", "api_name": "facebook/blenderbot-3B", "api_call": "BlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-3B')", "performance": {"dataset": "blended_skill_talk", "accuracy": "Not provided"}, "description": "BlenderBot-3B is a large-scale neural model designed for open-domain chatbot applications. It is trained on the blended_skill_talk dataset and can engage in multi-turn conversations, providing engaging talking points, asking and answering questions, and displaying knowledge, empathy, and personality. The model is available through the Hugging Face Transformers library.", "model_name": "facebook/blenderbot-3B"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Image Captioning", "api_name": "blip-image-captioning-base", "api_call": "BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base')", "performance": {"dataset": "COCO", "accuracy": {"CIDEr": "+2.8%"}}, "description": "BLIP (Bootstrapping Language-Image Pre-training) is a new vision-language pre-training (VLP) framework that transfers flexibly to both vision-language understanding and generation tasks. It effectively utilizes noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. This model is pre-trained on the COCO dataset with a base architecture (ViT base backbone).", "model_name": "Salesforce/blip-image-captioning-base"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Named Entity Recognition", "api_name": "flair/ner-english-large", "api_call": "SequenceTagger.load('flair/ner-english-large')", "performance": {"dataset": "conll2003", "accuracy": "94.36"}, "description": "This is the large 4-class NER model for English that ships with Flair. It predicts 4 tags: PER (person name), LOC (location name), ORG (organization name), and MISC (other name). The model is based on document-level XLM-R embeddings and FLERT.", "model_name": "flair/ner-english-large"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "License Plate Detection", "api_name": "keremberke/yolov5m-license-plate", "api_call": "yolov5.load('keremberke/yolov5m-license-plate')", "performance": {"dataset": "keremberke/license-plate-object-detection", "accuracy": 0.988}, "description": "A YOLOv5 model for license plate detection trained on a custom dataset. The model can detect license plates in images with high accuracy.", "model_name": "keremberke/yolov5m-license-plate"}
{"domain": "Audio Voice Activity Detection", "framework": "Hugging Face Transformers", "functionality": "Voice Activity Detection, Overlapped Speech Detection, Resegmentation", "api_name": "pyannote/segmentation", "api_call": "VoiceActivityDetection(segmentation='pyannote/segmentation')", "performance": {"dataset": {"ami": {"accuracy": {"onset": 0.684, "offset": 0.577, "min_duration_on": 0.181, "min_duration_off": 0.037}}, "dihard": {"accuracy": {"onset": 0.767, "offset": 0.377, "min_duration_on": 0.136, "min_duration_off": 0.067}}, "voxconverse": {"accuracy": {"onset": 0.767, "offset": 0.713, "min_duration_on": 0.182, "min_duration_off": 0.501}}}}, "description": "Model from End-to-end speaker segmentation for overlap-aware resegmentation, by Herv\u00e9 Bredin and Antoine Laurent. It provides voice activity detection, overlapped speech detection, and resegmentation functionalities.", "model_name": "pyannote/segmentation"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Document Question Answering", "api_name": "tiennvcs/layoutlmv2-large-uncased-finetuned-infovqa", "api_call": "AutoModelForDocumentQuestionAnswering.from_pretrained('tiennvcs/layoutlmv2-large-uncased-finetuned-infovqa')", "performance": {"dataset": "unknown", "accuracy": {"Loss": 2.2207}}, "description": "This model is a fine-tuned version of microsoft/layoutlmv2-large-uncased on an unknown dataset.", "model_name": "tiennvcs/layoutlmv2-large-uncased-finetuned-infovqa"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "text2text-generation", "api_name": "google/pegasus-newsroom", "api_call": "pipeline('summarization', model='google/pegasus-newsroom')", "performance": {"dataset": "newsroom", "accuracy": "45.98/34.20/42.18"}, "description": "PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization by Jingqing Zhang, Yao Zhao, Mohammad Saleh, and Peter J. Liu. The model is trained on both C4 and HugeNews datasets and is designed for summarization tasks.", "model_name": "google/pegasus-newsroom"}
{"domain": "Multimodal Graph Machine Learning", "framework": "Hugging Face Transformers", "functionality": "Graph Classification", "api_name": "graphormer-base-pcqm4mv2", "api_call": "AutoModel.from_pretrained('clefourrier/graphormer-base-pcqm4mv2')", "performance": {"dataset": "PCQM4M-LSCv2", "accuracy": "Not provided"}, "description": "The Graphormer is a graph Transformer model, pretrained on PCQM4M-LSCv2. Developed by Microsoft, it is designed for graph classification tasks or graph representation tasks, such as molecule modeling.", "model_name": "clefourrier/graphormer-base-pcqm4mv2"}
{"domain": "Tabular Tabular Regression", "framework": "Scikit-learn", "functionality": "baseline-trainer", "api_name": "merve/tips5wx_sbh5-tip-regression", "api_call": "joblib.load(hf_hub_download('merve/tips5wx_sbh5-tip-regression', 'sklearn_model.joblib'))", "performance": {"dataset": "tips5wx_sbh5", "r2": 0.38936299999999996, "neg_mean_squared_error": -1.092356}, "description": "Baseline Model trained on tips5wx_sbh5 to apply regression on tip", "model_name": "merve/tips5wx_sbh5-tip-regression"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "PyTorch Transformers", "functionality": "Table Question Answering", "api_name": "microsoft/tapex-large-finetuned-wikisql", "api_call": "BartForConditionalGeneration.from_pretrained('microsoft/tapex-large-finetuned-wikisql')", "performance": {"dataset": "wikisql", "accuracy": "N/A"}, "description": "TAPEX (Table Pre-training via Execution) is a conceptually simple and empirically powerful pre-training approach to empower existing models with table reasoning skills. TAPEX realizes table pre-training by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically synthesizing executable SQL queries. TAPEX is based on the BART architecture, the transformer encoder-decoder (seq2seq) model with a bidirectional (BERT-like) encoder and an autoregressive (GPT-like) decoder. This model is the tapex-large model fine-tuned on the WikiSQL dataset.", "model_name": "microsoft/tapex-large-finetuned-wikisql"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Depth Estimation", "api_name": "glpn-nyu-finetuned-diode-221116-104421", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-221116-104421')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.37360000000000004, "Mae": 0.3079, "Rmse": 0.43210000000000004, "Abs Rel": 0.36660000000000004, "Log Mae": 0.1288, "Log Rmse": 0.1794, "Delta1": 0.4929, "Delta2": 0.7934, "Delta3": 0.9234}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221116-104421"}
{"domain": "Reinforcement Learning", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "edbeeching/decision-transformer-gym-hopper-expert", "api_call": "AutoModel.from_pretrained('edbeeching/decision-transformer-gym-hopper-expert')", "performance": {"dataset": "Gym Hopper environment", "accuracy": "Not provided"}, "description": "Decision Transformer model trained on expert trajectories sampled from the Gym Hopper environment", "model_name": "edbeeching/decision-transformer-gym-hopper-expert"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Automatic Speech Recognition", "api_name": "cpierse/wav2vec2-large-xlsr-53-esperanto", "api_call": "Wav2Vec2ForCTC.from_pretrained('cpierse/wav2vec2-large-xlsr-53-esperanto')", "performance": {"dataset": "common_voice", "accuracy": "12.31%"}, "description": "Fine-tuned facebook/wav2vec2-large-xlsr-53 on esperanto using the Common Voice dataset. When using this model, make sure that your speech input is sampled at 16kHz.", "model_name": "cpierse/wav2vec2-large-xlsr-53-esperanto"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "martinezomg/vit-base-patch16-224-diabetic-retinopathy", "api_call": "pipeline('image-classification', 'martinezomg/vit-base-patch16-224-diabetic-retinopathy')", "performance": {"dataset": "None", "accuracy": 0.7744000000000001}, "description": "This model is a fine-tuned version of google/vit-base-patch16-224 on an unspecified dataset. It is designed for image classification tasks, specifically for diabetic retinopathy detection.", "model_name": "martinezomg/vit-base-patch16-224-diabetic-retinopathy"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "PyTorch Transformers", "functionality": "Table-based QA", "api_name": "neulab/omnitab-large-1024shot", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('neulab/omnitab-large-1024shot')", "performance": {"dataset": "wikitablequestions", "accuracy": "Not provided"}, "description": "OmniTab is a table-based QA model proposed in OmniTab: Pretraining with Natural and Synthetic Data for Few-shot Table-based Question Answering. neulab/omnitab-large-1024shot (based on BART architecture) is initialized with microsoft/tapex-large and continuously pretrained on natural and synthetic data (SQL2NL model trained in the 1024-shot setting).", "model_name": "neulab/omnitab-large-1024shot"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Embeddings", "api_name": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", "api_call": "SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Automated evaluation"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Transformers", "functionality": "Cross-Encoder for Natural Language Inference", "api_name": "cross-encoder/nli-deberta-v3-base", "api_call": "CrossEncoder('cross-encoder/nli-deberta-v3-base')", "performance": {"dataset": {"SNLI-test": "92.38", "MNLI mismatched set": "90.04"}}, "description": "This model is based on microsoft/deberta-v3-base and was trained on the SNLI and MultiNLI datasets. For a given sentence pair, it will output three scores corresponding to the labels: contradiction, entailment, neutral.", "model_name": "cross-encoder/nli-deberta-v3-base"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face", "functionality": "Zero-Shot Image Classification", "api_name": "laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind", "api_call": "CLIPModel.from_pretrained('laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind')", "performance": {"dataset": "ImageNet-1k", "accuracy": "79.1 - 79.4"}, "description": "A series of CLIP ConvNeXt-XXLarge models trained on LAION-2B (English), a subset of LAION-5B, using OpenCLIP. These models achieve between 79.1 and 79.4 top-1 zero-shot accuracy on ImageNet-1k. The models can be used for zero-shot image classification, image and text retrieval, and other related tasks.", "model_name": "laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "valhalla/t5-base-e2e-qg", "api_call": "pipeline('e2e-qg', model='valhalla/t5-base-e2e-qg')", "performance": {"dataset": "squad", "accuracy": "N/A"}, "description": "This is a T5-base model trained for end-to-end question generation task. Simply input the text and the model will generate multiple questions. You can play with the model using the inference API, just put the text and see the results!", "model_name": "valhalla/t5-base-e2e-qg"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "opus-mt-fr-es", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-fr-es')", "performance": {"dataset": "opus", "accuracy": {"BLEU": {"newssyscomb2009.fr.es": 34.3, "news-test2008.fr.es": 32.5, "newstest2009.fr.es": 31.6, "newstest2010.fr.es": 36.5, "newstest2011.fr.es": 38.3, "newstest2012.fr.es": 38.1, "newstest2013.fr.es": 34.0, "Tatoeba.fr.es": 53.2}, "chr-F": {"newssyscomb2009.fr.es": 0.601, "news-test2008.fr.es": 0.583, "newstest2009.fr.es": 0.586, "newstest2010.fr.es": 0.616, "newstest2011.fr.es": 0.622, "newstest2012.fr.es": 0.619, "newstest2013.fr.es": 0.587, "Tatoeba.fr.es": 0.709}}}, "description": "A French to Spanish translation model trained on the OPUS dataset using the Hugging Face Transformers library. The model is based on the transformer-align architecture and uses normalization and SentencePiece for pre-processing.", "model_name": "Helsinki-NLP/opus-mt-fr-es"}
{"domain": "Natural Language Processing Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Document-level embeddings of research papers", "api_name": "malteos/scincl", "api_call": "AutoModel.from_pretrained('malteos/scincl')", "performance": {"dataset": "SciDocs", "accuracy": {"mag-f1": 81.2, "mesh-f1": 89.0, "co-view-map": 85.3, "co-view-ndcg": 92.2, "co-read-map": 87.7, "co-read-ndcg": 94.0, "cite-map": 93.6, "cite-ndcg": 97.4, "cocite-map": 91.7, "cocite-ndcg": 96.5, "recomm-ndcg": 54.3, "recomm-P@1": 19.6}}, "description": "SciNCL is a pre-trained BERT language model to generate document-level embeddings of research papers. It uses the citation graph neighborhood to generate samples for contrastive learning. Prior to the contrastive training, the model is initialized with weights from scibert-scivocab-uncased. The underlying citation embeddings are trained on the S2ORC citation graph.", "model_name": "malteos/scincl"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Diffusers", "api_name": "pravsels/ddpm-ffhq-vintage-finetuned-vintage-3epochs", "api_call": "DDPMPipeline.from_pretrained('pravsels/ddpm-ffhq-vintage-finetuned-vintage-3epochs')", "performance": {"dataset": "", "accuracy": ""}, "description": "Example Fine-Tuned Model for Unit 2 of the Diffusion Models Class", "model_name": "pravsels/ddpm-ffhq-vintage-finetuned-vintage-3epochs"}
{"domain": "Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "Rajaram1996/Hubert_emotion", "api_call": "HubertForSpeechClassification.from_pretrained('Rajaram1996/Hubert_emotion')", "performance": {"dataset": "unknown", "accuracy": "unknown"}, "description": "A pretrained model for predicting emotion in local audio files using Hubert.", "model_name": "Rajaram1996/Hubert_emotion"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-small-finetuned-sqa", "api_call": "pipeline('table-question-answering', model='google/tapas-small-finetuned-sqa')", "performance": {"dataset": "msr_sqa", "accuracy": 0.6155}, "description": "TAPAS small model fine-tuned on Sequential Question Answering (SQA). It uses relative position embeddings (i.e. resetting the position index at every cell of the table).", "model_name": "google/tapas-small-finetuned-sqa"}
{"domain": "Audio Automatic Speech Recognition", "framework": "PyTorch Transformers", "functionality": "Automatic Speech Recognition", "api_name": "ravirajoshi/wav2vec2-large-xls-r-300m-marathi", "api_call": "Wav2Vec2ForCTC.from_pretrained('ravirajoshi/wav2vec2-large-xls-r-300m-marathi')", "performance": {"dataset": "None", "accuracy": {"Loss": 0.5656, "Wer": 0.2156}}, "description": "This model is a fine-tuned version of facebook/wav2vec2-xls-r-300m on an unspecified dataset. It is designed for Automatic Speech Recognition in Marathi language.", "model_name": "ravirajoshi/wav2vec2-large-xls-r-300m-marathi"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Information Retrieval", "api_name": "cross-encoder/ms-marco-TinyBERT-L-2-v2", "api_call": "AutoModelForSequenceClassification.from_pretrained('cross-encoder/ms-marco-TinyBERT-L-2-v2')", "performance": {"dataset": "TREC Deep Learning 2019", "accuracy": "69.84 (NDCG@10)"}, "description": "This model was trained on the MS Marco Passage Ranking task. It can be used for Information Retrieval: Given a query, encode the query with all possible passages (e.g. retrieved with ElasticSearch). Then sort the passages in a decreasing order. The training code is available here: SBERT.net Training MS Marco.", "model_name": "cross-encoder/ms-marco-TinyBERT-L-2-v2"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification, Feature Map Extraction, Image Embeddings", "api_name": "convnext_base.fb_in1k", "api_call": "timm.create_model('convnext_base.fb_in1k', pretrained=True)", "performance": {"dataset": "imagenet-1k", "accuracy": "83.82%"}, "description": "A ConvNeXt image classification model pretrained on ImageNet-1k by paper authors. It can be used for image classification, feature map extraction, and image embeddings.", "model_name": "timm/convnext_base.fb_in1k"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image Generation", "api_name": "stabilityai/stable-diffusion-2-1", "api_call": "StableDiffusionPipeline.from_pretrained('stabilityai/stable-diffusion-2-1', torch_dtype=torch.float16)", "performance": {"dataset": "COCO2017", "accuracy": "Not optimized for FID scores"}, "description": "Stable Diffusion v2-1 is a diffusion-based text-to-image generation model developed by Robin Rombach and Patrick Esser. It is capable of generating and modifying images based on text prompts in English. The model is trained on a subset of the LAION-5B dataset and is primarily intended for research purposes.", "model_name": "stabilityai/stable-diffusion-2-1"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-mini-finetuned-wtq", "api_call": "AutoModelForTableQuestionAnswering.from_pretrained('google/tapas-mini-finetuned-wtq')", "performance": {"dataset": "wikitablequestions", "accuracy": 0.2854}, "description": "TAPAS mini model fine-tuned on WikiTable Questions (WTQ). It is pretrained on a large corpus of English data from Wikipedia and can be used for answering questions related to a table.", "model_name": "google/tapas-mini-finetuned-wtq"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "laion/CLIP-ViT-L-14-laion2B-s32B-b82K", "api_call": "CLIPModel.from_pretrained('laion/CLIP-ViT-L-14-laion2B-s32B-b82K')", "performance": {"dataset": "ImageNet-1k", "accuracy": 75.3}, "description": "A CLIP ViT L/14 model trained with the LAION-2B English subset of LAION-5B using OpenCLIP. Intended for research purposes and exploring zero-shot, arbitrary image classification. Can be used for interdisciplinary studies of the potential impact of such a model.", "model_name": "laion/CLIP-ViT-L-14-laion2B-s32B-b82K"}
{"domain": "Audio Text-to-Speech", "framework": "ESPnet", "functionality": "Text-to-Speech", "api_name": "mio/amadeus", "api_call": "./run.sh --skip_data_prep false --skip_train true --download_model mio/amadeus", "performance": {"dataset": "amadeus", "accuracy": "Not provided"}, "description": "This model was trained by mio using amadeus recipe in espnet.", "model_name": "mio/amadeus"}
{"domain": "Natural Language Processing Text Generation", "framework": "Transformers", "functionality": "Text Generation", "api_name": "bigscience/bloom-560m", "api_call": "pipeline('text-generation', model='bigscience/bloom-560m')", "performance": {"dataset": "Validation", "accuracy": {"Training Loss": 2.0, "Validation Loss": 2.2, "Perplexity": 8.9}}, "description": "BLOOM LM is a large open-science, open-access multilingual language model developed by BigScience. It is a transformer-based language model trained on 45 natural languages and 12 programming languages. The model has 559,214,592 parameters, 24 layers, and 16 attention heads.", "model_name": "bigscience/bloom-560m"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Fill-Mask", "api_name": "microsoft/deberta-v2-xxlarge", "api_call": "DebertaV2ForMaskedLM.from_pretrained('microsoft/deberta-v2-xxlarge')", "performance": {"dataset": [{"name": "SQuAD 1.1", "accuracy": "F1/EM: 96.1/91.4"}, {"name": "SQuAD 2.0", "accuracy": "F1/EM: 92.2/89.7"}, {"name": "MNLI-m/mm", "accuracy": "Acc: 91.7/91.9"}, {"name": "SST-2", "accuracy": "Acc: 97.2"}, {"name": "QNLI", "accuracy": "Acc: 96.0"}, {"name": "CoLA", "accuracy": "MCC: 72.0"}, {"name": "RTE", "accuracy": "Acc: 93.5"}, {"name": "MRPC", "accuracy": "Acc/F1: 93.1/94.9"}, {"name": "QQP", "accuracy": "Acc/F1: 92.7/90.3"}, {"name": "STS-B", "accuracy": "P/S: 93.2/93.1"}]}, "description": "DeBERTa improves the BERT and RoBERTa models using disentangled attention and enhanced mask decoder. It outperforms BERT and RoBERTa on the majority of NLU tasks with 80GB training data. This is the DeBERTa V2 xxlarge model with 48 layers, 1536 hidden size. The total parameters are 1.5B and it is trained with 160GB raw data.", "model_name": "microsoft/deberta-v2-xxlarge"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image Generation", "api_name": "stabilityai/stable-diffusion-2", "api_call": "StableDiffusionPipeline.from_pretrained('stabilityai/stable-diffusion-2', scheduler=EulerDiscreteScheduler.from_pretrained('stabilityai/stable-diffusion-2', subfolder=scheduler), torch_dtype=torch.float16)", "performance": {"dataset": "COCO2017 validation set", "accuracy": "Not optimized for FID scores"}, "description": "Stable Diffusion v2 is a diffusion-based text-to-image generation model that can generate and modify images based on text prompts. It uses a fixed, pretrained text encoder (OpenCLIP-ViT/H) and is primarily intended for research purposes, such as safe deployment of models with potential to generate harmful content, understanding limitations and biases of generative models, and generation of artworks for design and artistic processes.", "model_name": "stabilityai/stable-diffusion-2"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "nikcheerla/nooks-amd-detection-realtime", "api_call": "SentenceTransformer('nikcheerla/nooks-amd-detection-realtime')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Automated evaluation"}, "description": "This is a sentence-transformers model that maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "nikcheerla/nooks-amd-detection-realtime"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "donut-base-finetuned-cord-v2", "api_call": "AutoModel.from_pretrained('naver-clova-ix/donut-base-finetuned-cord-v2')", "performance": {"dataset": "CORD", "accuracy": "Not provided"}, "description": "Donut consists of a vision encoder (Swin Transformer) and a text decoder (BART). Given an image, the encoder first encodes the image into a tensor of embeddings (of shape batch_size, seq_len, hidden_size), after which the decoder autoregressively generates text, conditioned on the encoding of the encoder. This model is fine-tuned on CORD, a document parsing dataset.", "model_name": "naver-clova-ix/donut-base-finetuned-cord-v2"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/trocr-small-printed", "api_call": "VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-small-printed')", "performance": {"dataset": "SROIE", "accuracy": "Not specified"}, "description": "TrOCR model fine-tuned on the SROIE dataset. It was introduced in the paper TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models by Li et al. and first released in this repository. The TrOCR model is an encoder-decoder model, consisting of an image Transformer as encoder, and a text Transformer as decoder. The image encoder was initialized from the weights of DeiT, while the text decoder was initialized from the weights of UniLM.", "model_name": "microsoft/trocr-small-printed"}
{"domain": "Natural Language Processing Token Classification", "framework": "Transformers", "functionality": "Named Entity Recognition", "api_name": "d4data/biomedical-ner-all", "api_call": "AutoModelForTokenClassification.from_pretrained('d4data/biomedical-ner-all')", "performance": {"dataset": "Maccrobat", "accuracy": "Not provided"}, "description": "An English Named Entity Recognition model, trained on Maccrobat to recognize the bio-medical entities (107 entities) from a given text corpus (case reports etc.). This model was built on top of distilbert-base-uncased.", "model_name": "d4data/biomedical-ner-all"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "shi-labs/oneformer_ade20k_swin_large", "api_call": "OneFormerForUniversalSegmentation.from_pretrained('shi-labs/oneformer_ade20k_swin_large')", "performance": {"dataset": "scene_parse_150", "accuracy": null}, "description": "OneFormer model trained on the ADE20k dataset (large-sized version, Swin backbone). It was introduced in the paper OneFormer: One Transformer to Rule Universal Image Segmentation by Jain et al. and first released in this repository. OneFormer is the first multi-task universal image segmentation framework. It needs to be trained only once with a single universal architecture, a single model, and on a single dataset, to outperform existing specialized models across semantic, instance, and panoptic segmentation tasks. OneFormer uses a task token to condition the model on the task in focus, making the architecture task-guided for training, and task-dynamic for inference, all with a single model.", "model_name": "shi-labs/oneformer_ade20k_swin_large"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "Helsinki-NLP/opus-mt-en-zh", "api_call": "pipeline('translation_en_to_zh', model='Helsinki-NLP/opus-mt-en-zh')", "performance": {"dataset": "Tatoeba-test.eng.zho", "accuracy": {"BLEU": 31.4, "chr-F": 0.268}}, "description": "A translation model for English to Chinese using the Hugging Face Transformers library. It is based on the Marian NMT model and trained on the OPUS dataset. The model requires a sentence initial language token in the form of '>>id<<' (id = valid target language ID).", "model_name": "Helsinki-NLP/opus-mt-en-zh"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "neulab/omnitab-large-finetuned-wtq", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('neulab/omnitab-large-finetuned-wtq')", "performance": {"dataset": "wikitablequestions", "accuracy": null}, "description": "OmniTab is a table-based QA model proposed in OmniTab: Pretraining with Natural and Synthetic Data for Few-shot Table-based Question Answering. The original Github repository is https://github.com/jzbjyb/OmniTab.", "model_name": "neulab/omnitab-large-finetuned-wtq"}
{"domain": "Natural Language Processing Summarization", "framework": "Transformers", "functionality": "text2text-generation", "api_name": "sshleifer/distilbart-cnn-12-6", "api_call": "BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-12-6')", "performance": {"dataset": [{"name": "cnn_dailymail", "accuracy": {"Rouge 2": "22.12", "Rouge-L": "36.99"}}]}, "description": "DistilBART is a distilled version of BART, a model for text summarization. This specific checkpoint, 'sshleifer/distilbart-cnn-12-6', is trained on the cnn_dailymail dataset and provides a fast and effective way to generate summaries of text. The model can be loaded using the Hugging Face Transformers library.", "model_name": "sshleifer/distilbart-cnn-12-6"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "layoutlm-vqa", "api_call": "pipeline('question-answering', model='pardeepSF/layoutlm-vqa')", "performance": {"dataset": "", "accuracy": ""}, "description": "A model for document question answering using the LayoutLM architecture.", "model_name": "pardeepSF/layoutlm-vqa"}
{"domain": "Audio Audio-to-Audio", "framework": "SpeechBrain", "functionality": "Audio Source Separation", "api_name": "sepformer-wsj02mix", "api_call": "separator.from_hparams(source='speechbrain/sepformer-wsj02mix')", "performance": {"dataset": "WSJ0-2Mix", "accuracy": "22.4 dB"}, "description": "This repository provides all the necessary tools to perform audio source separation with a SepFormer model, implemented with SpeechBrain, and pretrained on WSJ0-2Mix dataset.", "model_name": "speechbrain/sepformer-wsj02mix"}
{"domain": "Audio Automatic Speech Recognition", "framework": "pyannote.audio", "functionality": "Speaker Diarization", "api_name": "pyannote/speaker-diarization", "api_call": "Pipeline.from_pretrained('pyannote/speaker-diarization@2.1', use_auth_token='ACCESS_TOKEN_GOES_HERE')", "performance": {"dataset": "ami", "accuracy": {"DER%": "18.91", "FA%": "4.48", "Miss%": "9.51", "Conf%": "4.91"}}, "description": "This API provides an automatic speaker diarization pipeline using the pyannote.audio framework. It can process audio files and output speaker diarization results in RTTM format. The pipeline can also handle cases where the number of speakers is known in advance or when providing lower and/or upper bounds on the number of speakers.", "model_name": "pyannote/speaker-diarization"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "opus-mt-en-ru", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-en-ru')", "performance": {"dataset": "newstest2019-enru", "accuracy": "27.1"}, "description": "Helsinki-NLP/opus-mt-en-ru is a translation model trained on the OPUS dataset, which translates English text to Russian. It is based on the Marian NMT framework and can be used with Hugging Face Transformers.", "model_name": "Helsinki-NLP/opus-mt-en-ru"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "keremberke/yolov8m-forklift-detection", "api_call": "YOLO('keremberke/yolov8m-forklift-detection')", "performance": {"dataset": "forklift-object-detection", "accuracy": 0.846}, "description": "A YOLOv8 model for detecting forklifts and persons in images.", "model_name": "keremberke/yolov8m-forklift-detection"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/trocr-base-handwritten", "api_call": "VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')", "performance": {"dataset": "IAM", "accuracy": "Not specified"}, "description": "TrOCR model fine-tuned on the IAM dataset. It was introduced in the paper TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models by Li et al. and first released in this repository. The TrOCR model is an encoder-decoder model, consisting of an image Transformer as encoder, and a text Transformer as decoder. The image encoder was initialized from the weights of BEiT, while the text decoder was initialized from the weights of RoBERTa.", "model_name": "microsoft/trocr-base-handwritten"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Transformers", "functionality": "Text2Text Generation", "api_name": "t5_sentence_paraphraser", "api_call": "T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_sentence_paraphraser')", "performance": {"dataset": "", "accuracy": ""}, "description": "A T5 model for paraphrasing sentences", "model_name": "ramsrigouthamg/t5_sentence_paraphraser"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "clipseg-rd64-refined", "api_call": "pipeline('image-segmentation', model='CIDAS/clipseg-rd64-refined')", "performance": {"dataset": "", "accuracy": ""}, "description": "CLIPSeg model with reduce dimension 64, refined (using a more complex convolution). It was introduced in the paper Image Segmentation Using Text and Image Prompts by L\u00fcddecke et al. and first released in this repository. This model is intended for zero-shot and one-shot image segmentation.", "model_name": "CIDAS/clipseg-rd64-refined"}
{"domain": "Computer Vision Image-to-Image", "framework": "Diffusers", "functionality": "Text-to-Image", "api_name": "lllyasviel/control_v11p_sd15_scribble", "api_call": "ControlNetModel.from_pretrained('lllyasviel/control_v11p_sd15_scribble')", "performance": {"dataset": "Stable Diffusion v1-5", "accuracy": "Not specified"}, "description": "Controlnet v1.1 is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on Scribble images. It can be used in combination with Stable Diffusion, such as runwayml/stable-diffusion-v1-5.", "model_name": "lllyasviel/control_v11p_sd15_scribble"}
{"domain": "Audio Voice Activity Detection", "framework": "Hugging Face Transformers", "functionality": "Speaker segmentation, Voice activity detection, Overlapped speech detection, Resegmentation, Raw scores", "api_name": "pyannote/segmentation", "api_call": "Model.from_pretrained('pyannote/segmentation', use_auth_token='ACCESS_TOKEN_GOES_HERE')", "performance": {"dataset": {"AMI Mix-Headset": {"voice_activity_detection_accuracy": {"onset": 0.684, "offset": 0.577, "min_duration_on": 0.181, "min_duration_off": 0.037}, "overlapped_speech_detection_accuracy": {"onset": 0.448, "offset": 0.362, "min_duration_on": 0.116, "min_duration_off": 0.187}, "resegmentation_accuracy": {"onset": 0.542, "offset": 0.527, "min_duration_on": 0.044, "min_duration_off": 0.705}}, "DIHARD3": {"voice_activity_detection_accuracy": {"onset": 0.767, "offset": 0.377, "min_duration_on": 0.136, "min_duration_off": 0.067}, "overlapped_speech_detection_accuracy": {"onset": 0.43, "offset": 0.32, "min_duration_on": 0.091, "min_duration_off": 0.14400000000000002}, "resegmentation_accuracy": {"onset": 0.592, "offset": 0.489, "min_duration_on": 0.163, "min_duration_off": 0.182}}, "VoxConverse": {"voice_activity_detection_accuracy": {"onset": 0.767, "offset": 0.713, "min_duration_on": 0.182, "min_duration_off": 0.501}, "overlapped_speech_detection_accuracy": {"onset": 0.587, "offset": 0.426, "min_duration_on": 0.337, "min_duration_off": 0.112}, "resegmentation_accuracy": {"onset": 0.537, "offset": 0.724, "min_duration_on": 0.41000000000000003, "min_duration_off": 0.5630000000000001}}}}, "description": "A pre-trained model for speaker segmentation, voice activity detection, overlapped speech detection, and resegmentation using the pyannote.audio framework.", "model_name": "pyannote/segmentation"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "blip2-opt-2.7b", "api_call": "Blip2ForConditionalGeneration.from_pretrained('Salesforce/blip2-opt-2.7b')", "performance": {"dataset": "LAION", "accuracy": "Not specified"}, "description": "BLIP-2 model, leveraging OPT-2.7b (a large language model with 2.7 billion parameters). It was introduced in the paper BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models by Li et al. and first released in this repository. The goal for the model is to predict the next text token, given the query embeddings and the previous text. This allows the model to be used for tasks like image captioning, visual question answering (VQA), and chat-like conversations by feeding the image and the previous conversation as prompt to the model.", "model_name": "Salesforce/blip2-opt-2.7b"}
{"domain": "Audio Text-to-Speech", "framework": "Fairseq", "functionality": "Text-to-Speech", "api_name": "facebook/tts_transformer-fr-cv7_css10", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/tts_transformer-fr-cv7_css10')", "performance": {"dataset": "common_voice", "accuracy": "N/A"}, "description": "Transformer text-to-speech model from fairseq S^2. French, single-speaker male voice. Pre-trained on Common Voice v7, fine-tuned on CSS10.", "model_name": "facebook/tts_transformer-fr-cv7_css10"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "opus-mt-ca-es", "api_call": "MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-ca-es') , MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ca-es')", "performance": {"dataset": "Tatoeba.ca.es", "accuracy": {"BLEU": 74.9, "chr-F": 0.863}}, "description": "A Hugging Face model for translation between Catalan (ca) and Spanish (es) languages, based on the OPUS dataset and using the transformer-align architecture. The model has been pre-processed with normalization and SentencePiece.", "model_name": "Helsinki-NLP/opus-mt-ca-es"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Asteroid", "api_name": "JorisCos/ConvTasNet_Libri2Mix_sepnoisy_16k", "api_call": "BaseModel.from_pretrained('JorisCos/ConvTasNet_Libri2Mix_sepnoisy_16k')", "performance": {"dataset": "Libri2Mix", "accuracy": {"si_sdr": 10.6171309498, "si_sdr_imp": 12.551811413, "sdr": 11.2318674645, "sdr_imp": 13.0597650097, "sir": 24.461138353, "sir_imp": 24.3718564523, "sar": 11.5649982725, "sar_imp": 4.6625257058, "stoi": 0.8701085139, "stoi_imp": 0.224541802}}, "description": "This model was trained by Joris Cosentino using the librimix recipe in Asteroid. It was trained on the sep_noisy task of the Libri2Mix dataset.", "model_name": "JorisCos/ConvTasNet_Libri2Mix_sepnoisy_16k"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "csarron/bert-base-uncased-squad-v1", "api_call": "pipeline('question-answering', model='csarron/bert-base-uncased-squad-v1', tokenizer='csarron/bert-base-uncased-squad-v1')", "performance": {"dataset": "SQuAD1.1", "accuracy": {"EM": 80.9, "F1": 88.2}}, "description": "BERT-base uncased model fine-tuned on SQuAD v1. This model is case-insensitive and does not make a difference between english and English.", "model_name": "csarron/bert-base-uncased-squad-v1"}
{"domain": "Audio Text-to-Speech", "framework": "Fairseq", "functionality": "Text-to-Speech", "api_name": "facebook/tts_transformer-es-css10", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/tts_transformer-es-css10')", "performance": {"dataset": "CSS10", "accuracy": null}, "description": "Transformer text-to-speech model from fairseq S^2. Spanish single-speaker male voice trained on CSS10.", "model_name": "facebook/tts_transformer-es-css10"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "padmalcom/wav2vec2-large-emotion-detection-german", "api_call": "pipeline('audio-classification', model='padmalcom/wav2vec2-large-emotion-detection-german')", "performance": {"dataset": "emo-DB", "accuracy": "Not provided"}, "description": "This wav2vec2 based emotion detection model is trained on the emo-DB dataset. It can classify emotions in German audio files into seven classes: anger, boredom, disgust, fear, happiness, sadness, and neutral.", "model_name": "padmalcom/wav2vec2-large-emotion-detection-german"}
{"domain": "Audio Text-to-Speech", "framework": "ESPnet", "functionality": "Text-to-Speech", "api_name": "kan-bayashi_ljspeech_vits", "api_call": "pipeline('text-to-speech', model='espnet/kan-bayashi_ljspeech_vits')", "performance": {"dataset": "ljspeech", "accuracy": "Not mentioned"}, "description": "A Text-to-Speech model trained on the ljspeech dataset using the ESPnet toolkit. This model can be used to convert text input into synthesized speech.", "model_name": "espnet/kan-bayashi_ljspeech_vits"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "CZ_DVQA_layoutxlm-base", "api_call": "LayoutXLMForQuestionAnswering.from_pretrained('fimu-docproc-research/CZ_DVQA_layoutxlm-base')", "performance": {"dataset": "", "accuracy": ""}, "description": "A Document Question Answering model based on LayoutXLM.", "model_name": "fimu-docproc-research/CZ_DVQA_layoutxlm-base"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "flax-sentence-embeddings/all_datasets_v4_MiniLM-L6", "api_call": "SentenceTransformer('flax-sentence-embeddings/all_datasets_v4_MiniLM-L6')", "performance": {"dataset": "1,097,953,922", "accuracy": "N/A"}, "description": "The model is trained on very large sentence level datasets using a self-supervised contrastive learning objective. It is fine-tuned on a 1B sentence pairs dataset, and it aims to capture the semantic information of input sentences. The sentence vector can be used for information retrieval, clustering, or sentence similarity tasks.", "model_name": "flax-sentence-embeddings/all_datasets_v4_MiniLM-L6"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Transformers", "functionality": "Translation, Summarization, Question Answering, Text Classification", "api_name": "t5-base", "api_call": "T5Model.from_pretrained('t5-base')", "performance": {"dataset": "c4", "accuracy": "See research paper, Table 14"}, "description": "T5-Base is a Text-To-Text Transfer Transformer (T5) model with 220 million parameters. It is designed to perform various NLP tasks, including machine translation, document summarization, question answering, and text classification. The model is pre-trained on the Colossal Clean Crawled Corpus (C4) and can be used with the Transformers library.", "model_name": "t5-base"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Embeddings", "api_name": "sentence-transformers/paraphrase-MiniLM-L6-v2", "api_call": "SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Not provided"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/paraphrase-MiniLM-L6-v2"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Asteroid", "api_name": "JorisCos/DPTNet_Libri1Mix_enhsingle_16k", "api_call": "pipeline('audio-to-audio', model='JorisCos/DPTNet_Libri1Mix_enhsingle_16k')", "performance": {"dataset": "Libri1Mix", "si_sdr": 14.8296700373, "si_sdr_imp": 11.3798887315, "sdr": 15.3957126447, "sdr_imp": 11.8930498455, "sir": "Infinity", "sir_imp": "NaN", "sar": 15.3957126447, "sar_imp": 11.8930498455, "stoi": 0.9301948391, "stoi_imp": 0.1342750156}, "description": "This model was trained by Joris Cosentino using the librimix recipe in Asteroid. It was trained on the enh_single task of the Libri1Mix dataset.", "model_name": "JorisCos/DPTNet_Libri1Mix_enhsingle_16k"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "ahotrod/electra_large_discriminator_squad2_512", "api_call": "AutoModelForQuestionAnswering.from_pretrained('ahotrod/electra_large_discriminator_squad2_512')", "performance": {"dataset": "SQuAD2.0", "accuracy": {"exact": 87.0967741935, "f1": 89.9834383272}}, "description": "ELECTRA_large_discriminator language model fine-tuned on SQuAD2.0 for question answering tasks.", "model_name": "ahotrod/electra_large_discriminator_squad2_512"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face", "functionality": "Text2Text Generation", "api_name": "facebook/mbart-large-50-many-to-many-mmt", "api_call": "MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-50-many-to-many-mmt')", "performance": {"dataset": "Multilingual Translation", "accuracy": "Not specified"}, "description": "mBART-50 many-to-many multilingual machine translation model can translate directly between any pair of 50 languages. It was introduced in the Multilingual Translation with Extensible Multilingual Pretraining and Finetuning paper.", "model_name": "facebook/mbart-large-50-many-to-many-mmt"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "microsoft/wavlm-large", "api_call": "Wav2Vec2Model.from_pretrained('microsoft/wavlm-large')", "performance": {"dataset": "SUPERB benchmark", "accuracy": "state-of-the-art performance"}, "description": "WavLM-Large is a large model pretrained on 16kHz sampled speech audio. It is built based on the HuBERT framework, with an emphasis on both spoken content modeling and speaker identity preservation. WavLM is pretrained on 60,000 hours of Libri-Light, 10,000 hours of GigaSpeech, and 24,000 hours of VoxPopuli. It achieves state-of-the-art performance on the SUPERB benchmark and brings significant improvements for various speech processing tasks on their representative benchmarks.", "model_name": "microsoft/wavlm-large"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221221-102136", "api_call": "pipeline('depth-estimation', model='sayakpaul/glpn-nyu-finetuned-diode-221221-102136')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.4222, "Mae": 0.41100000000000003, "Rmse": 0.6292, "Abs Rel": 0.3778, "Log Mae": 0.1636, "Log Rmse": 0.224, "Delta1": 0.432, "Delta2": 0.6806, "Delta3": 0.8068000000000001}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221221-102136"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "deepset/roberta-base-squad2-distilled", "api_call": "AutoModel.from_pretrained('deepset/roberta-base-squad2-distilled')", "performance": {"dataset": "squad_v2", "exact": 79.8366040596, "f1": 83.9164070799}, "description": "This model is a distilled version of deepset/roberta-large-squad2, trained on SQuAD 2.0 dataset for question answering tasks. It is based on the Roberta architecture and has been fine-tuned using Haystack's distillation feature.", "model_name": "deepset/roberta-base-squad2-distilled"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "dmis-lab/biobert-base-cased-v1.2", "api_call": "pipeline('fill-mask', model='dmis-lab/biobert-base-cased-v1.2')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "BioBERT is a pre-trained biomedical language representation model for biomedical text mining tasks such as biomedical named entity recognition, relation extraction, and question answering.", "model_name": "dmis-lab/biobert-base-cased-v1.2"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Text2Text Generation", "api_name": "t5-efficient-large-nl36_fine_tune_sum_V2", "api_call": "pipeline('summarization', model='Samuel-Fipps/t5-efficient-large-nl36_fine_tune_sum_V2')", "performance": {"dataset": [{"name": "samsum", "accuracy": {"ROUGE-1": 54.933, "ROUGE-2": 31.797, "ROUGE-L": 47.006, "ROUGE-LSUM": 51.203, "loss": 1.131, "gen_len": 23.799}}, {"name": "cnn_dailymail", "accuracy": {"ROUGE-1": 34.406, "ROUGE-2": 14.127, "ROUGE-L": 24.335, "ROUGE-LSUM": 31.658, "loss": 2.446, "gen_len": 45.928}}]}, "description": "A T5-based summarization model trained on the Samsum dataset. This model can be used for text-to-text generation tasks such as summarization without adding 'summarize' to the start of the input string. It has been fine-tuned for 10K steps with a batch size of 10.", "model_name": "Samuel-Fipps/t5-efficient-large-nl36_fine_tune_sum_V2"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "deepset/roberta-base-squad2-covid", "api_call": "pipeline('question-answering', model=RobertaForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2-covid'), tokenizer=RobertaTokenizer.from_pretrained('deepset/roberta-base-squad2-covid'))", "performance": {"dataset": "squad_v2", "accuracy": {"XVAL_EM": 0.1789099526, "XVAL_f1": 0.49925444210000003, "XVAL_top_3_recall": 0.8021327014}}, "description": "This model is a Roberta-based model fine-tuned on SQuAD-style CORD-19 annotations for the task of extractive question answering in the context of COVID-19. It can be used with the Hugging Face Transformers library for question answering tasks.", "model_name": "deepset/roberta-base-squad2-covid"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/distiluse-base-multilingual-cased-v2", "api_call": "SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Not provided"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 512 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/distiluse-base-multilingual-cased-v2"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Spoken Language Identification", "api_name": "TalTechNLP/voxlingua107-epaca-tdnn", "api_call": "EncoderClassifier.from_hparams(source='TalTechNLP/voxlingua107-epaca-tdnn')", "performance": {"dataset": "VoxLingua107", "accuracy": "93%"}, "description": "This is a spoken language recognition model trained on the VoxLingua107 dataset using SpeechBrain. The model uses the ECAPA-TDNN architecture that has previously been used for speaker recognition. The model can classify a speech utterance according to the language spoken. It covers 107 different languages.", "model_name": "TalTechNLP/voxlingua107-epaca-tdnn"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Fill-Mask", "api_name": "cl-tohoku/bert-base-japanese-char", "api_call": "AutoModelForMaskedLM.from_pretrained('cl-tohoku/bert-base-japanese-char')", "performance": {"dataset": "wikipedia", "accuracy": "N/A"}, "description": "This is a BERT model pretrained on texts in the Japanese language. This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by character-level tokenization.", "model_name": "cl-tohoku/bert-base-japanese-char"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Image-to-Image", "api_name": "lllyasviel/sd-controlnet-hed", "api_call": "ControlNetModel.from_pretrained('lllyasviel/sd-controlnet-hed')", "performance": {"dataset": "3M edge-image, caption pairs", "accuracy": "Not provided"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on HED Boundary. It can be used in combination with Stable Diffusion.", "model_name": "lllyasviel/sd-controlnet-hed"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "bigcode/santacoder", "api_call": "AutoModelForCausalLM.from_pretrained('bigcode/santacoder', trust_remote_code=True)", "performance": {"dataset": "bigcode/the-stack", "accuracy": {"pass@1 on MultiPL HumanEval (Python)": 0.18, "pass@10 on MultiPL HumanEval (Python)": 0.29, "pass@100 on MultiPL HumanEval (Python)": 0.49, "pass@1 on MultiPL MBPP (Python)": 0.35000000000000003, "pass@10 on MultiPL MBPP (Python)": 0.58, "pass@100 on MultiPL MBPP (Python)": 0.77, "pass@1 on MultiPL HumanEval (JavaScript)": 0.16, "pass@10 on MultiPL HumanEval (JavaScript)": 0.27, "pass@100 on MultiPL HumanEval (JavaScript)": 0.47000000000000003, "pass@1 on MultiPL MBPP (Javascript)": 0.28, "pass@10 on MultiPL MBPP (Javascript)": 0.51, "pass@100 on MultiPL MBPP (Javascript)": 0.7000000000000001, "pass@1 on MultiPL HumanEval (Java)": 0.15, "pass@10 on MultiPL HumanEval (Java)": 0.26, "pass@100 on MultiPL HumanEval (Java)": 0.41000000000000003, "pass@1 on MultiPL MBPP (Java)": 0.28, "pass@10 on MultiPL MBPP (Java)": 0.44, "pass@100 on MultiPL MBPP (Java)": 0.59, "single_line on HumanEval FIM (Python)": 0.44, "single_line on MultiPL HumanEval FIM (Java)": 0.62, "single_line on MultiPL HumanEval FIM (JavaScript)": 0.6000000000000001, "BLEU on CodeXGLUE code-to-text (Python)": 18.13}}, "description": "The SantaCoder models are a series of 1.1B parameter models trained on the Python, Java, and JavaScript subset of The Stack (v1.1) (which excluded opt-out requests). The main model uses Multi Query Attention, was trained using near-deduplication and comment-to-code ratio as filtering criteria and using the Fill-in-the-Middle objective. In addition there are several models that were trained on datasets with different filter parameters and with architecture and objective variations.", "model_name": "bigcode/santacoder"}
{"domain": "Multimodal Text-to-Video", "framework": "Hugging Face", "functionality": "Text-to-Video Generation", "api_name": "mo-di-bear-guitar", "api_call": "TuneAVideoPipeline.from_pretrained('nitrosocke/mo-di-diffusion', unet=UNet3DConditionModel.from_pretrained('Tune-A-Video-library/mo-di-bear-guitar', subfolder='unet', torch_dtype=torch.float16), torch_dtype=torch.float16)", "performance": {"dataset": "Not mentioned", "accuracy": "Not mentioned"}, "description": "Tune-A-Video is a text-to-video generation model based on the Hugging Face framework. The model generates videos based on textual prompts in a modern Disney style.", "model_name": "nitrosocke/mo-di-diffusion"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "keremberke/yolov8m-blood-cell-detection", "api_call": "YOLO('keremberke/yolov8m-blood-cell-detection')", "performance": {"dataset": "blood-cell-object-detection", "accuracy": 0.927}, "description": "A YOLOv8 model for blood cell detection, including Platelets, RBC, and WBC. Trained on the blood-cell-object-detection dataset.", "model_name": "keremberke/yolov8m-blood-cell-detection"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "ntrant7/sd-class-butterflies-32", "api_call": "DDPMPipeline.from_pretrained('ntrant7/sd-class-butterflies-32')", "performance": {"dataset": "Not specified", "accuracy": "Not specified"}, "description": "This model is a diffusion model for unconditional image generation of cute butterflies.", "model_name": "ntrant7/sd-class-butterflies-32"}
{"domain": "Natural Language Processing Token Classification", "framework": "Transformers", "functionality": "Named Entity Recognition", "api_name": "Dizex/InstaFoodRoBERTa-NER", "api_call": "AutoModelForTokenClassification.from_pretrained('Dizex/InstaFoodRoBERTa-NER')", "performance": {"dataset": "Dizex/InstaFoodSet", "accuracy": {"f1": 0.91, "precision": 0.89, "recall": 0.93}}, "description": "InstaFoodRoBERTa-NER is a fine-tuned BERT model that is ready to use for Named Entity Recognition of Food entities on informal text (social media like). It has been trained to recognize a single entity: food (FOOD). Specifically, this model is a roberta-base model that was fine-tuned on a dataset consisting of 400 English Instagram posts related to food.", "model_name": "Dizex/InstaFoodRoBERTa-NER"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "License Plate Detection", "api_name": "keremberke/yolov5s-license-plate", "api_call": "yolov5.load('keremberke/yolov5s-license-plate')", "performance": {"dataset": "keremberke/license-plate-object-detection", "accuracy": 0.985}, "description": "A YOLOv5 based license plate detection model trained on a custom dataset.", "model_name": "keremberke/yolov5s-license-plate"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "facebook/timesformer-base-finetuned-ssv2", "api_call": "TimesformerForVideoClassification.from_pretrained('facebook/timesformer-base-finetuned-ssv2')", "performance": {"dataset": "Something Something v2", "accuracy": "Not provided"}, "description": "TimeSformer model pre-trained on Something Something v2. It was introduced in the paper TimeSformer: Is Space-Time Attention All You Need for Video Understanding? by Tong et al. and first released in this repository.", "model_name": "facebook/timesformer-base-finetuned-ssv2"}
{"domain": "Audio Automatic Speech Recognition", "framework": "CTranslate2", "functionality": "Automatic Speech Recognition", "api_name": "guillaumekln/faster-whisper-large-v2", "api_call": "WhisperModel('large-v2')", "performance": {"dataset": "99 languages", "accuracy": "Not provided"}, "description": "Whisper large-v2 model for CTranslate2. This model can be used in CTranslate2 or projets based on CTranslate2 such as faster-whisper.", "model_name": "WhisperModel('large-v2')"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Masked Language Modeling", "api_name": "distilbert-base-multilingual-cased", "api_call": "pipeline('fill-mask', model='distilbert-base-multilingual-cased')", "performance": {"dataset": [{"name": "XNLI", "accuracy": {"English": 78.2, "Spanish": 69.1, "Chinese": 64.0, "German": 66.3, "Arabic": 59.1, "Urdu": 54.7}}]}, "description": "This model is a distilled version of the BERT base multilingual model. It is trained on the concatenation of Wikipedia in 104 different languages. The model has 6 layers, 768 dimension and 12 heads, totalizing 134M parameters. On average, this model, referred to as DistilmBERT, is twice as fast as mBERT-base.", "model_name": "distilbert-base-multilingual-cased"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/git-base-coco", "api_call": "pipeline('text-generation', model='microsoft/git-base-coco')", "performance": {"dataset": "COCO", "accuracy": "Refer to the paper for evaluation results."}, "description": "GIT (short for GenerativeImage2Text) model, base-sized version, fine-tuned on COCO. It was introduced in the paper GIT: A Generative Image-to-text Transformer for Vision and Language by Wang et al. and first released in this repository. The model is a Transformer decoder conditioned on both CLIP image tokens and text tokens. It can be used for tasks like image and video captioning, visual question answering (VQA) on images and videos, and even image classification (by simply conditioning the model on the image and asking it to generate a class for it in text).", "model_name": "microsoft/git-base-coco"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "julien-c/hotdog-not-hotdog", "api_call": "pipeline('image-classification', model='julien-c/hotdog-not-hotdog')", "performance": {"dataset": "", "accuracy": 0.8250000000000001}, "description": "A model that classifies images as hotdog or not hotdog.", "model_name": "julien-c/hotdog-not-hotdog"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "deepset/xlm-roberta-large-squad2", "api_call": "AutoModelForQuestionAnswering.from_pretrained('deepset/xlm-roberta-large-squad2')", "performance": {"squad_v2": {"exact_match": 81.828, "f1": 84.889}}, "description": "Multilingual XLM-RoBERTa large model for extractive question answering on various languages. Trained on SQuAD 2.0 dataset and evaluated on SQuAD dev set, German MLQA, and German XQuAD.", "model_name": "deepset/xlm-roberta-large-squad2"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Transcription and Translation", "api_name": "openai/whisper-small", "api_call": "WhisperForConditionalGeneration.from_pretrained('openai/whisper-small')", "performance": {"dataset": "LibriSpeech (clean) test set", "accuracy": "3.432 WER"}, "description": "Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalize to many datasets and domains without the need for fine-tuning. It is a Transformer-based encoder-decoder model and supports transcription and translation in various languages.", "model_name": "openai/whisper-small"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Document Question Answering", "api_name": "impira/layoutlm-document-qa", "api_call": "pipeline('question-answering', model=LayoutLMForQuestionAnswering.from_pretrained('impira/layoutlm-document-qa', return_dict=True))", "performance": {"dataset": ["SQuAD2.0", "DocVQA"], "accuracy": "Not provided"}, "description": "A fine-tuned version of the multi-modal LayoutLM model for the task of question answering on documents.", "model_name": "impira/layoutlm-document-qa"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Depth Estimation", "api_name": "lllyasviel/sd-controlnet-depth", "api_call": "ControlNetModel.from_pretrained('lllyasviel/sd-controlnet-depth')", "performance": {"dataset": "3M depth-image, caption pairs", "accuracy": "500 GPU-hours with Nvidia A100 80G using Stable Diffusion 1.5 as a base model"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on Depth estimation. It can be used in combination with Stable Diffusion.", "model_name": "lllyasviel/sd-controlnet-depth"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "facebook/detr-resnet-101-dc5", "api_call": "DetrForObjectDetection.from_pretrained('facebook/detr-resnet-101-dc5')", "performance": {"dataset": "COCO 2017 validation", "accuracy": "AP 44.9"}, "description": "DETR (End-to-End Object Detection) model with ResNet-101 backbone (dilated C5 stage). The model is trained on COCO 2017 object detection dataset and achieves an average precision (AP) of 44.9 on the COCO 2017 validation set.", "model_name": "facebook/detr-resnet-101-dc5"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "saltacc/anime-ai-detect", "api_call": "pipeline('image-classification', model='saltacc/anime-ai-detect')", "performance": {"dataset": "aibooru and imageboard sites", "accuracy": "96%"}, "description": "A BEiT classifier to see if anime art was made by an AI or a human.", "model_name": "saltacc/anime-ai-detect"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Hugging Face Transformers", "functionality": "Masked Language Modeling", "api_name": "xlm-roberta-base", "api_call": "pipeline('fill-mask', model='xlm-roberta-base')", "performance": {"dataset": "CommonCrawl", "accuracy": "N/A"}, "description": "XLM-RoBERTa is a multilingual version of RoBERTa pre-trained on 2.5TB of filtered CommonCrawl data containing 100 languages. It can be used for masked language modeling and is intended to be fine-tuned on a downstream task.", "model_name": "xlm-roberta-base"}
{"domain": "Audio Audio-to-Audio", "framework": "Fairseq", "functionality": "Speech-to-speech translation", "api_name": "xm_transformer_unity_hk-en", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/xm_transformer_unity_hk-en')", "performance": {"dataset": ["TED", "drama", "TAT"], "accuracy": "Not specified"}, "description": "A speech-to-speech translation model with two-pass decoder (UnitY) trained on Hokkien-English data from TED, drama, and TAT domains. It uses Facebook's Unit HiFiGAN for speech synthesis.", "model_name": "facebook/xm_transformer_unity_hk-en"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Table Extraction", "api_name": "keremberke/yolov8m-table-extraction", "api_call": "YOLO('keremberke/yolov8m-table-extraction')", "performance": {"dataset": "table-extraction", "accuracy": 0.9520000000000001}, "description": "A YOLOv8 model for table extraction in images, capable of detecting both bordered and borderless tables. Trained using the keremberke/table-extraction dataset.", "model_name": "keremberke/yolov8m-table-extraction"}
{"domain": "Natural Language Processing Token Classification", "framework": "Flair", "functionality": "Named Entity Recognition", "api_name": "flair/ner-english-ontonotes-large", "api_call": "SequenceTagger.load('flair/ner-english-ontonotes-large')", "performance": {"dataset": "Ontonotes", "accuracy": 90.93}, "description": "English NER in Flair (Ontonotes large model). This is the large 18-class NER model for English that ships with Flair. It predicts 18 tags such as cardinal value, date value, event name, building name, geo-political entity, language name, law name, location name, money name, affiliation, ordinal value, organization name, percent value, person name, product name, quantity value, time value, and name of work of art. The model is based on document-level XLM-R embeddings and FLERT.", "model_name": "flair/ner-english-ontonotes-large"}
{"domain": "Audio Voice Activity Detection", "framework": "pyannote.audio", "functionality": "Automatic Speech Recognition", "api_name": "pyannote/voice-activity-detection", "api_call": "Pipeline.from_pretrained('pyannote/voice-activity-detection')", "performance": {"dataset": "ami", "accuracy": "Not specified"}, "description": "A pretrained voice activity detection pipeline that detects active speech in audio files.", "model_name": "pyannote/voice-activity-detection"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Visual Question Answering", "api_name": "Salesforce/blip-vqa-capfilt-large", "api_call": "BlipForQuestionAnswering.from_pretrained('Salesforce/blip-vqa-capfilt-large')", "performance": {"dataset": "VQA", "accuracy": "+1.6% in VQA score"}, "description": "BLIP is a new Vision-Language Pre-training (VLP) framework that transfers flexibly to both vision-language understanding and generation tasks. It effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. The model achieves state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval, image captioning, and VQA.", "model_name": "Salesforce/blip-vqa-capfilt-large"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face", "functionality": "Zero-Shot Image Classification", "api_name": "laion/CLIP-ViT-g-14-laion2B-s34B-b88K", "api_call": "pipeline('zero-shot-image-classification', model='laion/CLIP-ViT-g-14-laion2B-s34B-b88K')", "performance": {"dataset": null, "accuracy": null}, "description": "A zero-shot image classification model based on OpenCLIP, which can classify images into various categories without requiring any training data for those categories.", "model_name": "laion/CLIP-ViT-g-14-laion2B-s34B-b88K"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "keremberke/yolov8m-csgo-player-detection", "api_call": "YOLO('keremberke/yolov8m-csgo-player-detection')", "performance": {"dataset": "csgo-object-detection", "accuracy": 0.892}, "description": "An object detection model trained to detect Counter-Strike: Global Offensive (CS:GO) players. The model is based on the YOLOv8 architecture and can identify 'ct', 'cthead', 't', and 'thead' labels.", "model_name": "keremberke/yolov8m-csgo-player-detection"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "microsoft/beit-base-patch16-224", "api_call": "BeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224')", "performance": {"dataset": "ImageNet", "accuracy": "Refer to tables 1 and 2 of the original paper"}, "description": "BEiT model pre-trained in a self-supervised fashion on ImageNet-21k (14 million images, 21,841 classes) at resolution 224x224, and fine-tuned on ImageNet 2012 (1 million images, 1,000 classes) at resolution 224x224.", "model_name": "microsoft/beit-base-patch16-224"}
{"domain": "Multimodal Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "cambridgeltl/SapBERT-from-PubMedBERT-fulltext", "api_call": "AutoModel.from_pretrained('cambridgeltl/SapBERT-from-PubMedBERT-fulltext')", "performance": {"dataset": "UMLS", "accuracy": "N/A"}, "description": "SapBERT is a pretraining scheme that self-aligns the representation space of biomedical entities. It is trained with UMLS 2020AA (English only) and uses microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext as the base model. The input should be a string of biomedical entity names, and the [CLS] embedding of the last layer is regarded as the output.", "model_name": "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "Helsinki-NLP/opus-mt-en-es", "api_call": "pipeline('translation_en_to_es', model='Helsinki-NLP/opus-mt-en-es')", "performance": {"dataset": "Tatoeba-test.eng.spa", "accuracy": 54.9}, "description": "This model is a translation model from English to Spanish using the Hugging Face Transformers library. It is based on the Marian framework and trained on the OPUS dataset. The model achieves a BLEU score of 54.9 on the Tatoeba test set.", "model_name": "Helsinki-NLP/opus-mt-en-es"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Named Entity Recognition", "api_name": "flair/ner-english-fast", "api_call": "SequenceTagger.load('flair/ner-english-fast')", "performance": {"dataset": "conll2003", "accuracy": "F1-Score: 92.92"}, "description": "This is the fast 4-class NER model for English that ships with Flair. It predicts 4 tags: PER (person name), LOC (location name), ORG (organization name), and MISC (other name). The model is based on Flair embeddings and LSTM-CRF.", "model_name": "flair/ner-english-fast"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Embeddings", "api_name": "sentence-transformers/nli-mpnet-base-v2", "api_call": "SentenceTransformer('sentence-transformers/nli-mpnet-base-v2')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Automated evaluation"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/nli-mpnet-base-v2"}
{"domain": "Natural Language Processing Question Answering", "framework": "PyTorch Transformers", "functionality": "Question Answering", "api_name": "deepset/bert-medium-squad2-distilled", "api_call": "AutoModel.from_pretrained('deepset/bert-medium-squad2-distilled')", "performance": {"dataset": "squad_v2", "exact": 68.6431398972, "f1": 72.7637083791}, "description": "This model is a distilled version of deepset/bert-large-uncased-whole-word-masking-squad2, trained on the SQuAD 2.0 dataset for question answering tasks. It is based on the BERT-medium architecture and uses the Hugging Face Transformers library.", "model_name": "deepset/bert-medium-squad2-distilled"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Depth Estimation", "api_name": "Intel/dpt-hybrid-midas", "api_call": "DPTForDepthEstimation.from_pretrained('Intel/dpt-hybrid-midas', low_cpu_mem_usage=True)", "performance": {"dataset": "MIX 6", "accuracy": "11.06"}, "description": "Dense Prediction Transformer (DPT) model trained on 1.4 million images for monocular depth estimation. Introduced in the paper Vision Transformers for Dense Prediction by Ranftl et al. (2021) and first released in this repository. DPT uses the Vision Transformer (ViT) as backbone and adds a neck + head on top for monocular depth estimation. This repository hosts the hybrid version of the model as stated in the paper. DPT-Hybrid diverges from DPT by using ViT-hybrid as a backbone and taking some activations from the backbone.", "model_name": "Intel/dpt-hybrid-midas"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Denoising Diffusion Probabilistic Models (DDPM)", "api_name": "google/ddpm-bedroom-256", "api_call": "DDPMPipeline.from_pretrained('google/ddpm-bedroom-256')", "performance": {"dataset": "CIFAR10", "accuracy": {"Inception score": 9.46, "FID score": 3.17}}, "description": "We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN.", "model_name": "google/ddpm-bedroom-256"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/all-mpnet-base-v2", "api_call": "SentenceTransformer('sentence-transformers/all-mpnet-base-v2')", "performance": {"dataset": [{"name": "MS Marco", "accuracy": "Not provided"}]}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/all-mpnet-base-v2"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face", "functionality": "Visual Question Answering", "api_name": "JosephusCheung/GuanacoVQAOnConsumerHardware", "api_call": "pipeline('visual-question-answering', model='JosephusCheung/GuanacoVQAOnConsumerHardware')", "performance": {"dataset": "JosephusCheung/GuanacoVQADataset", "accuracy": "unknown"}, "description": "A Visual Question Answering model trained on the GuanacoVQADataset, designed to work on consumer hardware like Colab Free T4 GPU. The model can be used to answer questions about images.", "model_name": "JosephusCheung/GuanacoVQAOnConsumerHardware"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "Rakib/roberta-base-on-cuad", "api_call": "AutoModelForQuestionAnswering.from_pretrained('Rakib/roberta-base-on-cuad')", "performance": {"dataset": "cuad", "accuracy": "46.6%"}, "description": "This model is trained for the task of Question Answering on Legal Documents using the CUAD dataset. It is based on the RoBERTa architecture and can be used to extract answers from legal contracts and documents.", "model_name": "Rakib/roberta-base-on-cuad"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "optimum/roberta-base-squad2", "api_call": "AutoModelForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2')", "performance": {"dataset": "squad_v2", "accuracy": {"exact": 79.8702939442, "f1": 82.9125116958}}, "description": "This is an ONNX conversion of the deepset/roberta-base-squad2 model for extractive question answering. It is trained on the SQuAD 2.0 dataset and is compatible with the Transformers library.", "model_name": "deepset/roberta-base-squad2"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Image Synthesis", "api_name": "google/ddpm-cifar10-32", "api_call": "DDPMPipeline.from_pretrained('google/ddpm-cifar10-32')", "performance": {"dataset": "CIFAR10", "accuracy": {"Inception_score": 9.46, "FID_score": 3.17}}, "description": "Denoising Diffusion Probabilistic Models (DDPM) is a class of latent variable models inspired by nonequilibrium thermodynamics. It is used for high-quality image synthesis. The model supports different noise schedulers such as scheduling_ddpm, scheduling_ddim, and scheduling_pndm.", "model_name": "google/ddpm-cifar10-32"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "Realistic_Vision_V1.4", "api_call": "pipeline('text-to-image', model='SG161222/Realistic_Vision_V1.4')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "Realistic_Vision_V1.4 is a text-to-image model that generates high-quality and detailed images based on textual prompts. It can be used for various applications such as generating realistic portraits, landscapes, and other types of images.", "model_name": "SG161222/Realistic_Vision_V1.4"}
{"domain": "Tabular Tabular Regression", "framework": "Scikit-learn", "functionality": "GradientBoostingRegressor", "api_name": "Fish-Weight", "api_call": "load('path_to_folder/example.pkl')", "performance": {"dataset": "Fish dataset", "accuracy": "Not provided"}, "description": "This is a GradientBoostingRegressor on a fish dataset. This model is intended for educational purposes.", "model_name": "path_to_folder/example.pkl"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "gpt2", "api_call": "pipeline('text-generation', model='gpt2')", "performance": {"dataset": {"LAMBADA": {"accuracy": "35.13"}, "CBT-CN": {"accuracy": "45.99"}, "CBT-NE": {"accuracy": "87.65"}, "WikiText2": {"accuracy": "83.4"}, "PTB": {"accuracy": "29.41"}, "enwiki8": {"accuracy": "65.85"}, "text8": {"accuracy": "1.16"}, "WikiText103": {"accuracy": "1.17"}, "1BW": {"accuracy": "37.50"}}}, "description": "GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it was trained to guess the next word in sentences.", "model_name": "gpt2"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Hugging Face Transformers", "functionality": "Masked Language Modeling", "api_name": "bert-base-chinese", "api_call": "AutoModelForMaskedLM.from_pretrained('bert-base-chinese')", "performance": {"dataset": "[More Information Needed]", "accuracy": "[More Information Needed]"}, "description": "This model has been pre-trained for Chinese, training and random input masking has been applied independently to word pieces (as in the original BERT paper). It can be used for masked language modeling.", "model_name": "bert-base-chinese"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Hugging Face Transformers", "functionality": "Fill-Mask", "api_name": "microsoft/deberta-v3-base", "api_call": "DebertaModel.from_pretrained('microsoft/deberta-v3-base')", "performance": {"dataset": {"SQuAD 2.0": {"F1": 88.4, "EM": 85.4}, "MNLI-m/mm": {"ACC": "90.6/90.7"}}}, "description": "DeBERTa V3 improves the BERT and RoBERTa models using disentangled attention and enhanced mask decoder. It further improves the efficiency of DeBERTa using ELECTRA-Style pre-training with Gradient Disentangled Embedding Sharing. The DeBERTa V3 base model comes with 12 layers and a hidden size of 768. It has only 86M backbone parameters with a vocabulary containing 128K tokens which introduces 98M parameters in the Embedding layer. This model was trained using the 160GB data as DeBERTa V2.", "model_name": "microsoft/deberta-v3-base"}
{"domain": "Tabular Tabular Classification", "framework": "Joblib", "functionality": "Transformers", "api_name": "abhishek/autotrain-adult-census-xgboost", "api_call": "AutoModel.from_pretrained('abhishek/autotrain-adult-census-xgboost')", "performance": {"dataset": "scikit-learn/adult-census-income", "accuracy": 0.8750191924}, "description": "A binary classification model trained on the Adult Census Income dataset using the XGBoost algorithm. The model predicts whether an individual's income is above or below $50,000 per year.", "model_name": "abhishek/autotrain-adult-census-xgboost"}
{"domain": "Multimodal Document Question Answer", "framework": "Transformers", "functionality": "Document Question Answering", "api_name": "tiny-random-LayoutLMForQuestionAnswering", "api_call": "AutoModelForQuestionAnswering.from_pretrained('hf-tiny-model-private/tiny-random-LayoutLMForQuestionAnswering')", "performance": {"dataset": "", "accuracy": ""}, "description": "A tiny random LayoutLM model for question answering. This model is not pretrained and serves as an example for the LayoutLM architecture.", "model_name": "hf-tiny-model-private/tiny-random-LayoutLMForQuestionAnswering"}
{"domain": "Natural Language Processing Conversational", "framework": "PyTorch Transformers", "functionality": "text-generation", "api_name": "satvikag/chatbot", "api_call": "AutoModelWithLMHead.from_pretrained('output-small')", "performance": {"dataset": "Kaggle game script dataset", "accuracy": "Not provided"}, "description": "DialoGPT Trained on the Speech of a Game Character, Joshua from The World Ends With You.", "model_name": "output-small"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Named Entity Recognition", "api_name": "Jean-Baptiste/roberta-large-ner-english", "api_call": "AutoModelForTokenClassification.from_pretrained('Jean-Baptiste/roberta-large-ner-english')", "performance": {"dataset": "conll2003", "accuracy": {"PER": {"precision": 0.9914000000000001, "recall": 0.9927, "f1": 0.992}, "ORG": {"precision": 0.9627, "recall": 0.9661000000000001, "f1": 0.9644}, "LOC": {"precision": 0.9795, "recall": 0.9862000000000001, "f1": 0.9828}, "MISC": {"precision": 0.9292, "recall": 0.9262, "f1": 0.9277000000000001}, "Overall": {"precision": 0.974, "recall": 0.9766, "f1": 0.9753000000000001}}}, "description": "roberta-large-ner-english is an english NER model that was fine-tuned from roberta-large on conll2003 dataset. Model was validated on emails/chat data and outperformed other models on this type of data specifically. In particular, the model seems to work better on entities that don't start with an upper case.", "model_name": "Jean-Baptiste/roberta-large-ner-english"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Embeddings", "api_name": "sentence-transformers/paraphrase-albert-small-v2", "api_call": "SentenceTransformer('sentence-transformers/paraphrase-albert-small-v2')", "performance": {"dataset": ["snli", "multi_nli", "ms_marco"], "accuracy": "https://seb.sbert.net"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/paraphrase-albert-small-v2"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "sayakpaul/videomae-base-finetuned-ucf101-subset", "api_call": "AutoModelForVideoClassification.from_pretrained('sayakpaul/videomae-base-finetuned-ucf101-subset')", "performance": {"dataset": "unknown", "accuracy": 0.8645}, "description": "This model is a fine-tuned version of MCG-NJU/videomae-base on an unknown dataset. It achieves the following results on the evaluation set: Loss: 0.3992, Accuracy: 0.8645.", "model_name": "sayakpaul/videomae-base-finetuned-ucf101-subset"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "NLI-based Zero Shot Text Classification", "api_name": "facebook/bart-large-mnli", "api_call": "AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli')", "performance": {"dataset": "multi_nli", "accuracy": "Not specified"}, "description": "This is the checkpoint for bart-large after being trained on the MultiNLI (MNLI) dataset. The model can be used for zero-shot text classification by posing the sequence to be classified as the NLI premise and constructing a hypothesis from each candidate label. The probabilities for entailment and contradiction are then converted to label probabilities.", "model_name": "facebook/bart-large-mnli"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "setu4993/LaBSE", "api_call": "BertModel.from_pretrained('setu4993/LaBSE')", "performance": {"dataset": "CommonCrawl and Wikipedia", "accuracy": "Not Specified"}, "description": "Language-agnostic BERT Sentence Encoder (LaBSE) is a BERT-based model trained for sentence embedding for 109 languages. The pre-training process combines masked language modeling with translation language modeling. The model is useful for getting multilingual sentence embeddings and for bi-text retrieval.", "model_name": "setu4993/LaBSE"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "deepset/minilm-uncased-squad2", "api_call": "AutoModelForQuestionAnswering.from_pretrained('deepset/minilm-uncased-squad2')", "performance": {"dataset": "squad_v2", "accuracy": {"exact": 76.1307167523, "f1": 79.4978650022}}, "description": "MiniLM-L12-H384-uncased is a language model fine-tuned for extractive question answering on the SQuAD 2.0 dataset. It is based on the microsoft/MiniLM-L12-H384-uncased model and can be used with the Hugging Face Transformers library.", "model_name": "deepset/minilm-uncased-squad2"}
{"domain": "Multimodal Image-to-Text", "framework": "Transformers", "functionality": "Image Captioning", "api_name": "blip-image-captioning-large", "api_call": "BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-large')", "performance": {"dataset": "COCO", "accuracy": {"image-text retrieval": "+2.7% recall@1", "image captioning": "+2.8% CIDEr", "VQA": "+1.6% VQA score"}}, "description": "BLIP is a Vision-Language Pre-training (VLP) framework that achieves state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval, image captioning, and VQA. It effectively utilizes noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones.", "model_name": "Salesforce/blip-image-captioning-large"}
{"domain": "Reinforcement Learning", "framework": "Unity ML-Agents", "functionality": "Train and play SoccerTwos", "api_name": "Raiden-1001/poca-Soccerv7.1", "api_call": "mlagents-load-from-hf --repo-id='Raiden-1001/poca-Soccerv7.1' --local-dir='./downloads'", "performance": {"dataset": "SoccerTwos", "accuracy": "Not provided"}, "description": "A trained model of a poca agent playing SoccerTwos using the Unity ML-Agents Library.", "model_name": "Raiden-1001/poca-Soccerv7.1"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "wav2vec2-random-tiny-classifier", "api_call": "pipeline('audio-classification', model=Wav2Vec2ForCTC.from_pretrained('anton-l/wav2vec2-random-tiny-classifier'))", "performance": {"dataset": "", "accuracy": ""}, "description": "An audio classification model based on wav2vec2.", "model_name": "anton-l/wav2vec2-random-tiny-classifier"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "vision-encoder-decoder", "api_name": "jinhybr/OCR-DocVQA-Donut", "api_call": "pipeline('document-question-answering', model='jinhybr/OCR-DocVQA-Donut')", "performance": {"dataset": "DocVQA", "accuracy": "Not provided"}, "description": "Donut model fine-tuned on DocVQA. It consists of a vision encoder (Swin Transformer) and a text decoder (BART). Given an image, the encoder first encodes the image into a tensor of embeddings, after which the decoder autoregressively generates text, conditioned on the encoding of the encoder.", "model_name": "jinhybr/OCR-DocVQA-Donut"}
{"domain": "Audio Voice Activity Detection", "framework": "Hugging Face Transformers", "functionality": "Voice Activity Detection", "api_name": "anilbs/segmentation", "api_call": "VoiceActivityDetection(segmentation='anilbs/segmentation')", "performance": {"dataset": [{"name": "AMI Mix-Headset", "accuracy": {"onset": 0.684, "offset": 0.577, "min_duration_on": 0.181, "min_duration_off": 0.037}}, {"name": "DIHARD3", "accuracy": {"onset": 0.767, "offset": 0.377, "min_duration_on": 0.136, "min_duration_off": 0.067}}, {"name": "VoxConverse", "accuracy": {"onset": 0.767, "offset": 0.713, "min_duration_on": 0.182, "min_duration_off": 0.501}}]}, "description": "Model from End-to-end speaker segmentation for overlap-aware resegmentation, by Herv\u00e9 Bredin and Antoine Laurent. Online demo is available as a Hugging Face Space.", "model_name": "anilbs/segmentation"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text2Text Generation", "api_name": "google/flan-t5-xxl", "api_call": "T5ForConditionalGeneration.from_pretrained('google/flan-t5-xxl')", "performance": {"dataset": [{"name": "MMLU", "accuracy": "75.2%"}]}, "description": "FLAN-T5 XXL is a fine-tuned version of the T5 language model, achieving state-of-the-art performance on several benchmarks, such as 75.2% on five-shot MMLU. It has been fine-tuned on more than 1000 additional tasks covering multiple languages, including English, German, and French. It can be used for research on zero-shot NLP tasks and in-context few-shot learning NLP tasks, such as reasoning and question answering.", "model_name": "google/flan-t5-xxl"}
{"domain": "Natural Language Processing Text Generation", "framework": "PyTorch Transformers", "functionality": "Text Generation", "api_name": "decapoda-research/llama-7b-hf", "api_call": "AutoModel.from_pretrained('decapoda-research/llama-7b-hf')", "performance": {"dataset": [{"name": "BoolQ", "accuracy": 76.5}, {"name": "PIQA", "accuracy": 79.8}, {"name": "SIQA", "accuracy": 48.9}, {"name": "HellaSwag", "accuracy": 76.1}, {"name": "WinoGrande", "accuracy": 70.1}, {"name": "ARC-e", "accuracy": 76.7}, {"name": "ARC-c", "accuracy": 47.6}, {"name": "OBQAC", "accuracy": 57.2}, {"name": "COPA", "accuracy": 93}]}, "description": "LLaMA-7B is an auto-regressive language model based on the transformer architecture. It is designed for research on large language models, including question answering, natural language understanding, and reading comprehension. The model is trained on various sources, including CCNet, C4, GitHub, Wikipedia, Books, ArXiv, and Stack Exchange, with the majority of the dataset being in English.", "model_name": "decapoda-research/llama-7b-hf"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "facebook/bart-base", "api_call": "BartModel.from_pretrained('facebook/bart-base')", "performance": {"dataset": "arxiv", "accuracy": "Not provided"}, "description": "BART is a transformer encoder-decoder (seq2seq) model with a bidirectional (BERT-like) encoder and an autoregressive (GPT-like) decoder. BART is pre-trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text. BART is particularly effective when fine-tuned for text generation (e.g. summarization, translation) but also works well for comprehension tasks (e.g. text classification, question answering).", "model_name": "facebook/bart-base"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face", "functionality": "Zero-Shot Image Classification", "api_name": "laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K-augreg", "api_call": "pipeline('image-classification', model='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K-augreg')", "performance": {"dataset": "ImageNet-1k", "accuracy": "70.8-71.7%"}, "description": "A series of CLIP ConvNeXt-Base (w/ wide embed dim) models trained on subsets of LAION-5B using OpenCLIP. The models utilize the timm ConvNeXt-Base model (convnext_base) as the image tower, and the same text tower as the RN50x4 (depth 12, embed dim 640) model from OpenAI CLIP.", "model_name": "laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K-augreg"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "zero-shot-object-detection", "api_name": "google/owlvit-large-patch14", "api_call": "OwlViTForObjectDetection.from_pretrained('google/owlvit-large-patch14')", "performance": {"dataset": "COCO", "accuracy": "Not specified"}, "description": "OWL-ViT is a zero-shot text-conditioned object detection model that can be used to query an image with one or multiple text queries. It uses CLIP as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. OWL-ViT is trained on publicly available image-caption data and fine-tuned on publicly available object detection datasets such as COCO and OpenImages.", "model_name": "google/owlvit-large-patch14"}
{"domain": "Multimodal Text-to-Video", "framework": "Hugging Face", "functionality": "Text-to-Video Synthesis", "api_name": "damo-vilab/text-to-video-ms-1.7b-legacy", "api_call": "DiffusionPipeline.from_pretrained('damo-vilab/text-to-video-ms-1.7b-legacy', torch_dtype=torch.float16)", "performance": {"dataset": ["LAION5B", "ImageNet", "Webvid"], "accuracy": "Not provided"}, "description": "This model is based on a multi-stage text-to-video generation diffusion model, which inputs a description text and returns a video that matches the text description. Only English input is supported.", "model_name": "damo-vilab/text-to-video-ms-1.7b-legacy"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "google/pix2struct-chartqa-base", "api_call": "Pix2StructForConditionalGeneration.from_pretrained('google/pix2struct-chartqa-base')", "performance": {"dataset": "ChartQA", "accuracy": "Not provided"}, "description": "Pix2Struct is an image encoder - text decoder model that is trained on image-text pairs for various tasks, including image captioning and visual question answering. The model is pretrained by learning to parse masked screenshots of web pages into simplified HTML. It can achieve state-of-the-art results in six out of nine tasks across four domains: documents, illustrations, user interfaces, and natural images.", "model_name": "google/pix2struct-chartqa-base"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "layoutlmv3-base-mpdocvqa", "api_call": "LayoutLMv3ForQuestionAnswering.from_pretrained('rubentito/layoutlmv3-base-mpdocvqa')", "performance": {"dataset": "rubentito/mp-docvqa", "accuracy": {"ANLS": 0.45380000000000004, "APPA": 51.9426}}, "description": "This is pretrained LayoutLMv3 from Microsoft hub and fine-tuned on Multipage DocVQA (MP-DocVQA) dataset. This model was used as a baseline in Hierarchical multimodal transformers for Multi-Page DocVQA.", "model_name": "rubentito/layoutlmv3-base-mpdocvqa"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face", "functionality": "Conversational", "api_name": "facebook/blenderbot_small-90M", "api_call": "BlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')", "performance": {"dataset": "blended_skill_talk", "accuracy": "Not provided"}, "description": "Blenderbot is a chatbot model that provides engaging talking points and listens to their partners, both asking and answering questions, and displaying knowledge, empathy, and personality appropriately, depending on the situation.", "model_name": "facebook/blenderbot_small-90M"}
{"domain": "Natural Language Processing Summarization", "framework": "Transformers", "functionality": "text2text-generation", "api_name": "financial-summarization-pegasus", "api_call": "PegasusForConditionalGeneration.from_pretrained('human-centered-summarization/financial-summarization-pegasus')", "performance": {"dataset": "xsum", "accuracy": {"ROUGE-1": 35.206, "ROUGE-2": 16.569, "ROUGE-L": 30.128, "ROUGE-LSUM": 30.171}}, "description": "This model was fine-tuned on a novel financial news dataset, which consists of 2K articles from Bloomberg, on topics such as stock, markets, currencies, rate and cryptocurrencies. It is based on the PEGASUS model and in particular PEGASUS fine-tuned on the Extreme Summarization (XSum) dataset: google/pegasus-xsum model. PEGASUS was originally proposed by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu in PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization.", "model_name": "human-centered-summarization/financial-summarization-pegasus"}
{"domain": "Multimodal Zero-Shot Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Geolocalization", "api_name": "geolocal/StreetCLIP", "api_call": "CLIPModel.from_pretrained('geolocal/StreetCLIP')", "performance": {"dataset": [{"name": "IM2GPS", "accuracy": {"25km": 28.3, "200km": 45.1, "750km": 74.7, "2500km": 88.2}}, {"name": "IM2GPS3K", "accuracy": {"25km": 22.4, "200km": 37.4, "750km": 61.3, "2500km": 80.4}}]}, "description": "StreetCLIP is a robust foundation model for open-domain image geolocalization and other geographic and climate-related tasks. Trained on an original dataset of 1.1 million street-level urban and rural geo-tagged images, it achieves state-of-the-art performance on multiple open-domain image geolocalization benchmarks in zero-shot, outperforming supervised models trained on millions of images.", "model_name": "geolocal/StreetCLIP"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "pygmalion-6b", "api_call": "AutoModelForCausalLM.from_pretrained('pygmalion-6b')", "performance": {"dataset": "56MB of dialogue data", "accuracy": "Not specified"}, "description": "Pygmalion 6B is a proof-of-concept dialogue model based on EleutherAI's GPT-J-6B. The fine-tuning dataset consisted of 56MB of dialogue data gathered from multiple sources, which includes both real and partially machine-generated conversations. The model was initialized from the uft-6b ConvoGPT model and fine-tuned on ~48.5 million tokens for ~5k steps on 4 NVIDIA A40s using DeepSpeed.", "model_name": "pygmalion-6b"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Speech Recognition", "api_name": "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn", "api_call": "Wav2Vec2Model.from_pretrained('jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn')", "performance": {"dataset": "Common Voice zh-CN", "accuracy": {"WER": 82.37, "CER": 19.03}}, "description": "Fine-tuned XLSR-53 large model for speech recognition in Chinese. Fine-tuned facebook/wav2vec2-large-xlsr-53 on Chinese using the train and validation splits of Common Voice 6.1, CSS10 and ST-CMDS.", "model_name": "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-small-finetuned-wtq", "api_call": "TapasForQuestionAnswering.from_pretrained('google/tapas-small-finetuned-wtq'), TapasTokenizer.from_pretrained('google/tapas-small-finetuned-wtq')", "performance": {"dataset": "wikitablequestions", "accuracy": 0.37620000000000003}, "description": "TAPAS small model fine-tuned on WikiTable Questions (WTQ). This model was pre-trained on MLM and an additional step which the authors call intermediate pre-training, and then fine-tuned in a chain on SQA, WikiSQL and finally WTQ. It uses relative position embeddings (i.e. resetting the position index at every cell of the table).", "model_name": "google/tapas-small-finetuned-wtq"}
{"domain": "Audio Audio-to-Audio", "framework": "Fairseq", "functionality": "audio", "api_name": "textless_sm_cs_en", "api_call": "Wav2Vec2Model.from_pretrained(cached_download('https://huggingface.co/facebook/textless_sm_cs_en/resolve/main/model.pt'))", "performance": {"dataset": "", "accuracy": ""}, "description": "A speech-to-speech translation model for converting between languages without using text as an intermediate representation. This model is designed for the task of audio-to-audio translation.", "model_name": "facebook/textless_sm_cs_en"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/gtr-t5-base", "api_call": "SentenceTransformer('sentence-transformers/gtr-t5-base')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "N/A"}, "description": "This is a sentence-transformers model that maps sentences & paragraphs to a 768 dimensional dense vector space. The model was specifically trained for the task of semantic search.", "model_name": "sentence-transformers/gtr-t5-base"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Document Question Answering", "api_name": "dperales/layoutlmv2-base-uncased_finetuned_docvqa", "api_call": "LayoutLMv2ForQuestionAnswering.from_pretrained('dperales/layoutlmv2-base-uncased_finetuned_docvqa')", "performance": {"dataset": "", "accuracy": ""}, "description": "A model for Document Question Answering based on the LayoutLMv2 architecture, fine-tuned on the DocVQA dataset.", "model_name": "dperales/layoutlmv2-base-uncased_finetuned_docvqa"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/trocr-small-stage1", "api_call": "VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-small-stage1')", "performance": {"dataset": "IAM", "accuracy": "Not provided"}, "description": "TrOCR pre-trained only model. It was introduced in the paper TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models by Li et al. and first released in this repository. The TrOCR model is an encoder-decoder model, consisting of an image Transformer as encoder, and a text Transformer as decoder. The image encoder was initialized from the weights of DeiT, while the text decoder was initialized from the weights of UniLM. Images are presented to the model as a sequence of fixed-size patches (resolution 16x16), which are linearly embedded. One also adds absolute position embeddings before feeding the sequence to the layers of the Transformer encoder. Next, the Transformer text decoder autoregressively generates tokens.", "model_name": "microsoft/trocr-small-stage1"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "microsoft/xclip-base-patch16-zero-shot", "api_call": "XClipModel.from_pretrained('microsoft/xclip-base-patch16-zero-shot')", "performance": {"dataset": [{"name": "HMDB-51", "accuracy": 44.6}, {"name": "UCF-101", "accuracy": 72.0}, {"name": "Kinetics-600", "accuracy": 65.2}]}, "description": "X-CLIP is a minimal extension of CLIP for general video-language understanding. The model is trained in a contrastive way on (video, text) pairs. This allows the model to be used for tasks like zero-shot, few-shot or fully supervised video classification and video-text retrieval.", "model_name": "microsoft/xclip-base-patch16-zero-shot"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Document Question Answering", "api_name": "tiennvcs/layoutlmv2-base-uncased-finetuned-vi-infovqa", "api_call": "pipeline('question-answering', model='tiennvcs/layoutlmv2-base-uncased-finetuned-vi-infovqa')", "performance": {"dataset": "unknown", "accuracy": {"Loss": 4.3332}}, "description": "This model is a fine-tuned version of microsoft/layoutlmv2-base-uncased on an unknown dataset.", "model_name": "tiennvcs/layoutlmv2-base-uncased-finetuned-vi-infovqa"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Asteroid", "api_name": "ConvTasNet_Libri2Mix_sepclean_8k", "api_call": "hf_hub_download(repo_id='JorisCos/ConvTasNet_Libri2Mix_sepclean_8k')", "performance": {"dataset": "Libri2Mix", "accuracy": {"si_sdr": 14.7645436345, "si_sdr_imp": 14.7640293756, "sdr": 15.2933797075, "sdr_imp": 15.1141466051, "sir": 24.0929046611, "sir_imp": 23.9136696831, "sar": 16.0605590692, "sar_imp": -51.9807844413, "stoi": 0.9311142441, "stoi_imp": 0.2181737614}}, "description": "This model was trained by Joris Cosentino using the librimix recipe in Asteroid. It was trained on the sep_clean task of the Libri2Mix dataset.", "model_name": "JorisCos/ConvTasNet_Libri2Mix_sepclean_8k"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "keremberke/yolov8m-pcb-defect-segmentation", "api_call": "YOLO('keremberke/yolov8m-pcb-defect-segmentation')", "performance": {"dataset": "pcb-defect-segmentation", "accuracy": {"mAP@0.5(box)": 0.5680000000000001, "mAP@0.5(mask)": 0.557}}, "description": "A YOLOv8 model for PCB defect segmentation trained on the pcb-defect-segmentation dataset. The model can detect and segment defects in PCB images, such as Dry_joint, Incorrect_installation, PCB_damage, and Short_circuit.", "model_name": "keremberke/yolov8m-pcb-defect-segmentation"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Table Extraction", "api_name": "keremberke/yolov8n-table-extraction", "api_call": "YOLO('keremberke/yolov8n-table-extraction')", "performance": {"dataset": "table-extraction", "accuracy": 0.967}, "description": "An object detection model for extracting tables from documents. Supports two label types: 'bordered' and 'borderless'.", "model_name": "keremberke/yolov8n-table-extraction"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221215-112116", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-221215-112116')", "performance": {"dataset": "DIODE", "accuracy": ""}, "description": "A depth estimation model fine-tuned on the DIODE dataset.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221215-112116"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Named Entity Recognition", "api_name": "flair/ner-german", "api_call": "SequenceTagger.load('flair/ner-german')", "performance": {"dataset": "conll2003", "accuracy": "87.94"}, "description": "This is the standard 4-class NER model for German that ships with Flair. It predicts 4 tags: PER (person name), LOC (location name), ORG (organization name), and MISC (other name). The model is based on Flair embeddings and LSTM-CRF.", "model_name": "flair/ner-german"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Text Classification", "api_name": "svalabs/gbert-large-zeroshot-nli", "api_call": "pipeline('zero-shot-classification', model='svalabs/gbert-large-zeroshot-nli')", "performance": {"dataset": "XNLI TEST-Set", "accuracy": "85.6%"}, "description": "A German zeroshot classification model based on the German BERT large model from deepset.ai and finetuned for natural language inference using machine-translated nli sentence pairs from mnli, anli, and snli datasets.", "model_name": "svalabs/gbert-large-zeroshot-nli"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "BaptisteDoyen/camembert-base-xnli", "api_call": "pipeline('zero-shot-classification', model='BaptisteDoyen/camembert-base-xnli')", "performance": {"dataset": "xnli", "accuracy": {"validation": 81.4, "test": 81.7}}, "description": "Camembert-base model fine-tuned on french part of XNLI dataset. One of the few Zero-Shot classification models working on French.", "model_name": "BaptisteDoyen/camembert-base-xnli"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Generate and modify images based on text prompts", "api_name": "stabilityai/stable-diffusion-2-depth", "api_call": "StableDiffusionDepth2ImgPipeline.from_pretrained('stabilityai/stable-diffusion-2-depth', torch_dtype=torch.float16)", "performance": {"dataset": "COCO2017 validation set", "accuracy": "Not optimized for FID scores"}, "description": "Stable Diffusion v2 is a latent diffusion model that generates and modifies images based on text prompts. It uses a fixed, pretrained text encoder (OpenCLIP-ViT/H) and is developed by Robin Rombach and Patrick Esser. The model works with English language prompts and is intended for research purposes only.", "model_name": "stabilityai/stable-diffusion-2-depth"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Question Generation", "api_name": "mrm8488/t5-base-finetuned-question-generation-ap", "api_call": "AutoModelWithLMHead.from_pretrained('mrm8488/t5-base-finetuned-question-generation-ap')", "performance": {"dataset": "SQuAD", "accuracy": "Not provided"}, "description": "Google's T5 model fine-tuned on SQuAD v1.1 for Question Generation by prepending the answer to the context.", "model_name": "mrm8488/t5-base-finetuned-question-generation-ap"}
{"domain": "Natural Language Processing Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Contextual Representation", "api_name": "indobenchmark/indobert-base-p1", "api_call": "AutoModel.from_pretrained('indobenchmark/indobert-base-p1')", "performance": {"dataset": "Indo4B", "accuracy": "23.43 GB of text"}, "description": "IndoBERT is a state-of-the-art language model for Indonesian based on the BERT model. The pretrained model is trained using a masked language modeling (MLM) objective and next sentence prediction (NSP) objective.", "model_name": "indobenchmark/indobert-base-p1"}
{"domain": "Multimodal Text-to-Video", "framework": "Hugging Face", "functionality": "Text-to-video-synthesis", "api_name": "damo-vilab/text-to-video-ms-1.7b", "api_call": "DiffusionPipeline.from_pretrained('damo-vilab/text-to-video-ms-1.7b', torch_dtype=torch.float16, variant='fp16')", "performance": {"dataset": "Webvid", "accuracy": "Not specified"}, "description": "A multi-stage text-to-video generation diffusion model that inputs a description text and returns a video that matches the text description. The model consists of three sub-networks: text feature extraction model, text feature-to-video latent space diffusion model, and video latent space to video visual space model. It supports English input only and has a wide range of applications.", "model_name": "damo-vilab/text-to-video-ms-1.7b"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "google/ncsnpp-ffhq-256", "api_call": "DiffusionPipeline.from_pretrained('google/ncsnpp-ffhq-256')", "performance": {"dataset": "CIFAR-10", "accuracy": {"Inception score": 9.89, "FID": 2.2, "Likelihood": 2.99}}, "description": "Score-Based Generative Modeling through Stochastic Differential Equations (SDE) for unconditional image generation. Achieves record-breaking performance on CIFAR-10 and demonstrates high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model.", "model_name": "google/ncsnpp-ffhq-256"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "text2text-generation", "api_name": "google/bigbird-pegasus-large-bigpatent", "api_call": "BigBirdPegasusForConditionalGeneration.from_pretrained('google/bigbird-pegasus-large-bigpatent')", "performance": {"dataset": "big_patent", "accuracy": "Not provided"}, "description": "BigBird, a sparse-attention based transformer, extends Transformer-based models like BERT to much longer sequences. It can handle sequences up to a length of 4096 at a much lower compute cost compared to BERT. BigBird has achieved state-of-the-art results on various tasks involving very long sequences such as long documents summarization and question-answering with long contexts.", "model_name": "google/bigbird-pegasus-large-bigpatent"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli", "api_call": "pipeline('zero-shot-classification', model='MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli')", "performance": {"dataset": [{"name": "mnli_test_m", "accuracy": 0.912}, {"name": "mnli_test_mm", "accuracy": 0.908}, {"name": "anli_test", "accuracy": 0.7020000000000001}, {"name": "anli_test_r3", "accuracy": 0.64}, {"name": "ling_test", "accuracy": 0.87}, {"name": "wanli_test", "accuracy": 0.77}]}, "description": "This model was fine-tuned on the MultiNLI, Fever-NLI, Adversarial-NLI (ANLI), LingNLI and WANLI datasets, which comprise 885 242 NLI hypothesis-premise pairs. This model is the best performing NLI model on the Hugging Face Hub as of 06.06.22 and can be used for zero-shot classification. It significantly outperforms all other large models on the ANLI benchmark.", "model_name": "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "ToddGoldfarb/Cadet-Tiny", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('ToddGoldfarb/Cadet-Tiny', low_cpu_mem_usage=True)", "performance": {"dataset": "allenai/soda", "accuracy": ""}, "description": "Cadet-Tiny is a very small conversational model trained off of the SODA dataset. Cadet-Tiny is intended for inference at the edge (on something as small as a 2GB RAM Raspberry Pi). Cadet-Tiny is trained off of the t5-small pretrained model from Google and, as a result, is about 2% of the size of the Cosmo-3B model.", "model_name": "ToddGoldfarb/Cadet-Tiny"}
{"domain": "Audio Text-to-Speech", "framework": "SpeechBrain", "functionality": "Text-to-Speech", "api_name": "tts-hifigan-ljspeech", "api_call": "HIFIGAN.from_hparams(source='speechbrain/tts-hifigan-ljspeech', savedir=tmpdir)", "performance": {"dataset": "LJSpeech", "accuracy": "Not specified"}, "description": "This repository provides all the necessary tools for using a HiFIGAN vocoder trained with LJSpeech. The pre-trained model takes a spectrogram as input and produces a waveform as output. Typically, a vocoder is used after a TTS model that converts an input text into a spectrogram. The sampling frequency is 22050 Hz.", "model_name": "speechbrain/tts-hifigan-ljspeech"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "bert-large-uncased-whole-word-masking-finetuned-squad", "api_call": "AutoModel.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')", "performance": {"dataset": "SQuAD", "accuracy": {"f1": 93.15, "exact_match": 86.91}}, "description": "BERT large model (uncased) whole word masking finetuned on SQuAD. The model was pretrained on BookCorpus and English Wikipedia. It was trained with two objectives: Masked language modeling (MLM) and Next sentence prediction (NSP). This model should be used as a question-answering model.", "model_name": "bert-large-uncased-whole-word-masking-finetuned-squad"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-large-finetuned-sqa", "api_call": "TapasForQuestionAnswering.from_pretrained('google/tapas-large-finetuned-sqa')", "performance": {"dataset": "msr_sqa", "accuracy": 0.7289}, "description": "TAPAS large model fine-tuned on Sequential Question Answering (SQA). This model was pre-trained on MLM and an additional step which the authors call intermediate pre-training, and then fine-tuned on SQA. It uses relative position embeddings (i.e. resetting the position index at every cell of the table).", "model_name": "google/tapas-large-finetuned-sqa"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "sd-class-butterflies-32", "api_call": "DDPMPipeline.from_pretrained('clp/sd-class-butterflies-32')", "performance": {"dataset": null, "accuracy": null}, "description": "This model is a diffusion model for unconditional image generation of cute butterflies.", "model_name": "clp/sd-class-butterflies-32"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Transformers", "functionality": "Cross-Encoder for Natural Language Inference", "api_name": "cross-encoder/nli-distilroberta-base", "api_call": "CrossEncoder('cross-encoder/nli-distilroberta-base')", "performance": {"dataset": "SNLI and MultiNLI", "accuracy": "See SBERT.net - Pretrained Cross-Encoder for evaluation results"}, "description": "This model was trained using SentenceTransformers Cross-Encoder class on the SNLI and MultiNLI datasets. For a given sentence pair, it will output three scores corresponding to the labels: contradiction, entailment, neutral.", "model_name": "cross-encoder/nli-distilroberta-base"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/trocr-base-printed", "api_call": "VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed')", "performance": {"dataset": "SROIE", "accuracy": "Not provided"}, "description": "TrOCR model fine-tuned on the SROIE dataset. It was introduced in the paper TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models by Li et al. and first released in this repository. The TrOCR model is an encoder-decoder model, consisting of an image Transformer as encoder, and a text Transformer as decoder. The image encoder was initialized from the weights of BEiT, while the text decoder was initialized from the weights of RoBERTa.", "model_name": "microsoft/trocr-base-printed"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "speech-enhancement", "api_name": "speechbrain/metricgan-plus-voicebank", "api_call": "SpectralMaskEnhancement.from_hparams(source='speechbrain/metricgan-plus-voicebank', savedir='pretrained_models/metricgan-plus-voicebank')", "performance": {"dataset": "Voicebank", "accuracy": {"Test PESQ": "3.15", "Test STOI": "93.0"}}, "description": "MetricGAN-trained model for Enhancement", "model_name": "speechbrain/metricgan-plus-voicebank"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Token Classification", "api_name": "dbmdz/bert-large-cased-finetuned-conll03-english", "api_call": "AutoModelForTokenClassification.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')", "performance": {"dataset": "CoNLL-03", "accuracy": "Not provided"}, "description": "This is a BERT-large-cased model fine-tuned on the CoNLL-03 dataset for token classification tasks.", "model_name": "dbmdz/bert-large-cased-finetuned-conll03-english"}
{"domain": "Audio Audio Classification", "framework": "PyTorch Transformers", "functionality": "Emotion Recognition", "api_name": "superb/wav2vec2-base-superb-er", "api_call": "pipeline('audio-classification', model='superb/wav2vec2-base-superb-er')", "performance": {"dataset": "IEMOCAP", "accuracy": 0.6258}, "description": "This is a ported version of S3PRL's Wav2Vec2 for the SUPERB Emotion Recognition task. The base model is wav2vec2-base, which is pretrained on 16kHz sampled speech audio. When using the model make sure that your speech input is also sampled at 16kHz. For more information refer to SUPERB: Speech processing Universal PERformance Benchmark.", "model_name": "superb/wav2vec2-base-superb-er"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "facebook/timesformer-base-finetuned-k600", "api_call": "TimesformerForVideoClassification.from_pretrained('facebook/timesformer-base-finetuned-k600')", "performance": {"dataset": "Kinetics-600", "accuracy": null}, "description": "TimeSformer model pre-trained on Kinetics-600. It was introduced in the paper TimeSformer: Is Space-Time Attention All You Need for Video Understanding? by Tong et al. and first released in this repository.", "model_name": "facebook/timesformer-base-finetuned-k600"}
{"domain": "Natural Language Processing Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "sup-simcse-roberta-large", "api_call": "AutoModel.from_pretrained('princeton-nlp/sup-simcse-roberta-large')", "performance": {"dataset": "STS tasks", "accuracy": "Spearman's correlation (See associated paper Appendix B)"}, "description": "A pretrained RoBERTa-large model for simple contrastive learning of sentence embeddings. It can be used for feature extraction and has been evaluated on semantic textual similarity (STS) tasks and downstream transfer tasks.", "model_name": "princeton-nlp/sup-simcse-roberta-large"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Image Upscaling", "api_name": "stabilityai/sd-x2-latent-upscaler", "api_call": "StableDiffusionLatentUpscalePipeline.from_pretrained('stabilityai/sd-x2-latent-upscaler', torch_dtype=torch.float16)", "performance": {"dataset": "LAION-2B", "accuracy": "Not specified"}, "description": "Stable Diffusion x2 latent upscaler is a diffusion-based upscaler model developed by Katherine Crowson in collaboration with Stability AI. It is designed to upscale Stable Diffusion's latent denoised image embeddings, allowing for fast text-to-image and upscaling pipelines. The model was trained on a high-resolution subset of the LAION-2B dataset and works with all Stable Diffusion checkpoints.", "model_name": "stabilityai/sd-x2-latent-upscaler"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", "api_call": "pipeline('image-classification', model='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k')", "performance": {"dataset": "ImageNet-1k", "accuracy": "80.1"}, "description": "A CLIP ViT-bigG/14 model trained with the LAION-2B English subset of LAION-5B using OpenCLIP. The model is intended for research purposes and enables researchers to better understand and explore zero-shot, arbitrary image classification. It can be used for interdisciplinary studies of the potential impact of such models. The model achieves a 80.1 zero-shot top-1 accuracy on ImageNet-1k.", "model_name": "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Text Summarization", "api_name": "distilbart-cnn-12-6-samsum", "api_call": "pipeline('summarization', model='philschmid/distilbart-cnn-12-6-samsum')", "performance": {"dataset": "samsum", "accuracy": {"ROUGE-1": 41.09, "ROUGE-2": 20.746, "ROUGE-L": 31.595, "ROUGE-LSUM": 38.339}}, "description": "This model is a DistilBART-based text summarization model trained on the SAMsum dataset. It can be used to generate summaries of conversational text.", "model_name": "philschmid/distilbart-cnn-12-6-samsum"}
{"domain": "Multimodal Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "hubert-large-ll60k", "api_call": "HubertModel.from_pretrained('facebook/hubert-large-ll60k')", "performance": {"dataset": "Libri-Light", "accuracy": "matches or improves upon the state-of-the-art wav2vec 2.0 performance"}, "description": "Hubert-Large is a self-supervised speech representation learning model pretrained on 16kHz sampled speech audio. It is designed to deal with the unique problems in speech representation learning, such as multiple sound units in each input utterance, no lexicon of input sound units during the pre-training phase, and variable lengths of sound units with no explicit segmentation. The model relies on an offline clustering step to provide aligned target labels for a BERT-like prediction loss.", "model_name": "facebook/hubert-large-ll60k"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "deepset/deberta-v3-base-squad2", "api_call": "AutoModelForQuestionAnswering.from_pretrained('deepset/deberta-v3-base-squad2')", "performance": {"dataset": "squad_v2", "accuracy": {"Exact Match": 83.825, "F1": 87.41}}, "description": "This is the deberta-v3-base model, fine-tuned using the SQuAD2.0 dataset. It's been trained on question-answer pairs, including unanswerable questions, for the task of Question Answering.", "model_name": "deepset/deberta-v3-base-squad2"}
{"domain": "Audio Text-to-Speech", "framework": "SpeechBrain", "functionality": "Text-to-Speech", "api_name": "speechbrain/tts-tacotron2-ljspeech", "api_call": "Tacotron2.from_hparams(source='speechbrain/tts-tacotron2-ljspeech')", "performance": {"dataset": "LJSpeech", "accuracy": "Not specified"}, "description": "This repository provides all the necessary tools for Text-to-Speech (TTS) with SpeechBrain using a Tacotron2 pretrained on LJSpeech. The pre-trained model takes a short text as input and produces a spectrogram as output. One can get the final waveform by applying a vocoder (e.g., HiFiGAN) on top of the generated spectrogram.", "model_name": "speechbrain/tts-tacotron2-ljspeech"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "superb/wav2vec2-base-superb-ks", "api_call": "pipeline('audio-classification', model='superb/wav2vec2-base-superb-ks')", "performance": {"dataset": "Speech Commands dataset v1.0", "accuracy": {"s3prl": 0.9623, "transformers": 0.9643}}, "description": "Wav2Vec2-Base for Keyword Spotting (KS) task in the SUPERB benchmark. The base model is pretrained on 16kHz sampled speech audio. The KS task detects preregistered keywords by classifying utterances into a predefined set of words. The model is trained on the Speech Commands dataset v1.0.", "model_name": "superb/wav2vec2-base-superb-ks"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Transformers", "functionality": "Text Generation", "api_name": "google/t5-v1_1-base", "api_call": "pipeline('text2text-generation', model='google/t5-v1_1-base')", "performance": {"dataset": "c4", "accuracy": "Not provided"}, "description": "Google's T5 Version 1.1 is a state-of-the-art text-to-text transformer model that achieves high performance on various NLP tasks such as summarization, question answering, and text classification. It is pre-trained on the Colossal Clean Crawled Corpus (C4) and fine-tuned on downstream tasks.", "model_name": "google/t5-v1_1-base"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221215-092352", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-221215-092352')", "performance": {"dataset": "DIODE", "accuracy": ""}, "description": "A depth estimation model fine-tuned on the DIODE dataset.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221215-092352"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face", "functionality": "Zero-Shot Image Classification", "api_name": "timm/eva02_enormous_patch14_plus_clip_224.laion2b_s9b_b144k", "api_call": "clip.load('timm/eva02_enormous_patch14_plus_clip_224.laion2b_s9b_b144k')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "This model is a zero-shot image classification model based on OpenCLIP. It can be used for classifying images into various categories without any additional training.", "model_name": "timm/eva02_enormous_patch14_plus_clip_224.laion2b_s9b_b144k"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Speech Recognition", "api_name": "jonatasgrosman/wav2vec2-large-xlsr-53-arabic", "api_call": "Wav2Vec2Model.from_pretrained('jonatasgrosman/wav2vec2-large-xlsr-53-arabic')", "performance": {"dataset": "Common Voice ar", "accuracy": {"WER": 39.59, "CER": 18.18}}, "description": "Fine-tuned XLSR-53 large model for speech recognition in Arabic. Fine-tuned facebook/wav2vec2-large-xlsr-53 on Arabic using the train and validation splits of Common Voice 6.1 and Arabic Speech Corpus.", "model_name": "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "results-yelp", "api_call": "AutoTokenizer.from_pretrained('bert-base-uncased')", "performance": {"dataset": "Yelp", "accuracy": 0.9302}, "description": "This model is a fine-tuned version of textattack/bert-base-uncased-yelp-polarity on a filtered and manually reviewed Yelp dataset containing restaurant reviews only. It is intended to perform text classification, specifically sentiment analysis, on text data obtained from restaurant reviews to determine if the particular review is positive or negative.", "model_name": "bert-base-uncased"}
{"domain": "Reinforcement Learning", "framework": "Stable-Baselines3", "functionality": "deep-reinforcement-learning", "api_name": "ppo-BreakoutNoFrameskip-v4", "api_call": "load_from_hub(repo_id='sb3/ppo-BreakoutNoFrameskip-v4',filename='{MODEL FILENAME}.zip',)", "performance": {"dataset": "BreakoutNoFrameskip-v4", "accuracy": "398.00 +/- 16.30"}, "description": "This is a trained model of a PPO agent playing BreakoutNoFrameskip-v4 using the stable-baselines3 library and the RL Zoo. The RL Zoo is a training framework for Stable Baselines3 reinforcement learning agents, with hyperparameter optimization and pre-trained agents included.", "model_name": "sb3/ppo-BreakoutNoFrameskip-v4"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Semantic Segmentation", "api_name": "nvidia/segformer-b2-finetuned-cityscapes-1024-1024", "api_call": "SegformerForSemanticSegmentation.from_pretrained('nvidia/segformer-b2-finetuned-cityscapes-1024-1024')", "performance": {"dataset": "Cityscapes", "accuracy": "Not provided"}, "description": "SegFormer model fine-tuned on CityScapes at resolution 1024x1024. It was introduced in the paper SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers by Xie et al. and first released in this repository.", "model_name": "nvidia/segformer-b2-finetuned-cityscapes-1024-1024"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "timm/vit_large_patch14_clip_224.openai_ft_in12k_in1k", "api_call": "pipeline('image-classification', model='timm/vit_large_patch14_clip_224.openai_ft_in12k_in1k', framework='pt')", "performance": {"dataset": "", "accuracy": ""}, "description": "A ViT-based image classification model pretrained by OpenAI using CLIP, then fine-tuned on ImageNet-12k and ImageNet-1k.", "model_name": "timm/vit_large_patch14_clip_224.openai_ft_in12k_in1k"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "DialoGPT-large", "api_call": "AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-large')", "performance": {"dataset": "Reddit discussion thread", "accuracy": "Comparable to human response quality under a single-turn conversation Turing test"}, "description": "DialoGPT is a SOTA large-scale pretrained dialogue response generation model for multiturn conversations. The human evaluation results indicate that the response generated from DialoGPT is comparable to human response quality under a single-turn conversation Turing test. The model is trained on 147M multi-turn dialogue from Reddit discussion thread.", "model_name": "microsoft/DialoGPT-large"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Transcription and Translation", "api_name": "openai/whisper-medium", "api_call": "WhisperForConditionalGeneration.from_pretrained('openai/whisper-medium')", "performance": {"dataset": [{"name": "LibriSpeech (clean)", "accuracy": 2.9}, {"name": "LibriSpeech (other)", "accuracy": 5.9}, {"name": "Common Voice 11.0", "accuracy": 53.87}]}, "description": "Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without the need for fine-tuning. It is a Transformer-based encoder-decoder model and was trained on either English-only data or multilingual data.", "model_name": "openai/whisper-medium"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Transformers", "functionality": "Text2Text Generation", "api_name": "castorini/doc2query-t5-base-msmarco", "api_call": "T5ForConditionalGeneration.from_pretrained('castorini/doc2query-t5-base-msmarco')", "performance": {"dataset": "MS MARCO", "accuracy": "Not specified"}, "description": "A T5 model trained on the MS MARCO dataset for generating queries from documents.", "model_name": "castorini/doc2query-t5-base-msmarco"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "git-large-r-textcaps", "api_call": "pipeline('text-generation', model='microsoft/git-large-r-textcaps')", "performance": {"dataset": "TextCaps", "accuracy": ""}, "description": "GIT (short for GenerativeImage2Text) model, large-sized version, fine-tuned on TextCaps. It was introduced in the paper GIT: A Generative Image-to-text Transformer for Vision and Language by Wang et al. and first released in this repository. The model is trained using 'teacher forcing' on a lot of (image, text) pairs. The goal for the model is simply to predict the next text token, giving the image tokens and previous text tokens. This allows the model to be used for tasks like image and video captioning, visual question answering (VQA) on images and videos, and even image classification (by simply conditioning the model on the image and asking it to generate a class for it in text).", "model_name": "microsoft/git-large-r-textcaps"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-base-finetuned-wikisql-supervised", "api_call": "TapasForQuestionAnswering.from_pretrained('google/tapas-base-finetuned-wikisql-supervised')", "performance": {"dataset": "wikisql", "accuracy": "Not provided"}, "description": "TAPAS is a BERT-like transformers model pretrained on a large corpus of English data from Wikipedia in a self-supervised fashion. It was pretrained with two objectives: Masked language modeling (MLM) and Intermediate pre-training. Fine-tuning is done by adding a cell selection head and aggregation head on top of the pre-trained model, and then jointly train these randomly initialized classification heads with the base model on SQA and WikiSQL.", "model_name": "google/tapas-base-finetuned-wikisql-supervised"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221122-082237", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-221122-082237')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.3421, "Mae": 0.27, "Rmse": 0.4042, "Abs Rel": 0.3279, "Log Mae": 0.11320000000000001, "Log Rmse": 0.1688, "Delta1": 0.5839, "Delta2": 0.8408, "Delta3": 0.9309000000000001}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset. It is used for depth estimation tasks.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221122-082237"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/table-transformer-detection", "api_call": "TableTransformerDetrModel.from_pretrained('microsoft/table-transformer-detection')", "performance": {"dataset": "PubTables1M", "accuracy": "Not provided"}, "description": "Table Transformer (DETR) model trained on PubTables1M for detecting tables in documents. Introduced in the paper PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents by Smock et al.", "model_name": "microsoft/table-transformer-detection"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "fcakyon/timesformer-large-finetuned-k400", "api_call": "TimesformerForVideoClassification.from_pretrained('fcakyon/timesformer-large-finetuned-k400')", "performance": {"dataset": "Kinetics-400", "accuracy": "Not provided"}, "description": "TimeSformer model pre-trained on Kinetics-400 for video classification into one of the 400 possible Kinetics-400 labels. Introduced in the paper 'TimeSformer: Is Space-Time Attention All You Need for Video Understanding?' by Bertasius et al.", "model_name": "fcakyon/timesformer-large-finetuned-k400"}
{"domain": "Tabular Tabular Regression", "framework": "Hugging Face", "functionality": "Predicting Pokemon HP", "api_name": "julien-c/pokemon-predict-hp", "api_call": "pipeline('regression', model='julien-c/pokemon-predict-hp')", "performance": {"dataset": "julien-c/kaggle-rounakbanik-pokemon", "accuracy": {"mean_absolute_error": 15.909, "model_loss": 647.605}}, "description": "A tabular regression model trained on the julien-c/kaggle-rounakbanik-pokemon dataset to predict the HP of Pokemon.", "model_name": "julien-c/pokemon-predict-hp"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Semantic Segmentation", "api_name": "nvidia/segformer-b0-finetuned-ade-512-512", "api_call": "SegformerForSemanticSegmentation.from_pretrained('nvidia/segformer-b0-finetuned-ade-512-512')", "performance": {"dataset": "ADE20k", "accuracy": "Not provided"}, "description": "SegFormer model fine-tuned on ADE20k at resolution 512x512. It was introduced in the paper SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers by Xie et al. and first released in this repository.", "model_name": "nvidia/segformer-b0-finetuned-ade-512-512"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Image Inpainting", "api_name": "lllyasviel/control_v11p_sd15_inpaint", "api_call": "ControlNetModel.from_pretrained('lllyasviel/control_v11p_sd15_inpaint')", "performance": {"dataset": "Stable Diffusion v1-5", "accuracy": "Not specified"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on inpaint images.", "model_name": "lllyasviel/control_v11p_sd15_inpaint"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "impira/layoutlm-invoices", "api_call": "pipeline('question-answering', model='impira/layoutlm-invoices')", "performance": {"dataset": "proprietary dataset of invoices, SQuAD2.0, and DocVQA", "accuracy": "not provided"}, "description": "This is a fine-tuned version of the multi-modal LayoutLM model for the task of question answering on invoices and other documents. It has been fine-tuned on a proprietary dataset of invoices as well as both SQuAD2.0 and DocVQA for general comprehension. Unlike other QA models, which can only extract consecutive tokens (because they predict the start and end of a sequence), this model can predict longer-range, non-consecutive sequences with an additional classifier head.", "model_name": "impira/layoutlm-invoices"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "facebook/detr-resnet-101", "api_call": "DetrForObjectDetection.from_pretrained('facebook/detr-resnet-101')", "performance": {"dataset": "COCO 2017", "accuracy": "43.5 AP"}, "description": "DEtection TRansformer (DETR) model trained end-to-end on COCO 2017 object detection (118k annotated images). It was introduced in the paper End-to-End Object Detection with Transformers by Carion et al. and first released in this repository.", "model_name": "facebook/detr-resnet-101"}
{"domain": "Multimodal Zero-Shot Image Classification", "framework": "Hugging Face", "functionality": "Zero-Shot Image Classification", "api_name": "microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224", "api_call": "pipeline('zero-shot-image-classification', model='microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224')", "performance": {"dataset": "PMC-15M", "accuracy": "State of the art"}, "description": "BiomedCLIP is a biomedical vision-language foundation model pretrained on PMC-15M, a dataset of 15 million figure-caption pairs extracted from biomedical research articles in PubMed Central, using contrastive learning. It uses PubMedBERT as the text encoder and Vision Transformer as the image encoder, with domain-specific adaptations. It can perform various vision-language processing (VLP) tasks such as cross-modal retrieval, image classification, and visual question answering.", "model_name": "microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary", "api_call": "AutoModelForSequenceClassification.from_pretrained('MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary')", "performance": {"dataset": {"mnli-m-2c": {"accuracy": 0.925}, "mnli-mm-2c": {"accuracy": 0.922}, "fever-nli-2c": {"accuracy": 0.892}, "anli-all-2c": {"accuracy": 0.676}, "anli-r3-2c": {"accuracy": 0.665}, "lingnli-2c": {"accuracy": 0.888}}}, "description": "This model was trained on 782 357 hypothesis-premise pairs from 4 NLI datasets: MultiNLI, Fever-NLI, LingNLI and ANLI. The base model is DeBERTa-v3-xsmall from Microsoft. The v3 variant of DeBERTa substantially outperforms previous versions of the model by including a different pre-training objective.", "model_name": "MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary"}
{"domain": "Audio Text-to-Speech", "framework": "Fairseq", "functionality": "Text-to-Speech", "api_name": "facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_fr_css10", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_fr_css10')", "performance": {"dataset": "covost2", "accuracy": ""}, "description": "A text-to-speech model trained on mtedx, covost2, europarl_st, and voxpopuli datasets for English, French, Spanish, and Italian languages. Licensed under cc-by-nc-4.0.", "model_name": "facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_fr_css10"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Fill-Mask", "api_name": "GroNLP/bert-base-dutch-cased", "api_call": "AutoModel.from_pretrained('GroNLP/bert-base-dutch-cased')", "performance": {"dataset": [{"name": "CoNLL-2002", "accuracy": "90.24"}, {"name": "SoNaR-1", "accuracy": "84.93"}, {"name": "spaCy UD LassySmall", "accuracy": "86.10"}]}, "description": "BERTje is a Dutch pre-trained BERT model developed at the University of Groningen.", "model_name": "GroNLP/bert-base-dutch-cased"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "thatdramebaazguy/roberta-base-squad", "api_call": "pipeline(task='question-answering',model='thatdramebaazguy/roberta-base-squad')", "performance": {"dataset": [{"name": "SQuADv1", "accuracy": {"exact_match": 83.6045, "f1": 91.1709}}, {"name": "MoviesQA", "accuracy": {"exact_match": 51.6494, "f1": 68.2615}}]}, "description": "This is Roberta Base trained to do the SQuAD Task. This makes a QA model capable of answering questions.", "model_name": "thatdramebaazguy/roberta-base-squad"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Automatic Speech Recognition and Speech Translation", "api_name": "openai/whisper-base", "api_call": "WhisperForConditionalGeneration.from_pretrained('openai/whisper-base')", "performance": {"dataset": "LibriSpeech (clean) test set", "accuracy": "5.009 WER"}, "description": "Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalize to many datasets and domains without the need for fine-tuning.", "model_name": "openai/whisper-base"}
{"domain": "Multimodal Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Audio Spectrogram", "api_name": "audio-spectrogram-transformer", "api_call": "ASTModel.from_pretrained('MIT/ast-finetuned-audioset-10-10-0.4593')", "performance": {"dataset": "", "accuracy": ""}, "description": "One custom ast model for testing of HF repos", "model_name": "MIT/ast-finetuned-audioset-10-10-0.4593"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "blip2-flan-t5-xl", "api_call": "Blip2ForConditionalGeneration.from_pretrained('Salesforce/blip2-flan-t5-xl')", "performance": {"dataset": "LAION", "accuracy": "Not provided"}, "description": "BLIP-2 model, leveraging Flan T5-xl (a large language model). It was introduced in the paper BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models by Li et al. and first released in this repository. The goal for the model is to predict the next text token, given the query embeddings and the previous text. This allows the model to be used for tasks like image captioning, visual question answering (VQA), and chat-like conversations by feeding the image and the previous conversation as prompt to the model.", "model_name": "Salesforce/blip2-flan-t5-xl"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221228-072509", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-221228-072509')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.4012, "Mae": 0.403, "Rmse": 0.6173000000000001, "Abs Rel": 0.3487, "Log Mae": 0.1574, "Log Rmse": 0.211, "Delta1": 0.4308, "Delta2": 0.6997, "Delta3": 0.8249000000000001}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221228-072509"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "EimisAnimeDiffusion_1.0v", "api_call": "DiffusionPipeline.from_pretrained('eimiss/EimisAnimeDiffusion_1.0v')", "performance": {"dataset": "Not specified", "accuracy": "Not specified"}, "description": "EimisAnimeDiffusion_1.0v is a text-to-image model trained with high-quality and detailed anime images. It works well on anime and landscape generations and supports a Gradio Web UI.", "model_name": "eimiss/EimisAnimeDiffusion_1.0v"}
{"domain": "Natural Language Processing Translation", "framework": "Transformers", "functionality": "Translation", "api_name": "Helsinki-NLP/opus-mt-fi-en", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-fi-en')", "performance": {"dataset": [{"name": "newsdev2015-enfi-fineng.fin.eng", "accuracy": "BLEU: 25.3, chr-F: 0.536"}, {"name": "newstest2015-enfi-fineng.fin.eng", "accuracy": "BLEU: 26.9, chr-F: 0.547"}, {"name": "newstest2016-enfi-fineng.fin.eng", "accuracy": "BLEU: 29.0, chr-F: 0.571"}, {"name": "newstest2017-enfi-fineng.fin.eng", "accuracy": "BLEU: 32.3, chr-F: 0.594"}, {"name": "newstest2018-enfi-fineng.fin.eng", "accuracy": "BLEU: 23.8, chr-F: 0.517"}, {"name": "newstest2019-fien-fineng.fin.eng", "accuracy": "BLEU: 29.0, chr-F: 0.565"}, {"name": "newstestB2016-enfi-fineng.fin.eng", "accuracy": "BLEU: 24.5, chr-F: 0.527"}, {"name": "newstestB2017-enfi-fineng.fin.eng", "accuracy": "BLEU: 27.4, chr-F: 0.557"}, {"name": "newstestB2017-fien-fineng.fin.eng", "accuracy": "BLEU: 27.4, chr-F: 0.557"}, {"name": "Tatoeba-test.fin.eng", "accuracy": "BLEU: 53.4, chr-F: 0.697"}]}, "description": "Helsinki-NLP/opus-mt-fi-en is a machine translation model for translating Finnish text to English text. It is trained on the OPUS dataset and can be used with the Hugging Face Transformers library.", "model_name": "Helsinki-NLP/opus-mt-fi-en"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "abhishek/autotrain-dog-vs-food", "api_call": "pipeline('image-classification', model='abhishek/autotrain-dog-vs-food')", "performance": {"dataset": "sasha/dog-food", "accuracy": 0.998}, "description": "A pre-trained model for classifying images as either dog or food using Hugging Face's AutoTrain framework.", "model_name": "abhishek/autotrain-dog-vs-food"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Conversational", "api_name": "microsoft/GODEL-v1_1-large-seq2seq", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('microsoft/GODEL-v1_1-large-seq2seq')", "performance": {"dataset": "Reddit discussion thread, instruction and knowledge grounded dialogs", "accuracy": "Not provided"}, "description": "GODEL is a large-scale pre-trained model for goal-directed dialogs. It is parameterized with a Transformer-based encoder-decoder model and trained for response generation grounded in external text, which allows more effective fine-tuning on dialog tasks that require conditioning the response on information that is external to the current conversation (e.g., a retrieved document). The pre-trained model can be efficiently fine-tuned and adapted to accomplish a new dialog task with a handful of task-specific dialogs. The v1.1 model is trained on 551M multi-turn dialogs from Reddit discussion thread, and 5M instruction and knowledge grounded dialogs.", "model_name": "microsoft/GODEL-v1_1-large-seq2seq"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "nvidia/segformer-b0-finetuned-cityscapes-1024-1024", "api_call": "SegformerForSemanticSegmentation.from_pretrained('nvidia/segformer-b0-finetuned-cityscapes-1024-1024')", "performance": {"dataset": "CityScapes", "accuracy": "Not provided"}, "description": "SegFormer model fine-tuned on CityScapes at resolution 1024x1024. It was introduced in the paper SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers by Xie et al. and first released in this repository.", "model_name": "nvidia/segformer-b0-finetuned-cityscapes-1024-1024"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "lysandre/tiny-vit-random", "api_call": "ViTForImageClassification.from_pretrained('lysandre/tiny-vit-random')", "performance": {"dataset": "", "accuracy": ""}, "description": "A tiny-vit-random model for image classification using Hugging Face Transformers.", "model_name": "lysandre/tiny-vit-random"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "promptcap-coco-vqa", "api_call": "PromptCap('vqascore/promptcap-coco-vqa')", "performance": {"dataset": {"coco": {"accuracy": "150 CIDEr"}, "OK-VQA": {"accuracy": "60.4%"}, "A-OKVQA": {"accuracy": "59.6%"}}}, "description": "PromptCap is a captioning model that can be controlled by natural language instruction. The instruction may contain a question that the user is interested in. It achieves SOTA performance on COCO captioning (150 CIDEr) and knowledge-based VQA tasks when paired with GPT-3 (60.4% on OK-VQA and 59.6% on A-OKVQA).", "model_name": "vqascore/promptcap-coco-vqa"}
{"domain": "Audio Text-to-Speech", "framework": "ESPnet", "functionality": "Text-to-Speech", "api_name": "kan-bayashi_csmsc_tts_train_tacotron2_raw_phn_pypinyin_g2p_phone_train.loss.best", "api_call": "Text2Speech.from_pretrained('espnet/kan-bayashi_csmsc_tts_train_tacotron2_raw_phn_pypinyin_g2p_phone_train.loss.best')", "performance": {"dataset": "csmsc", "accuracy": "Not specified"}, "description": "A pre-trained Text-to-Speech model for Chinese language using ESPnet framework. It can be used to convert text input into speech output in Chinese.", "model_name": "espnet/kan-bayashi_csmsc_tts_train_tacotron2_raw_phn_pypinyin_g2p_phone_train.loss.best"}
{"domain": "Audio Text-to-Speech", "framework": "ONNX", "functionality": "Text-to-Speech", "api_name": "NeuML/ljspeech-jets-onnx", "api_call": "TextToSpeech('NeuML/ljspeech-jets-onnx')", "performance": {"dataset": "ljspeech", "accuracy": null}, "description": "ESPnet JETS Text-to-Speech (TTS) Model for ONNX exported using the espnet_onnx library. Can be used with txtai pipeline or directly with ONNX.", "model_name": "NeuML/ljspeech-jets-onnx"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "martin-ha/toxic-comment-model", "api_call": "pipeline(model='martin-ha/toxic-comment-model')", "performance": {"dataset": "held-out test set", "accuracy": 0.9400000000000001, "f1-score": 0.59}, "description": "This model is a fine-tuned version of the DistilBERT model to classify toxic comments.", "model_name": "martin-ha/toxic-comment-model"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "zero-shot-object-detection", "api_name": "google/owlvit-base-patch16", "api_call": "OwlViTForObjectDetection.from_pretrained('google/owlvit-base-patch16')", "performance": {"dataset": "COCO", "accuracy": "Not provided"}, "description": "OWL-ViT is a zero-shot text-conditioned object detection model that can be used to query an image with one or multiple text queries. OWL-ViT uses CLIP as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features.", "model_name": "google/owlvit-base-patch16"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Transformers", "functionality": "Text Classification", "api_name": "typeform/distilbert-base-uncased-mnli", "api_call": "AutoModelForSequenceClassification.from_pretrained('typeform/distilbert-base-uncased-mnli')", "performance": {"dataset": "multi_nli", "accuracy": 0.8206875509}, "description": "This is the uncased DistilBERT model fine-tuned on Multi-Genre Natural Language Inference (MNLI) dataset for the zero-shot classification task.", "model_name": "typeform/distilbert-base-uncased-mnli"}
{"domain": "Natural Language Processing Text Generation", "framework": "Transformers", "functionality": "Conversational", "api_name": "microsoft/DialoGPT-large", "api_call": "AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-large')", "performance": {"dataset": "Reddit discussion thread", "accuracy": "Comparable to human response quality under a single-turn conversation Turing test"}, "description": "DialoGPT is a state-of-the-art large-scale pretrained dialogue response generation model for multi-turn conversations. The model is trained on 147M multi-turn dialogues from Reddit discussion threads.", "model_name": "microsoft/DialoGPT-large"}
{"domain": "Audio Text-to-Speech", "framework": "Fairseq", "functionality": "Text-to-Speech", "api_name": "tts_transformer-ar-cv7", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/tts_transformer-ar-cv7')", "performance": {"dataset": "common_voice", "accuracy": "Not specified"}, "description": "Transformer text-to-speech model for Arabic language with a single-speaker male voice, trained on Common Voice v7 dataset.", "model_name": "facebook/tts_transformer-ar-cv7"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "opus-mt-de-en", "api_call": "translation_pipeline('translation_de_to_en', model='Helsinki-NLP/opus-mt-de-en')", "performance": {"dataset": "opus", "accuracy": {"newssyscomb2009.de.en": 29.4, "news-test2008.de.en": 27.8, "newstest2009.de.en": 26.8, "newstest2010.de.en": 30.2, "newstest2011.de.en": 27.4, "newstest2012.de.en": 29.1, "newstest2013.de.en": 32.1, "newstest2014-deen.de.en": 34.0, "newstest2015-ende.de.en": 34.2, "newstest2016-ende.de.en": 40.4, "newstest2017-ende.de.en": 35.7, "newstest2018-ende.de.en": 43.7, "newstest2019-deen.de.en": 40.1, "Tatoeba.de.en": 55.4}}, "description": "A German to English translation model trained on the OPUS dataset using the Hugging Face Transformers library.", "model_name": "Helsinki-NLP/opus-mt-de-en"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Language model", "api_name": "google/flan-t5-small", "api_call": "T5ForConditionalGeneration.from_pretrained('google/flan-t5-small')", "performance": {"dataset": [{"name": "MMLU", "accuracy": "75.2%"}]}, "description": "FLAN-T5 small is a fine-tuned version of T5 language model on more than 1000 additional tasks covering multiple languages. It achieves state-of-the-art performance on several benchmarks, such as 75.2% on five-shot MMLU. The model is designed for research on language models, including zero-shot and few-shot NLP tasks, reasoning, question answering, fairness, and safety research. It has not been tested in real-world applications and should not be used directly in any application without prior assessment of safety and fairness concerns specific to the application.", "model_name": "google/flan-t5-small"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "microsoft/swinv2-tiny-patch4-window8-256", "api_call": "AutoModelForImageClassification.from_pretrained('microsoft/swinv2-tiny-patch4-window8-256')", "performance": {"dataset": "imagenet-1k", "accuracy": "Not provided"}, "description": "Swin Transformer v2 model pre-trained on ImageNet-1k at resolution 256x256. It was introduced in the paper Swin Transformer V2: Scaling Up Capacity and Resolution by Liu et al. and first released in this repository. The Swin Transformer is a type of Vision Transformer. It builds hierarchical feature maps by merging image patches in deeper layers and has linear computation complexity to input image size due to computation of self-attention only within each local window. Swin Transformer v2 adds 3 main improvements: 1) a residual-post-norm method combined with cosine attention to improve training stability; 2) a log-spaced continuous position bias method to effectively transfer models pre-trained using low-resolution images to downstream tasks with high-resolution inputs; 3) a self-supervised pre-training method, SimMIM, to reduce the needs of vast labeled images.", "model_name": "microsoft/swinv2-tiny-patch4-window8-256"}
{"domain": "Tabular Tabular Classification", "framework": "Scikit-learn", "functionality": "Wine Quality classification", "api_name": "osanseviero/wine-quality", "api_call": "joblib.load(cached_download(hf_hub_url('julien-c/wine-quality', 'sklearn_model.joblib')))", "performance": {"dataset": "winequality-red.csv", "accuracy": 0.6616635397}, "description": "A Simple Example of Scikit-learn Pipeline for Wine Quality classification. Inspired by https://towardsdatascience.com/a-simple-example-of-pipeline-in-machine-learning-with-scikit-learn-e726ffbb6976 by Saptashwa Bhattacharyya.", "model_name": "julien-c/wine-quality"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "google/ddpm-cat-256", "api_call": "DDPMPipeline.from_pretrained('google/ddpm-cat-256')", "performance": {"dataset": "CIFAR10", "accuracy": {"Inception_score": 9.46, "FID_score": 3.17}}, "description": "Denoising Diffusion Probabilistic Models (DDPM) is a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. It can generate high-quality images using discrete noise schedulers such as scheduling_ddpm, scheduling_ddim, and scheduling_pndm. The model is trained on the unconditional CIFAR10 dataset and 256x256 LSUN, obtaining an Inception score of 9.46 and a state-of-the-art FID score of 3.17.", "model_name": "google/ddpm-cat-256"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "ControlNet - M-LSD Straight Line Version", "api_name": "lllyasviel/sd-controlnet-mlsd", "api_call": "ControlNetModel.from_pretrained('lllyasviel/sd-controlnet-mlsd')", "performance": {"dataset": "600k edge-image, caption pairs generated from Places2", "accuracy": "Not specified"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on M-LSD straight line detection. It can be used in combination with Stable Diffusion.", "model_name": "lllyasviel/sd-controlnet-mlsd"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7", "api_call": "AutoModelForSequenceClassification.from_pretrained('MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary')", "performance": {"dataset": [{"name": "MultiNLI-matched", "accuracy": 0.857}, {"name": "MultiNLI-mismatched", "accuracy": 0.856}, {"name": "ANLI-all", "accuracy": 0.537}, {"name": "ANLI-r3", "accuracy": 0.497}, {"name": "WANLI", "accuracy": 0.732}, {"name": "LingNLI", "accuracy": 0.788}, {"name": "fever-nli", "accuracy": 0.761}]}, "description": "This multilingual model can perform natural language inference (NLI) on 100 languages and is therefore also suitable for multilingual zero-shot classification. The underlying mDeBERTa-v3-base model was pre-trained by Microsoft on the CC100 multilingual dataset with 100 languages. The model was then fine-tuned on the XNLI dataset and on the multilingual-NLI-26lang-2mil7 dataset. Both datasets contain more than 2.7 million hypothesis-premise pairs in 27 languages spoken by more than 4 billion people.", "model_name": "MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary"}
{"domain": "Reinforcement Learning", "framework": "Stable-Baselines3", "functionality": "LunarLander-v2", "api_name": "araffin/ppo-LunarLander-v2", "api_call": "PPO.load_from_hub('araffin/ppo-LunarLander-v2', 'ppo-LunarLander-v2.zip')", "performance": {"dataset": "LunarLander-v2", "accuracy": "283.49 +/- 13.74"}, "description": "This is a trained model of a PPO agent playing LunarLander-v2 using the stable-baselines3 library.", "model_name": "araffin/ppo-LunarLander-v2"}
{"domain": "Natural Language Processing Token Classification", "framework": "Transformers", "functionality": "Named Entity Recognition", "api_name": "distilbert-base-multilingual-cased-ner-hrl", "api_call": "AutoModelForTokenClassification.from_pretrained('Davlan/distilbert-base-multilingual-cased-ner-hrl')", "performance": {"dataset": [{"name": "ANERcorp", "language": "Arabic"}, {"name": "conll 2003", "language": "German"}, {"name": "conll 2003", "language": "English"}, {"name": "conll 2002", "language": "Spanish"}, {"name": "Europeana Newspapers", "language": "French"}, {"name": "Italian I-CAB", "language": "Italian"}, {"name": "Latvian NER", "language": "Latvian"}, {"name": "conll 2002", "language": "Dutch"}, {"name": "Paramopama + Second Harem", "language": "Portuguese"}, {"name": "MSRA", "language": "Chinese"}], "accuracy": "Not specified"}, "description": "distilbert-base-multilingual-cased-ner-hrl is a Named Entity Recognition model for 10 high resourced languages (Arabic, German, English, Spanish, French, Italian, Latvian, Dutch, Portuguese and Chinese) based on a fine-tuned Distilled BERT base model. It has been trained to recognize three types of entities: location (LOC), organizations (ORG), and person (PER).", "model_name": "Davlan/distilbert-base-multilingual-cased-ner-hrl"}
{"domain": "Audio Audio-to-Audio", "framework": "Fairseq", "functionality": "speech-to-speech-translation", "api_name": "xm_transformer_unity_en-hk", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/xm_transformer_unity_en-hk')", "performance": {"dataset": "MuST-C", "accuracy": null}, "description": "Speech-to-speech translation model with two-pass decoder (UnitY) from fairseq: English-Hokkien. Trained with supervised data in TED domain, and weakly supervised data in TED and Audiobook domain.", "model_name": "facebook/xm_transformer_unity_en-hk"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "openai/clip-vit-large-patch14-336", "api_call": "CLIPModel.from_pretrained('openai/clip-vit-large-patch14')", "performance": {"dataset": "unknown", "accuracy": "N/A"}, "description": "This model was trained from scratch on an unknown dataset.", "model_name": "openai/clip-vit-large-patch14"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "videomae-large", "api_call": "VideoMAEForPreTraining.from_pretrained('MCG-NJU/videomae-large')", "performance": {"dataset": "Kinetics-400", "accuracy": "Not provided"}, "description": "VideoMAE is an extension of Masked Autoencoders (MAE) to video. The architecture of the model is very similar to that of a standard Vision Transformer (ViT), with a decoder on top for predicting pixel values for masked patches. Videos are presented to the model as a sequence of fixed-size patches (resolution 16x16), which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds fixed sinus/cosinus position embeddings before feeding the sequence to the layers of the Transformer encoder. By pre-training the model, it learns an inner representation of videos that can then be used to extract features useful for downstream tasks.", "model_name": "MCG-NJU/videomae-large"}
{"domain": "Multimodal Text-to-Video", "framework": "Hugging Face", "functionality": "Text-to-video synthesis", "api_name": "damo-vilab/text-to-video-ms-1.7b", "api_call": "DiffusionPipeline.from_pretrained('damo-vilab/text-to-video-ms-1.7b', torch_dtype=torch.float16, variant='fp16')", "performance": {"dataset": "Webvid, ImageNet, LAION5B", "accuracy": "N/A"}, "description": "This model is based on a multi-stage text-to-video generation diffusion model, which inputs a description text and returns a video that matches the text description. The model consists of three sub-networks: text feature extraction model, text feature-to-video latent space diffusion model, and video latent space to video visual space model. The overall model parameters are about 1.7 billion. Currently, it only supports English input.", "model_name": "damo-vilab/text-to-video-ms-1.7b"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Information Retrieval", "api_name": "cross-encoder/ms-marco-MiniLM-L-12-v2", "api_call": "AutoModelForSequenceClassification.from_pretrained('cross-encoder/ms-marco-MiniLM-L-12-v2')", "performance": {"dataset": {"TREC Deep Learning 2019": {"NDCG@10": 74.31}, "MS Marco Passage Reranking": {"MRR@10": 39.02, "accuracy": "960 Docs / Sec"}}}, "description": "This model was trained on the MS Marco Passage Ranking task. The model can be used for Information Retrieval: Given a query, encode the query with all possible passages (e.g. retrieved with ElasticSearch). Then sort the passages in a decreasing order. See SBERT.net Retrieve & Re-rank for more details. The training code is available here: SBERT.net Training MS Marco", "model_name": "cross-encoder/ms-marco-MiniLM-L-12-v2"}
{"domain": "Audio Text-to-Speech", "framework": "Fairseq", "functionality": "Speech-to-speech translation", "api_name": "facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur')", "performance": {"dataset": "covost2", "accuracy": null}, "description": "Speech-to-speech translation model from fairseq S2UT (paper/code) for Spanish-English. Trained on mTEDx, CoVoST 2, Europarl-ST, and VoxPopuli.", "model_name": "facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Speech Recognition", "api_name": "jonatasgrosman/wav2vec2-large-xlsr-53-russian", "api_call": "SpeechRecognitionModel('jonatasgrosman/wav2vec2-large-xlsr-53-russian')", "performance": {"dataset": "mozilla-foundation/common_voice_6_0", "accuracy": {"Test WER": 13.3, "Test CER": 2.88, "Test WER (+LM)": 9.57, "Test CER (+LM)": 2.24}}, "description": "Fine-tuned XLSR-53 large model for speech recognition in Russian. Fine-tuned facebook/wav2vec2-large-xlsr-53 on Russian using the train and validation splits of Common Voice 6.1 and CSS10.", "model_name": "jonatasgrosman/wav2vec2-large-xlsr-53-russian"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "videomae-small-finetuned-ssv2", "api_call": "VideoMAEForVideoClassification.from_pretrained('MCG-NJU/videomae-small-finetuned-ssv2')", "performance": {"dataset": "Something-Something V2", "accuracy": {"top-1": 66.8, "top-5": 90.3}}, "description": "VideoMAE is an extension of Masked Autoencoders (MAE) to video. The architecture of the model is very similar to that of a standard Vision Transformer (ViT), with a decoder on top for predicting pixel values for masked patches. Videos are presented to the model as a sequence of fixed-size patches (resolution 16x16), which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds fixed sinus/cosinus position embeddings before feeding the sequence to the layers of the Transformer encoder. By pre-training the model, it learns an inner representation of videos that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled videos for instance, you can train a standard classifier by placing a linear layer on top of the pre-trained encoder. One typically places a linear layer on top of the [CLS] token, as the last hidden state of this token can be seen as a representation of an entire video.", "model_name": "MCG-NJU/videomae-small-finetuned-ssv2"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "bert-large-cased-whole-word-masking-finetuned-squad", "api_call": "AutoModel.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')", "performance": {"dataset": [{"name": "BookCorpus", "accuracy": "N/A"}, {"name": "English Wikipedia", "accuracy": "N/A"}]}, "description": "BERT large model (cased) whole word masking finetuned on SQuAD. This model is cased and trained with a new technique: Whole Word Masking. After pre-training, this model was fine-tuned on the SQuAD dataset.", "model_name": "bert-large-cased-whole-word-masking-finetuned-squad"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Speech Recognition", "api_name": "jonatasgrosman/wav2vec2-large-xlsr-53-english", "api_call": "Wav2Vec2Model.from_pretrained('jonatasgrosman/wav2vec2-large-xlsr-53-english')", "performance": {"dataset": "mozilla-foundation/common_voice_6_0", "accuracy": {"Test WER": 19.06, "Test CER": 7.69, "Test WER (+LM)": 14.81, "Test CER (+LM)": 6.84}}, "description": "Fine-tuned facebook/wav2vec2-large-xlsr-53 on English using the train and validation splits of Common Voice 6.1. When using this model, make sure that your speech input is sampled at 16kHz.", "model_name": "jonatasgrosman/wav2vec2-large-xlsr-53-english"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Entity Extraction", "api_name": "903429548", "api_call": "AutoModelForTokenClassification.from_pretrained('ismail-lucifer011/autotrain-company_all-903429548', use_auth_token=True)", "performance": {"dataset": "ismail-lucifer011/autotrain-data-company_all", "accuracy": 0.9979930567}, "description": "A token classification model trained using AutoTrain for entity extraction. The model is based on the distilbert architecture and trained on the ismail-lucifer011/autotrain-data-company_all dataset. It can be used to identify and extract company names from text.", "model_name": "ismail-lucifer011/autotrain-company_all-903429548"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/git-large-vqav2", "api_call": "AutoModel.from_pretrained('microsoft/git-large-vqav2')", "performance": {"dataset": "VQAv2", "accuracy": "Refer to the paper"}, "description": "GIT (short for GenerativeImage2Text) model, large-sized version, fine-tuned on VQAv2. It was introduced in the paper GIT: A Generative Image-to-text Transformer for Vision and Language by Wang et al. and first released in this repository. The model is a Transformer decoder conditioned on both CLIP image tokens and text tokens. It can be used for tasks like image and video captioning, visual question answering (VQA) on images and videos, and even image classification.", "model_name": "microsoft/git-large-vqav2"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/trocr-large-printed", "api_call": "VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-printed')", "performance": {"dataset": "SROIE", "accuracy": "Not provided"}, "description": "TrOCR model fine-tuned on the SROIE dataset. It was introduced in the paper TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models by Li et al. and first released in this repository. The TrOCR model is an encoder-decoder model, consisting of an image Transformer as encoder, and a text Transformer as decoder. The image encoder was initialized from the weights of BEiT, while the text decoder was initialized from the weights of RoBERTa.", "model_name": "microsoft/trocr-large-printed"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Masked Language Modeling", "api_name": "albert-base-v2", "api_call": "pipeline('fill-mask', model='albert-base-v2')", "performance": {"dataset": {"SQuAD1.1": "90.2/83.2", "SQuAD2.0": "82.1/79.3", "MNLI": "84.6", "SST-2": "92.9", "RACE": "66.8"}, "accuracy": "82.3"}, "description": "ALBERT Base v2 is a transformers model pretrained on a large corpus of English data in a self-supervised fashion using a masked language modeling (MLM) objective. It was introduced in this paper and first released in this repository. This model, as all ALBERT models, is uncased: it does not make a difference between english and English.", "model_name": "albert-base-v2"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "nvidia/mit-b0", "api_call": "SegformerForImageClassification.from_pretrained('nvidia/mit-b0')", "performance": {"dataset": "imagenet_1k", "accuracy": "Not provided"}, "description": "SegFormer encoder fine-tuned on Imagenet-1k. It was introduced in the paper SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers by Xie et al. and first released in this repository. SegFormer consists of a hierarchical Transformer encoder and a lightweight all-MLP decode head to achieve great results on semantic segmentation benchmarks such as ADE20K and Cityscapes.", "model_name": "nvidia/mit-b0"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Fill-Mask", "api_name": "distilroberta-base", "api_call": "pipeline('fill-mask', model='distilroberta-base')", "performance": {"dataset": "openwebtext", "accuracy": "Not provided"}, "description": "DistilRoBERTa is a distilled version of the RoBERTa-base model, designed to be smaller, faster, and lighter. It is a Transformer-based language model trained on the OpenWebTextCorpus, which is a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimensions, and 12 heads, totaling 82M parameters. It is primarily intended for fine-tuning on downstream tasks such as sequence classification, token classification, or question answering.", "model_name": "distilroberta-base"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Translation, Summarization, Question Answering, Sentiment Analysis, Regression", "api_name": "t5-large", "api_call": "T5Model.from_pretrained('t5-large')", "performance": {"dataset": "c4", "accuracy": "See research paper, Table 14"}, "description": "T5-Large is a Text-To-Text Transfer Transformer (T5) model with 770 million parameters. It is designed to handle a variety of NLP tasks, including translation, summarization, question answering, sentiment analysis, and regression. The model is pre-trained on the Colossal Clean Crawled Corpus (C4) and fine-tuned on various supervised and unsupervised tasks.", "model_name": "t5-large"}
{"domain": "Multimodal Graph Machine Learning", "framework": "Hugging Face Transformers", "functionality": "GTA5 AI model", "api_name": "GTA5_PROCESS_LEARNING_AI", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('janpase97/codeformer-pretrained')", "performance": {"dataset": "MNIST", "accuracy": "Not specified"}, "description": "This AI model is designed to train on the MNIST dataset with a specified data cap and save the trained model as an .onnx file. It can be attached to the GTA5 game process by PID and checks if the targeted application is running. The model is trained on a GPU if available.", "model_name": "janpase97/codeformer-pretrained"}
{"domain": "Audio Voice Activity Detection", "framework": "Hugging Face Transformers", "functionality": "Voice Activity Detection, Speech-to-Noise Ratio, and C50 Room Acoustics Estimation", "api_name": "pyannote/brouhaha", "api_call": "Model.from_pretrained('pyannote/brouhaha', use_auth_token='ACCESS_TOKEN_GOES_HERE')", "performance": {"dataset": "LibriSpeech, AudioSet, EchoThief, MIT-Acoustical-Reverberation-Scene", "accuracy": "Not provided"}, "description": "Brouhaha is a joint voice activity detection, speech-to-noise ratio, and C50 room acoustics estimation model. It is based on the PyTorch framework and uses the pyannote.audio library.", "model_name": "pyannote/brouhaha"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "tuner007/pegasus_paraphrase", "api_call": "PegasusForConditionalGeneration.from_pretrained('tuner007/pegasus_paraphrase')", "performance": {"dataset": "unknown", "accuracy": "unknown"}, "description": "PEGASUS fine-tuned for paraphrasing", "model_name": "tuner007/pegasus_paraphrase"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Speech Enhancement", "api_name": "speechbrain/sepformer-wham16k-enhancement", "api_call": "separator.from_hparams(source='speechbrain/sepformer-wham16k-enhancement', savedir='pretrained_models/sepformer-wham16k-enhancement')", "performance": {"dataset": "WHAM!", "accuracy": {"Test-Set SI-SNR": "14.3 dB", "Test-Set PESQ": "2.20"}}, "description": "This repository provides all the necessary tools to perform speech enhancement (denoising) with a SepFormer model, implemented with SpeechBrain, and pretrained on WHAM! dataset with 16k sampling frequency, which is basically a version of WSJ0-Mix dataset with environmental noise and reverberation in 8k.", "model_name": "speechbrain/sepformer-wham16k-enhancement"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table-based QA", "api_name": "neulab/omnitab-large", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('neulab/omnitab-large')", "performance": {"dataset": "wikitablequestions", "accuracy": null}, "description": "OmniTab is a table-based QA model proposed in OmniTab: Pretraining with Natural and Synthetic Data for Few-shot Table-based Question Answering. neulab/omnitab-large (based on BART architecture) is initialized with microsoft/tapex-large and continuously pretrained on natural and synthetic data.", "model_name": "neulab/omnitab-large"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "google/vit-base-patch16-224-in21k", "api_call": "ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')", "performance": {"dataset": "ImageNet-21k", "accuracy": "Refer to tables 2 and 5 of the original paper"}, "description": "The Vision Transformer (ViT) is a transformer encoder model (BERT-like) pretrained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224. It was introduced in the paper An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale by Dosovitskiy et al. and first released in this repository. However, the weights were converted from the timm repository by Ross Wightman, who already converted the weights from JAX to PyTorch. Credits go to him.", "model_name": "google/vit-base-patch16-224-in21k"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Masked Language Modeling", "api_name": "bert-base-cased", "api_call": "pipeline('fill-mask', model='bert-base-cased')", "performance": {"dataset": "GLUE", "accuracy": 79.6}, "description": "BERT base model (cased) is a pre-trained transformer model on English language using a masked language modeling (MLM) objective. It was introduced in a paper and first released in a repository. This model is case-sensitive, which means it can differentiate between 'english' and 'English'. The model can be used for masked language modeling or next sentence prediction, but it's mainly intended to be fine-tuned on a downstream task.", "model_name": "bert-base-cased"}
{"domain": "Natural Language Processing Token Classification", "framework": "Transformers", "functionality": "Named Entity Recognition", "api_name": "dslim/bert-large-NER", "api_call": "AutoModelForTokenClassification.from_pretrained('dslim/bert-large-NER')", "performance": {"dataset": "conll2003", "accuracy": {"f1": 0.92, "precision": 0.92, "recall": 0.919}}, "description": "bert-large-NER is a fine-tuned BERT model that is ready to use for Named Entity Recognition and achieves state-of-the-art performance for the NER task. It has been trained to recognize four types of entities: location (LOC), organizations (ORG), person (PER) and Miscellaneous (MISC).", "model_name": "dslim/bert-large-NER"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221116-110652", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-221116-110652')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.40180000000000005, "Mae": 0.3272, "Rmse": 0.4546, "Abs Rel": 0.3934, "Log Mae": 0.138, "Log Rmse": 0.1907, "Delta1": 0.45980000000000004, "Delta2": 0.7659, "Delta3": 0.9082}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset. It is used for depth estimation tasks.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221116-110652"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Information Retrieval", "api_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", "api_call": "AutoModelForSequenceClassification.from_pretrained('cross-encoder/ms-marco-MiniLM-L-6-v2')", "performance": {"dataset": "MS Marco Passage Reranking", "accuracy": "MRR@10: 39.01%"}, "description": "This model was trained on the MS Marco Passage Ranking task and can be used for Information Retrieval. Given a query, encode the query with all possible passages, then sort the passages in a decreasing order.", "model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Summarization", "api_name": "mrm8488/t5-base-finetuned-summarize-news", "api_call": "AutoModelWithLMHead.from_pretrained('mrm8488/t5-base-finetuned-summarize-news')", "performance": {"dataset": "News Summary", "accuracy": "Not provided"}, "description": "Google's T5 base fine-tuned on News Summary dataset for summarization downstream task. The dataset consists of 4515 examples and contains Author_name, Headlines, Url of Article, Short text, Complete Article. Time period ranges from February to August 2017.", "model_name": "mrm8488/t5-base-finetuned-summarize-news"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Paraphrasing", "api_name": "prithivida/parrot_paraphraser_on_T5", "api_call": "Parrot(model_tag='prithivida/parrot_paraphraser_on_T5', use_gpu=False)", "performance": {"dataset": "Not mentioned", "accuracy": "Not mentioned"}, "description": "Parrot is a paraphrase based utterance augmentation framework purpose built to accelerate training NLU models. It offers knobs to control Adequacy, Fluency, and Diversity as per your needs. It mainly focuses on augmenting texts typed-into or spoken-to conversational interfaces for building robust NLU models.", "model_name": "prithivida/parrot_paraphraser_on_T5"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face", "functionality": "Zero-Shot Image Classification", "api_name": "laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup", "api_call": "pipeline('image-classification', model='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup')", "performance": {"dataset": "ImageNet-1k", "accuracy": "76.9"}, "description": "A series of CLIP ConvNeXt-Large (w/ extra text depth, vision MLP head) models trained on the LAION-2B (english) subset of LAION-5B using OpenCLIP. The models utilize the timm ConvNeXt-Large model (convnext_large) as the image tower, a MLP (fc - gelu - drop - fc) head in vision tower instead of the single projection of other CLIP models, and a text tower with same width but 4 layers more depth than ViT-L / RN50x16 models (depth 16, embed dim 768).", "model_name": "laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-large-finetuned-wikisql-supervised", "api_call": "pipeline('table-question-answering', model='google/tapas-large-finetuned-wikisql-supervised')", "performance": {"dataset": "wikisql", "accuracy": "Not provided"}, "description": "TAPAS is a BERT-like transformers model pretrained on a large corpus of English data from Wikipedia in a self-supervised fashion. It can be used for answering questions related to a table.", "model_name": "google/tapas-large-finetuned-wikisql-supervised"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/tapex-base-finetuned-wikisql", "api_call": "BartForConditionalGeneration.from_pretrained('microsoft/tapex-base-finetuned-wikisql')", "performance": {"dataset": "wikisql"}, "description": "TAPEX (Table Pre-training via Execution) is a conceptually simple and empirically powerful pre-training approach to empower existing models with table reasoning skills. TAPEX realizes table pre-training by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically synthesizing executable SQL queries.", "model_name": "microsoft/tapex-base-finetuned-wikisql"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/trocr-large-handwritten", "api_call": "VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-handwritten')", "performance": {"dataset": "IAM", "accuracy": "Not specified"}, "description": "TrOCR model fine-tuned on the IAM dataset. It was introduced in the paper TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models by Li et al. and first released in this repository. The TrOCR model is an encoder-decoder model, consisting of an image Transformer as encoder, and a text Transformer as decoder. The image encoder was initialized from the weights of BEiT, while the text decoder was initialized from the weights of RoBERTa.", "model_name": "microsoft/trocr-large-handwritten"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "nitrosocke/nitro-diffusion", "api_call": "StableDiffusionPipeline.from_pretrained('nitrosocke/nitro-diffusion', torch_dtype=torch.float16)", "performance": {"dataset": "Stable Diffusion", "accuracy": "N/A"}, "description": "Nitro Diffusion is a fine-tuned Stable Diffusion model trained on three artstyles simultaneously while keeping each style separate from the others. It allows for high control of mixing, weighting, and single style use.", "model_name": "nitrosocke/nitro-diffusion"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/mask2former-swin-large-coco-panoptic", "api_call": "Mask2FormerForUniversalSegmentation.from_pretrained('facebook/mask2former-swin-large-coco-panoptic')", "performance": {"dataset": "COCO", "accuracy": "Not provided"}, "description": "Mask2Former model trained on COCO panoptic segmentation (large-sized version, Swin backbone). It was introduced in the paper Masked-attention Mask Transformer for Universal Image Segmentation and first released in this repository. Mask2Former addresses instance, semantic and panoptic segmentation with the same paradigm: by predicting a set of masks and corresponding labels. Hence, all 3 tasks are treated as if they were instance segmentation. Mask2Former outperforms the previous SOTA, MaskFormer, both in terms of performance and efficiency.", "model_name": "facebook/mask2former-swin-large-coco-panoptic"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "Helsinki-NLP/opus-mt-ru-en", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-ru-en')", "performance": {"dataset": "newstest2019-ruen.ru.en", "accuracy": 31.4}, "description": "A Russian to English translation model developed by the Language Technology Research Group at the University of Helsinki. It is based on the Transformer-align architecture and trained on the OPUS dataset. The model can be used for translation and text-to-text generation tasks.", "model_name": "Helsinki-NLP/opus-mt-ru-en"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "cross-encoder/nli-deberta-v3-small", "api_call": "CrossEncoder('cross-encoder/nli-deberta-v3-small')", "performance": {"dataset": {"SNLI-test": "91.65", "MNLI-mismatched": "87.55"}, "accuracy": {"SNLI-test": "91.65", "MNLI-mismatched": "87.55"}}, "description": "Cross-Encoder for Natural Language Inference based on microsoft/deberta-v3-small, trained on the SNLI and MultiNLI datasets. For a given sentence pair, it will output three scores corresponding to the labels: contradiction, entailment, neutral.", "model_name": "cross-encoder/nli-deberta-v3-small"}
{"domain": "Tabular Tabular Regression", "framework": "Scikit-learn", "functionality": "Tabular Regression", "api_name": "rajistics/california_housing", "api_call": "RandomForestRegressor()", "performance": {"dataset": "", "accuracy": ""}, "description": "A RandomForestRegressor model trained on the California Housing dataset for predicting housing prices.", "model_name": "RandomForestRegressor()"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "text2text-generation", "api_name": "google/pegasus-cnn_dailymail", "api_call": "PegasusForConditionalGeneration.from_pretrained('google/pegasus-cnn_dailymail')", "performance": {"dataset": "cnn_dailymail", "accuracy": "44.16/21.56/41.30"}, "description": "PEGASUS model for abstractive summarization, pretrained on the CNN/DailyMail dataset.", "model_name": "google/pegasus-cnn_dailymail"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Transformers", "functionality": "Zero-Shot Classification", "api_name": "valhalla/distilbart-mnli-12-9", "api_call": "pipeline('zero-shot-classification', model='valhalla/distilbart-mnli-12-9')", "performance": {"dataset": "MNLI", "accuracy": {"matched_acc": 89.56, "mismatched_acc": 89.52}}, "description": "distilbart-mnli is the distilled version of bart-large-mnli created using the No Teacher Distillation technique proposed for BART summarisation by Huggingface. It is used for zero-shot text classification tasks.", "model_name": "valhalla/distilbart-mnli-12-9"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "zero-shot-object-detection", "api_name": "google/owlvit-base-patch32", "api_call": "OwlViTForObjectDetection.from_pretrained('google/owlvit-base-patch32')", "performance": {"dataset": "COCO and OpenImages", "accuracy": "Not specified"}, "description": "OWL-ViT is a zero-shot text-conditioned object detection model that uses CLIP as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. The model can be used to query an image with one or multiple text queries.", "model_name": "google/owlvit-base-patch32"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "text-generation", "api_name": "pygmalion-2.7b", "api_call": "pipeline('text-generation', model='PygmalionAI/pygmalion-2.7b')", "performance": {"dataset": "56MB of dialogue data", "accuracy": "N/A"}, "description": "Pygmalion 2.7B is a proof-of-concept dialogue model based on EleutherAI's gpt-neo-2.7B. It is fine-tuned on 56MB of dialogue data gathered from multiple sources, including real and partially machine-generated conversations. The model is intended for use in generating conversational responses and can be used with a specific input format that includes character persona, dialogue history, and user input message.", "model_name": "PygmalionAI/pygmalion-2.7b"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "wavymulder/Analog-Diffusion", "api_call": "pipeline('text-to-image', model='wavymulder/Analog-Diffusion')", "performance": {"dataset": "analog photographs", "accuracy": "Not specified"}, "description": "Analog Diffusion is a dreambooth model trained on a diverse set of analog photographs. It can generate images based on text prompts with an analog style. Use the activation token 'analog style' in your prompt to get the desired output. The model is available on the Hugging Face Inference API and can be used with the transformers library.", "model_name": "wavymulder/Analog-Diffusion"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "opus-mt-ROMANCE-en", "api_call": "MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-ROMANCE-en')", "performance": {"dataset": "opus", "accuracy": {"BLEU": 62.2, "chr-F": 0.75}}, "description": "A model for translating Romance languages to English, trained on the OPUS dataset. It supports multiple source languages such as French, Spanish, Portuguese, Italian, and Romanian, among others. The model is based on the transformer architecture and uses normalization and SentencePiece for pre-processing.", "model_name": "Helsinki-NLP/opus-mt-ROMANCE-en"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Transformers", "functionality": "Zero-Shot Classification", "api_name": "cross-encoder/nli-deberta-v3-xsmall", "api_call": "pipeline('zero-shot-classification', model='cross-encoder/nli-deberta-v3-xsmall')", "performance": {"dataset": {"SNLI-test": "91.64", "MNLI_mismatched": "87.77"}}, "description": "This model is a Cross-Encoder for Natural Language Inference, trained on the SNLI and MultiNLI datasets. It can be used for zero-shot classification tasks.", "model_name": "cross-encoder/nli-deberta-v3-xsmall"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Speech Emotion Recognition", "api_name": "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition", "api_call": "Wav2Vec2ForCTC.from_pretrained('ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition')", "performance": {"dataset": "RAVDESS", "accuracy": 0.8223}, "description": "The model is a fine-tuned version of jonatasgrosman/wav2vec2-large-xlsr-53-english for a Speech Emotion Recognition (SER) task. The dataset used to fine-tune the original pre-trained model is the RAVDESS dataset. This dataset provides 1440 samples of recordings from actors performing on 8 different emotions in English, which are: emotions = ['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad', 'surprised'].", "model_name": "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Asteroid", "api_name": "mpariente/DPRNNTasNet-ks2_WHAM_sepclean", "api_call": "pipeline('audio-source-separation', model='mpariente/DPRNNTasNet-ks2_WHAM_sepclean')", "performance": {"dataset": "WHAM!", "si_sdr": 19.3167434907, "si_sdr_imp": 19.3178952739, "sdr": 19.6808534719, "sdr_imp": 19.5298092933, "sir": 30.3622139987, "sir_imp": 30.2111698201, "sar": 20.1555325134, "sar_imp": -129.0209176235, "stoi": 0.9777266431, "stoi_imp": 0.2396809152}, "description": "This model was trained by Manuel Pariente using the wham/DPRNN recipe in Asteroid. It was trained on the sep_clean task of the WHAM! dataset.", "model_name": "mpariente/DPRNNTasNet-ks2_WHAM_sepclean"}
{"domain": "Audio Audio-to-Audio", "framework": "Fairseq", "functionality": "speech-to-speech-translation", "api_name": "facebook/textless_sm_ro_en", "api_call": "pipeline('audio-to-audio', model='facebook/textless_sm_ro_en')", "performance": {"dataset": "unknown", "accuracy": "unknown"}, "description": "A speech-to-speech translation model for Romanian to English developed by Facebook AI", "model_name": "facebook/textless_sm_ro_en"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "distilbert-base-cased-distilled-squad", "api_call": "DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad')", "performance": {"dataset": "SQuAD v1.1", "accuracy": {"Exact Match": 79.6, "F1": 86.996}}, "description": "DistilBERT base cased distilled SQuAD is a fine-tuned checkpoint of DistilBERT-base-cased, trained using knowledge distillation on SQuAD v1.1 dataset. It has 40% fewer parameters than bert-base-uncased, runs 60% faster while preserving over 95% of BERT's performance as measured on the GLUE language understanding benchmark. This model can be used for question answering.", "model_name": "distilbert-base-cased-distilled-squad"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Text2Text Generation", "api_name": "mrm8488/bert2bert_shared-spanish-finetuned-summarization", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('mrm8488/bert2bert_shared-spanish-finetuned-summarization')", "performance": {"dataset": "mlsum", "accuracy": {"Rouge1": 26.24, "Rouge2": 8.9, "RougeL": 21.01, "RougeLsum": 21.02}}, "description": "Spanish BERT2BERT (BETO) fine-tuned on MLSUM ES for summarization", "model_name": "mrm8488/bert2bert_shared-spanish-finetuned-summarization"}
{"domain": "Natural Language Processing Translation", "framework": "Transformers", "functionality": "Translation", "api_name": "opus-mt-de-es", "api_call": "pipeline('translation_de_to_es', model='Helsinki-NLP/opus-mt-de-es')", "performance": {"dataset": "Tatoeba.de.es", "accuracy": {"BLEU": 48.5, "chr-F": 0.676}}, "description": "A German to Spanish translation model based on the OPUS dataset and trained using the transformer-align architecture. The model is pre-processed with normalization and SentencePiece tokenization.", "model_name": "Helsinki-NLP/opus-mt-de-es"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "patrickjohncyh/fashion-clip", "api_call": "CLIPModel.from_pretrained('patrickjohncyh/fashion-clip')", "performance": {"dataset": [{"name": "FMNIST", "accuracy": 0.8300000000000001}, {"name": "KAGL", "accuracy": 0.73}, {"name": "DEEP", "accuracy": 0.62}]}, "description": "FashionCLIP is a CLIP-based model developed to produce general product representations for fashion concepts. Leveraging the pre-trained checkpoint (ViT-B/32) released by OpenAI, it is trained on a large, high-quality novel fashion dataset to study whether domain specific fine-tuning of CLIP-like models is sufficient to produce product representations that are zero-shot transferable to entirely new datasets and tasks.", "model_name": "patrickjohncyh/fashion-clip"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "deepset/tinyroberta-squad2", "api_call": "AutoModelForQuestionAnswering.from_pretrained('deepset/tinyroberta-squad2')", "performance": {"dataset": "squad_v2", "accuracy": {"exact": 78.6911479828, "f1": 81.9198998537}}, "description": "This is the distilled version of the deepset/roberta-base-squad2 model. This model has a comparable prediction quality and runs at twice the speed of the base model.", "model_name": "deepset/tinyroberta-squad2"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/distiluse-base-multilingual-cased-v1", "api_call": "SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "N/A"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 512 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/distiluse-base-multilingual-cased-v1"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "facebook/timesformer-hr-finetuned-k400", "api_call": "TimesformerForVideoClassification.from_pretrained('facebook/timesformer-hr-finetuned-k400')", "performance": {"dataset": "Kinetics-400", "accuracy": "Not specified"}, "description": "TimeSformer model pre-trained on Kinetics-400 for video classification into one of the 400 possible Kinetics-400 labels. Introduced in the paper TimeSformer: Is Space-Time Attention All You Need for Video Understanding? by Bertasius et al.", "model_name": "facebook/timesformer-hr-finetuned-k400"}
{"domain": "Multimodal Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Feature Engineering", "api_name": "microsoft/unixcoder-base", "api_call": "AutoModel.from_pretrained('microsoft/unixcoder-base')", "performance": {"dataset": "Not specified", "accuracy": "Not specified"}, "description": "UniXcoder is a unified cross-modal pre-trained model that leverages multimodal data (i.e. code comment and AST) to pretrain code representation. Developed by Microsoft Team and shared by Hugging Face. It is based on the RoBERTa model and trained on English language data. The model can be used for feature engineering tasks.", "model_name": "microsoft/unixcoder-base"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "keremberke/yolov8s-hard-hat-detection", "api_call": "YOLO('keremberke/yolov8s-hard-hat-detection')", "performance": {"dataset": "hard-hat-detection", "accuracy": 0.834}, "description": "An object detection model trained to detect hard hats and no-hard hats in images. The model is based on YOLOv8 architecture and can be used for safety applications.", "model_name": "keremberke/yolov8s-hard-hat-detection"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "text2vec-large-chinese", "api_call": "AutoModel.from_pretrained('GanymedeNil/text2vec-large-chinese')", "performance": {"dataset": "https://huggingface.co/shibing624/text2vec-base-chinese", "accuracy": "Not provided"}, "description": "A Chinese sentence similarity model based on the derivative model of https://huggingface.co/shibing624/text2vec-base-chinese, replacing MacBERT with LERT, and keeping other training conditions unchanged.", "model_name": "GanymedeNil/text2vec-large-chinese"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "lewtun/tiny-random-mt5", "api_call": "AutoModel.from_pretrained('lewtun/tiny-random-mt5')", "performance": {"dataset": "", "accuracy": ""}, "description": "A tiny random mt5 model for text generation", "model_name": "lewtun/tiny-random-mt5"}
{"domain": "Audio Text-to-Speech", "framework": "ESPnet", "functionality": "Text-to-Speech", "api_name": "SYSPIN/Marathi_Male_TTS", "api_call": "api.load('ESPnet/espnet_model_zoo:SYSPIN/Marathi_Male_TTS')", "performance": {"dataset": "", "accuracy": ""}, "description": "A Marathi Male Text-to-Speech model using ESPnet framework.", "model_name": "ESPnet/espnet_model_zoo"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "facebook/convnext-base-224", "api_call": "ConvNextForImageClassification.from_pretrained('facebook/convnext-base-224')", "performance": {"dataset": "imagenet-1k", "accuracy": null}, "description": "ConvNeXT is a pure convolutional model (ConvNet), inspired by the design of Vision Transformers, that claims to outperform them. The authors started from a ResNet and 'modernized' its design by taking the Swin Transformer as inspiration. You can use the raw model for image classification.", "model_name": "facebook/convnext-base-224"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/all-roberta-large-v1", "api_call": "SentenceTransformer('sentence-transformers/all-roberta-large-v1')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Automated evaluation"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 1024 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/all-roberta-large-v1"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Masked Language Modeling", "api_name": "bert-base-multilingual-cased", "api_call": "pipeline('fill-mask', model='bert-base-multilingual-cased')", "performance": {"dataset": "wikipedia", "accuracy": "Not provided"}, "description": "BERT multilingual base model (cased) is pretrained on the top 104 languages with the largest Wikipedia using a masked language modeling (MLM) objective. The model is case sensitive and can be used for masked language modeling or next sentence prediction. It is intended to be fine-tuned on a downstream task, such as sequence classification, token classification, or question answering.", "model_name": "bert-base-multilingual-cased"}
{"domain": "Natural Language Processing Token Classification", "framework": "Transformers", "functionality": "Token Classification", "api_name": "ckiplab/bert-base-chinese-ws", "api_call": "AutoModel.from_pretrained('ckiplab/bert-base-chinese-ws')", "performance": {"dataset": "Not specified", "accuracy": "Not specified"}, "description": "This project provides traditional Chinese transformers models (including ALBERT, BERT, GPT2) and NLP tools (including word segmentation, part-of-speech tagging, named entity recognition).", "model_name": "ckiplab/bert-base-chinese-ws"}
{"domain": "Audio Audio-to-Audio", "framework": "SpeechBrain", "functionality": "Audio Source Separation", "api_name": "speechbrain/sepformer-whamr", "api_call": "separator.from_hparams(source='speechbrain/sepformer-whamr', savedir='pretrained_models/sepformer-whamr')", "performance": {"dataset": "WHAMR!", "accuracy": "13.7 dB SI-SNRi"}, "description": "This repository provides all the necessary tools to perform audio source separation with a SepFormer model, implemented with SpeechBrain, and pretrained on WHAMR! dataset, which is basically a version of WSJ0-Mix dataset with environmental noise and reverberation.", "model_name": "speechbrain/sepformer-whamr"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Transcription and Translation", "api_name": "openai/whisper-tiny", "api_call": "WhisperForConditionalGeneration.from_pretrained('openai/whisper-tiny')", "performance": {"dataset": "LibriSpeech (clean)", "accuracy": 7.54}, "description": "Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without the need for fine-tuning. It is a Transformer-based encoder-decoder model that can be used for transcription and translation tasks.", "model_name": "openai/whisper-tiny"}
{"domain": "Audio Text-to-Speech", "framework": "Fairseq", "functionality": "Text-to-Speech", "api_name": "facebook/tts_transformer-ru-cv7_css10", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/tts_transformer-ru-cv7_css10')", "performance": {"dataset": "common_voice", "accuracy": null}, "description": "Transformer text-to-speech model from fairseq S^2. Russian single-speaker male voice. Pre-trained on Common Voice v7, fine-tuned on CSS10.", "model_name": "facebook/tts_transformer-ru-cv7_css10"}
{"domain": "Reinforcement Learning", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "decision-transformer-gym-hopper-medium", "api_call": "AutoModel.from_pretrained('edbeeching/decision-transformer-gym-hopper-medium')", "performance": {"dataset": "Gym Hopper environment", "accuracy": "Not provided"}, "description": "Decision Transformer model trained on medium trajectories sampled from the Gym Hopper environment.", "model_name": "edbeeching/decision-transformer-gym-hopper-medium"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "luhua/chinese_pretrain_mrc_roberta_wwm_ext_large", "api_call": "pipeline('question-answering', model='luhua/chinese_pretrain_mrc_roberta_wwm_ext_large')", "performance": {"dataset": "Dureader-2021", "accuracy": "83.1"}, "description": "A Chinese MRC roberta_wwm_ext_large model trained on a large amount of Chinese MRC data. This model has significantly improved performance on reading comprehension and classification tasks. It has helped multiple users achieve top 5 results in the Dureader-2021 competition.", "model_name": "luhua/chinese_pretrain_mrc_roberta_wwm_ext_large"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "kakaobrain/align-base", "api_call": "AlignModel.from_pretrained('kakaobrain/align-base')", "performance": {"dataset": "COYO-700M", "accuracy": "on-par or outperforms Google ALIGN's reported metrics"}, "description": "The ALIGN model is a dual-encoder architecture with EfficientNet as its vision encoder and BERT as its text encoder. It learns to align visual and text representations with contrastive learning. This implementation is trained on the open source COYO dataset and can be used for zero-shot image classification and multi-modal embedding retrieval.", "model_name": "kakaobrain/align-base"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "bart-large-cnn-samsum-ChatGPT_v3", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('Qiliang/bart-large-cnn-samsum-ChatGPT_v3')", "performance": {"dataset": "unknown", "accuracy": "unknown"}, "description": "This model is a fine-tuned version of philschmid/bart-large-cnn-samsum on an unknown dataset.", "model_name": "Qiliang/bart-large-cnn-samsum-ChatGPT_v3"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "Apocalypse-19/shoe-generator", "api_call": "DDPMPipeline.from_pretrained('Apocalypse-19/shoe-generator')", "performance": {"dataset": "custom dataset", "accuracy": "128x128 resolution"}, "description": "This model is a diffusion model for unconditional image generation of shoes trained on a custom dataset at 128x128 resolution.", "model_name": "Apocalypse-19/shoe-generator"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Speaker Verification", "api_name": "speechbrain/spkrec-xvect-voxceleb", "api_call": "EncoderClassifier.from_hparams(source='speechbrain/spkrec-xvect-voxceleb', savedir='pretrained_models/spkrec-xvect-voxceleb')", "performance": {"dataset": "Voxceleb1-test set (Cleaned)", "accuracy": "EER(%) 3.2"}, "description": "This repository provides all the necessary tools to extract speaker embeddings with a pretrained TDNN model using SpeechBrain. The system is trained on Voxceleb 1+ Voxceleb2 training data.", "model_name": "speechbrain/spkrec-xvect-voxceleb"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/all-distilroberta-v1", "api_call": "SentenceTransformer('sentence-transformers/all-distilroberta-v1')", "performance": {"dataset": [{"name": "s2orc", "accuracy": "Not provided"}, {"name": "MS Marco", "accuracy": "Not provided"}, {"name": "yahoo_answers_topics", "accuracy": "Not provided"}]}, "description": "This is a sentence-transformers model that maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/all-distilroberta-v1"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Masked Language Modeling and Next Sentence Prediction", "api_name": "bert-large-uncased", "api_call": "pipeline('fill-mask', model='bert-large-uncased')", "performance": {"dataset": {"SQUAD 1.1 F1/EM": "91.0/84.3", "Multi NLI Accuracy": "86.05"}}, "description": "BERT large model (uncased) is a transformer model pretrained on a large corpus of English data using a masked language modeling (MLM) objective. It has 24 layers, 1024 hidden dimensions, 16 attention heads, and 336M parameters. The model is intended to be fine-tuned on a downstream task, such as sequence classification, token classification, or question answering.", "model_name": "bert-large-uncased"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Diffusion-based text-to-image generation model", "api_name": "lllyasviel/control_v11p_sd15_normalbae", "api_call": "ControlNetModel.from_pretrained('lllyasviel/control_v11p_sd15_normalbae')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "ControlNet v1.1 is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on normalbae images. It can be used in combination with Stable Diffusion, such as runwayml/stable-diffusion-v1-5.", "model_name": "lllyasviel/control_v11p_sd15_normalbae"}
{"domain": "Reinforcement Learning", "framework": "Stable-Baselines3", "functionality": "Pendulum-v1", "api_name": "ppo-Pendulum-v1", "api_call": "load_from_hub(repo_id='HumanCompatibleAI/ppo-Pendulum-v1',filename='{MODEL FILENAME}.zip',)", "performance": {"dataset": "Pendulum-v1", "accuracy": "-336.89 +/- 406.36"}, "description": "This is a trained model of a PPO agent playing Pendulum-v1 using the stable-baselines3 library and the RL Zoo. The RL Zoo is a training framework for Stable Baselines3 reinforcement learning agents, with hyperparameter optimization and pre-trained agents included.", "model_name": "HumanCompatibleAI/ppo-Pendulum-v1"}
{"domain": "Audio Text-to-Speech", "framework": "ESPnet", "functionality": "Text-to-Speech", "api_name": "mio/tokiwa_midori", "api_call": "./run.sh --skip_data_prep false --skip_train true --download_model mio/tokiwa_midori", "performance": {"dataset": "amadeus", "accuracy": "Not provided"}, "description": "This model was trained by mio using amadeus recipe in espnet.", "model_name": "mio/tokiwa_midori"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "Helsinki-NLP/opus-mt-en-ar", "api_call": "pipeline('translation_en_to_ar', model='Helsinki-NLP/opus-mt-en-ar')", "performance": {"dataset": "Tatoeba-test.eng.ara", "accuracy": {"BLEU": 14.0, "chr-F": 0.437}}, "description": "A Hugging Face Transformers model for English to Arabic translation, trained on the Tatoeba dataset. It uses a transformer architecture and requires a sentence initial language token in the form of '>>id<<' (id = valid target language ID).", "model_name": "Helsinki-NLP/opus-mt-en-ar"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Image Captioning", "api_name": "microsoft/git-base", "api_call": "pipeline('image-to-text', model='microsoft/git-base')", "performance": {"dataset": ["COCO", "Conceptual Captions (CC3M)", "SBU", "Visual Genome (VG)", "Conceptual Captions (CC12M)", "ALT200M"], "accuracy": "Refer to the paper for evaluation results"}, "description": "GIT (short for GenerativeImage2Text) model, base-sized version. It was introduced in the paper GIT: A Generative Image-to-text Transformer for Vision and Language by Wang et al. and first released in this repository. The model is trained using 'teacher forcing' on a lot of (image, text) pairs. The goal for the model is simply to predict the next text token, given the image tokens and previous text tokens. This allows the model to be used for tasks like image and video captioning, visual question answering (VQA) on images and videos, and even image classification (by simply conditioning the model on the image and asking it to generate a class for it in text).", "model_name": "microsoft/git-base"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "stabilityai/sd-vae-ft-mse", "api_call": "StableDiffusionPipeline.from_pretrained('CompVis/stable-diffusion-v1-4', vae='AutoencoderKL.from_pretrained(stabilityai/sd-vae-ft-mse)')", "performance": {"dataset": [{"name": "COCO 2017 (256x256, val, 5000 images)", "accuracy": {"rFID": "4.70", "PSNR": "24.5 +/- 3.7", "SSIM": "0.71 +/- 0.13", "PSIM": "0.92 +/- 0.27"}}, {"name": "LAION-Aesthetics 5+ (256x256, subset, 10000 images)", "accuracy": {"rFID": "1.88", "PSNR": "27.3 +/- 4.7", "SSIM": "0.83 +/- 0.11", "PSIM": "0.65 +/- 0.34"}}]}, "description": "This model is a fine-tuned VAE decoder for the Stable Diffusion Pipeline. It is designed to be used with the diffusers library and can be integrated into existing workflows by including a vae argument to the StableDiffusionPipeline. The model has been finetuned on a 1:1 ratio of LAION-Aesthetics and LAION-Humans datasets and has been evaluated on COCO 2017 and LAION-Aesthetics 5+ datasets.", "model_name": "CompVis/stable-diffusion-v1-4"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "darkstorm2150/Protogen_v2.2_Official_Release", "api_call": "StableDiffusionPipeline.from_pretrained('darkstorm2150/Protogen_v2.2_Official_Release')", "performance": {"dataset": "Various datasets", "accuracy": "Not specified"}, "description": "Protogen v2.2 is a text-to-image model that generates high-quality images based on text prompts. It was warm-started with Stable Diffusion v1-5 and fine-tuned on a large amount of data from large datasets new and trending on civitai.com. The model can be used with the Stable Diffusion Pipeline and supports trigger words like 'modelshoot style' to enforce camera capture.", "model_name": "darkstorm2150/Protogen_v2.2_Official_Release"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/all-MiniLM-L12-v2", "api_call": "SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')", "performance": {"dataset": "1,170,060,424 training pairs", "accuracy": "Not provided"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/all-MiniLM-L12-v2"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "facebook/timesformer-hr-finetuned-ssv2", "api_call": "TimesformerForVideoClassification.from_pretrained('facebook/timesformer-hr-finetuned-ssv2')", "performance": {"dataset": "Something Something v2", "accuracy": "Not provided"}, "description": "TimeSformer model pre-trained on Something Something v2. It was introduced in the paper TimeSformer: Is Space-Time Attention All You Need for Video Understanding? by Bertasius et al. and first released in this repository.", "model_name": "facebook/timesformer-hr-finetuned-ssv2"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Hugging Face Transformers", "functionality": "Masked Language Modeling", "api_name": "xlm-roberta-large", "api_call": "pipeline('fill-mask', model='xlm-roberta-large')", "performance": {"dataset": "CommonCrawl", "accuracy": "N/A"}, "description": "XLM-RoBERTa is a multilingual version of RoBERTa pre-trained on 2.5TB of filtered CommonCrawl data containing 100 languages. It is designed for masked language modeling and can be fine-tuned on downstream tasks such as sequence classification, token classification, or question answering.", "model_name": "xlm-roberta-large"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/trocr-small-handwritten", "api_call": "VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-small-handwritten')", "performance": {"dataset": "IAM", "accuracy": "Not provided"}, "description": "TrOCR model fine-tuned on the IAM dataset. It was introduced in the paper TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models by Li et al. and first released in this repository.", "model_name": "microsoft/trocr-small-handwritten"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Hugging Face Transformers", "functionality": "Fill-Mask", "api_name": "neuralmind/bert-base-portuguese-cased", "api_call": "AutoModelForPreTraining.from_pretrained('neuralmind/bert-base-portuguese-cased')", "performance": {"dataset": "brWaC", "accuracy": "state-of-the-art"}, "description": "BERTimbau Base is a pretrained BERT model for Brazilian Portuguese that achieves state-of-the-art performances on three downstream NLP tasks: Named Entity Recognition, Sentence Textual Similarity and Recognizing Textual Entailment. It is available in two sizes: Base and Large.", "model_name": "neuralmind/bert-base-portuguese-cased"}
{"domain": "Reinforcement Learning", "framework": "ML-Agents", "functionality": "SoccerTwos", "api_name": "Raiden-1001/poca-Soccerv7", "api_call": "mlagents-load-from-hf --repo-id='Raiden-1001/poca-Soccerv7.1' --local-dir='./downloads'", "performance": {"dataset": "SoccerTwos", "accuracy": "Not provided"}, "description": "This is a trained model of a poca agent playing SoccerTwos using the Unity ML-Agents Library.", "model_name": "Raiden-1001/poca-Soccerv7.1"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "financial-sentiment-analysis", "api_name": "yiyanghkust/finbert-tone", "api_call": "BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)", "performance": {"dataset": "10,000 manually annotated sentences from analyst reports", "accuracy": "superior performance on financial tone analysis task"}, "description": "FinBERT is a BERT model pre-trained on financial communication text. It is trained on the following three financial communication corpora: Corporate Reports 10-K & 10-Q, Earnings Call Transcripts, and Analyst Reports. This released finbert-tone model is the FinBERT model fine-tuned on 10,000 manually annotated (positive, negative, neutral) sentences from analyst reports. This model achieves superior performance on the financial tone analysis task.", "model_name": "yiyanghkust/finbert-tone"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "nguyenvulebinh/wav2vec2-base-vietnamese-250h", "api_call": "Wav2Vec2ForCTC.from_pretrained('nguyenvulebinh/wav2vec2-base-vietnamese-250h')", "performance": {"dataset": [{"name": "VIVOS", "accuracy": 6.15}, {"name": "Common Voice vi", "accuracy": 11.52}]}, "description": "Vietnamese end-to-end speech recognition using wav2vec 2.0. Pre-trained on 13k hours of Vietnamese YouTube audio (unlabeled data) and fine-tuned on 250 hours of labeled data from the VLSP ASR dataset on 16kHz sampled speech audio.", "model_name": "nguyenvulebinh/wav2vec2-base-vietnamese-250h"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Transformers", "functionality": "Multilingual Sequence-to-Sequence", "api_name": "facebook/mbart-large-50", "api_call": "MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-50')", "performance": {"dataset": "Multilingual Denoising Pretraining", "accuracy": "Not specified"}, "description": "mBART-50 is a multilingual Sequence-to-Sequence model pre-trained using the 'Multilingual Denoising Pretraining' objective. It was introduced in Multilingual Translation with Extensible Multilingual Pretraining and Finetuning paper.", "model_name": "facebook/mbart-large-50"}
{"domain": "Natural Language Processing Text Generation", "framework": "Transformers", "functionality": "Text Generation", "api_name": "sshleifer/tiny-gpt2", "api_call": "TinyGPT2LMHeadModel.from_pretrained('sshleifer/tiny-gpt2')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "A tiny GPT-2 model for text generation, suitable for low-resource environments and faster inference. This model is part of the Hugging Face Transformers library and can be used for generating text given a prompt.", "model_name": "sshleifer/tiny-gpt2"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Generative Commonsense Reasoning", "api_name": "mrm8488/t5-base-finetuned-common_gen", "api_call": "AutoModelWithLMHead.from_pretrained('mrm8488/t5-base-finetuned-common_gen')", "performance": {"dataset": "common_gen", "accuracy": {"ROUGE-2": 17.1, "ROUGE-L": 39.47}}, "description": "Google's T5 fine-tuned on CommonGen for Generative Commonsense Reasoning. CommonGen is a constrained text generation task, associated with a benchmark dataset, to explicitly test machines for the ability of generative commonsense reasoning. Given a set of common concepts, the task is to generate a coherent sentence describing an everyday scenario using these concepts.", "model_name": "mrm8488/t5-base-finetuned-common_gen"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens", "api_call": "SentenceTransformer('sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Not provided"}, "description": "This is a sentence-transformers model that maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Document Question Answering", "api_name": "CQI_Visual_Question_Awnser_PT_v0", "api_call": "pipeline('question-answering', model=LayoutLMForQuestionAnswering.from_pretrained('microsoft/layoutlm-base-uncased'))", "performance": {"dataset": [{"accuracy": 0.9943976999999999}, {"accuracy": 0.9912158999999999}, {"accuracy": 0.59147286}]}, "description": "A model for visual question answering in Portuguese and English, capable of processing PDFs and images to extract information and answer questions.", "model_name": "microsoft/layoutlm-base-uncased"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "sentiment_analysis_generic_dataset", "api_call": "pipeline('text-classification', model='Seethal/sentiment_analysis_generic_dataset')", "performance": {"dataset": "generic_dataset", "accuracy": "Not specified"}, "description": "This is a fine-tuned downstream version of the bert-base-uncased model for sentiment analysis. This model is not intended for further downstream fine-tuning for any other tasks. This model is trained on a classified dataset for text classification.", "model_name": "Seethal/sentiment_analysis_generic_dataset"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "google/pegasus-pubmed", "api_call": "AutoModel.from_pretrained('google/pegasus-pubmed')", "performance": {"dataset": [{"name": "xsum", "accuracy": "47.60/24.83/39.64"}, {"name": "cnn_dailymail", "accuracy": "44.16/21.56/41.30"}, {"name": "newsroom", "accuracy": "45.98/34.20/42.18"}, {"name": "multi_news", "accuracy": "47.65/18.75/24.95"}, {"name": "gigaword", "accuracy": "39.65/20.47/36.76"}, {"name": "wikihow", "accuracy": "46.39/22.12/38.41"}, {"name": "reddit_tifu", "accuracy": "27.99/9.81/22.94"}, {"name": "big_patent", "accuracy": "52.29/33.08/41.66"}, {"name": "arxiv", "accuracy": "44.21/16.95/25.67"}, {"name": "pubmed", "accuracy": "45.97/20.15/28.25"}, {"name": "aeslc", "accuracy": "37.68/21.25/36.51"}, {"name": "billsum", "accuracy": "59.67/41.58/47.59"}]}, "description": "The PEGASUS model is designed for abstractive summarization. It is pretrained on a mixture of C4 and HugeNews datasets and stochastically samples important sentences. The model uses a gap sentence ratio between 15% and 45% and a sentencepiece tokenizer that encodes newline characters.", "model_name": "google/pegasus-pubmed"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "fcakyon/timesformer-hr-finetuned-k400", "api_call": "TimesformerForVideoClassification.from_pretrained('fcakyon/timesformer-hr-finetuned-k400')", "performance": {"dataset": "Kinetics-400", "accuracy": "Not provided"}, "description": "TimeSformer model pre-trained on Kinetics-400 for video classification into one of the 400 possible Kinetics-400 labels. Introduced in the paper 'TimeSformer: Is Space-Time Attention All You Need for Video Understanding?' by Bertasius et al.", "model_name": "fcakyon/timesformer-hr-finetuned-k400"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face", "functionality": "Question Answering", "api_name": "impira/layoutlm-document-qa", "api_call": "pipeline('question-answering', model=LayoutLMForQuestionAnswering.from_pretrained('impira/layoutlm-document-qa', return_dict=True))", "performance": {"dataset": "SQuAD2.0 and DocVQA", "accuracy": "Not provided"}, "description": "A fine-tuned version of the multi-modal LayoutLM model for the task of question answering on documents.", "model_name": "impira/layoutlm-document-qa"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Conversational", "api_name": "pygmalion-350m", "api_call": "pipeline('conversational', model='PygmalionAI/pygmalion-350m')", "performance": {"dataset": "The Pile", "accuracy": "N/A"}, "description": "This is a proof-of-concept fine-tune of Facebook's OPT-350M model optimized for dialogue, to be used as a stepping stone to higher parameter models. Disclaimer: NSFW data was included in the fine-tuning of this model. Although SFW inputs will usually result in SFW outputs, you are advised to chat at your own risk. This model is not suitable for use by minors.", "model_name": "PygmalionAI/pygmalion-350m"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/git-base-vqav2", "api_call": "pipeline('visual-question-answering', model='microsoft/git-base-vqav2')", "performance": {"dataset": "VQAv2", "accuracy": "Refer to the paper for evaluation results"}, "description": "GIT (short for GenerativeImage2Text) model, base-sized version, fine-tuned on VQAv2. It was introduced in the paper GIT: A Generative Image-to-text Transformer for Vision and Language by Wang et al. and first released in this repository.", "model_name": "microsoft/git-base-vqav2"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "vicgalle/xlm-roberta-large-xnli-anli", "api_call": "XLMRobertaForSequenceClassification.from_pretrained('vicgalle/xlm-roberta-large-xnli-anli')", "performance": {"dataset": [{"name": "XNLI-es", "accuracy": "93.7%"}, {"name": "XNLI-fr", "accuracy": "93.2%"}, {"name": "ANLI-R1", "accuracy": "68.5%"}, {"name": "ANLI-R2", "accuracy": "53.6%"}, {"name": "ANLI-R3", "accuracy": "49.0%"}]}, "description": "XLM-RoBERTa-large model fine-tuned over several NLI datasets, ready to use for zero-shot classification.", "model_name": "vicgalle/xlm-roberta-large-xnli-anli"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "facebook/regnet-y-008", "api_call": "RegNetForImageClassification.from_pretrained('zuppif/regnet-y-040')", "performance": {"dataset": "imagenet-1k", "accuracy": "Not provided"}, "description": "RegNet model trained on imagenet-1k. It was introduced in the paper Designing Network Design Spaces and first released in this repository.", "model_name": "zuppif/regnet-y-040"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli", "api_call": "mDeBERTaForSequenceClassification.from_pretrained('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli')", "performance": {"dataset": {"average": 0.808, "ar": 0.802, "bg": 0.8290000000000001, "de": 0.8250000000000001, "el": 0.8260000000000001, "en": 0.883, "es": 0.845, "fr": 0.834, "hi": 0.771, "ru": 0.8130000000000001, "sw": 0.748, "th": 0.793, "tr": 0.807, "ur": 0.74, "vi": 0.795, "zh": 0.8116}, "accuracy": "0.808"}, "description": "This multilingual model can perform natural language inference (NLI) on 100 languages and is therefore also suitable for multilingual zero-shot classification. The underlying model was pre-trained by Microsoft on the CC100 multilingual dataset. It was then fine-tuned on the XNLI dataset, which contains hypothesis-premise pairs from 15 languages, as well as the English MNLI dataset.", "model_name": "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "philschmid/distilbert-onnx", "api_call": "pipeline('question-answering', model='philschmid/distilbert-onnx')", "performance": {"dataset": "squad", "accuracy": "F1 score: 87.1"}, "description": "This model is a fine-tune checkpoint of DistilBERT-base-cased, fine-tuned using (a second step of) knowledge distillation on SQuAD v1.1.", "model_name": "philschmid/distilbert-onnx"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Asteroid", "api_name": "Awais/Audio_Source_Separation", "api_call": "pipeline('audio-source-separation', model='Awais/Audio_Source_Separation')", "performance": {"dataset": "Libri2Mix", "accuracy": {"si_sdr": 14.7645436345, "si_sdr_imp": 14.7640293756, "sdr": 15.2933797075, "sdr_imp": 15.1141466051, "sir": 24.0929046611, "sir_imp": 23.9136696831, "sar": 16.0605590692, "sar_imp": -51.9807844413, "stoi": 0.9311142441, "stoi_imp": 0.2181737614}}, "description": "This model was trained by Joris Cosentino using the librimix recipe in Asteroid. It was trained on the sep_clean task of the Libri2Mix dataset.", "model_name": "Awais/Audio_Source_Separation"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "text2text-generation", "api_name": "google/bigbird-pegasus-large-arxiv", "api_call": "BigBirdPegasusForConditionalGeneration.from_pretrained('google/bigbird-pegasus-large-arxiv')", "performance": {"dataset": "scientific_papers", "accuracy": {"ROUGE-1": 36.028, "ROUGE-2": 13.417, "ROUGE-L": 21.961, "ROUGE-LSUM": 29.648}}, "description": "BigBird is a sparse-attention-based transformer which extends Transformer-based models, such as BERT, to much longer sequences. Moreover, BigBird comes along with a theoretical understanding of the capabilities of a complete transformer that the sparse model can handle. BigBird was introduced in this paper and first released in this repository. BigBird relies on block sparse attention instead of normal attention (i.e. BERT's attention) and can handle sequences up to a length of 4096 at a much lower compute cost compared to BERT. It has achieved SOTA on various tasks involving very long sequences such as long document summarization, question-answering with long contexts.", "model_name": "google/bigbird-pegasus-large-arxiv"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Diffusion-based text-to-image generation", "api_name": "lllyasviel/control_v11p_sd15_softedge", "api_call": "ControlNetModel.from_pretrained('lllyasviel/control_v11p_sd15_softedge')", "performance": {"dataset": "ControlNet", "accuracy": "Not provided"}, "description": "Controlnet v1.1 is a diffusion-based text-to-image generation model that controls pretrained large diffusion models to support additional input conditions. This checkpoint corresponds to the ControlNet conditioned on Soft edges. It can be used in combination with Stable Diffusion, such as runwayml/stable-diffusion-v1-5.", "model_name": "lllyasviel/control_v11p_sd15_softedge"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "hustvl/yolos-small", "api_call": "YolosForObjectDetection.from_pretrained('hustvl/yolos-small')", "performance": {"dataset": "COCO 2017 validation", "accuracy": "36.1 AP"}, "description": "YOLOS model fine-tuned on COCO 2017 object detection (118k annotated images). It was introduced in the paper You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection by Fang et al. and first released in this repository. YOLOS is a Vision Transformer (ViT) trained using the DETR loss. Despite its simplicity, a base-sized YOLOS model is able to achieve 42 AP on COCO validation 2017 (similar to DETR and more complex frameworks such as Faster R-CNN).", "model_name": "hustvl/yolos-small"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "hustvl/yolos-tiny", "api_call": "YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')", "performance": {"dataset": "COCO 2017 validation", "accuracy": "28.7 AP"}, "description": "YOLOS is a Vision Transformer (ViT) trained using the DETR loss. Despite its simplicity, a base-sized YOLOS model is able to achieve 42 AP on COCO validation 2017 (similar to DETR and more complex frameworks such as Faster R-CNN). The model is trained using a bipartite matching loss: one compares the predicted classes + bounding boxes of each of the N = 100 object queries to the ground truth annotations, padded up to the same length N (so if an image only contains 4 objects, 96 annotations will just have a no object as class and no bounding box as bounding box). The Hungarian matching algorithm is used to create an optimal one-to-one mapping between each of the N queries and each of the N annotations. Next, standard cross-entropy (for the classes) and a linear combination of the L1 and generalized IoU loss (for the bounding boxes) are used to optimize the parameters of the model.", "model_name": "hustvl/yolos-tiny"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Automatic Speech Recognition", "api_name": "facebook/hubert-large-ls960-ft", "api_call": "HubertForCTC.from_pretrained('facebook/hubert-large-ls960-ft')", "performance": {"dataset": "LibriSpeech (clean)", "accuracy": "1.900 WER"}, "description": "Facebook's Hubert-Large-Finetuned is an Automatic Speech Recognition model fine-tuned on 960h of Librispeech on 16kHz sampled speech audio. It is based on the Hidden-Unit BERT (HuBERT) approach for self-supervised speech representation learning, which utilizes an offline clustering step to provide aligned target labels for a BERT-like prediction loss. The model either matches or improves upon the state-of-the-art wav2vec 2.0 performance on the Librispeech and Libri-light benchmarks with various fine-tuning subsets.", "model_name": "facebook/hubert-large-ls960-ft"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image Diffusion Models", "api_name": "lllyasviel/control_v11p_sd15s2_lineart_anime", "api_call": "ControlNetModel.from_pretrained('lllyasviel/control_v11p_sd15s2_lineart_anime')", "performance": {"dataset": "Not specified", "accuracy": "Not specified"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on lineart_anime images.", "model_name": "lllyasviel/control_v11p_sd15s2_lineart_anime"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Abstractive Russian Summarization", "api_name": "cointegrated/rut5-base-absum", "api_call": "T5ForConditionalGeneration.from_pretrained('cointegrated/rut5-base-absum')", "performance": {"dataset": ["csebuetnlp/xlsum", "IlyaGusev/gazeta", "mlsum"], "accuracy": "Not provided"}, "description": "This is a model for abstractive Russian summarization, based on cointegrated/rut5-base-multitask and fine-tuned on 4 datasets.", "model_name": "cointegrated/rut5-base-absum"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "sd-class-pandas-32", "api_call": "DDPMPipeline.from_pretrained('schdoel/sd-class-AFHQ-32')", "performance": {"dataset": "AFHQ", "accuracy": "Not provided"}, "description": "This model is a diffusion model for unconditional image generation of cute \ud83e\udd8b.", "model_name": "schdoel/sd-class-AFHQ-32"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "keremberke/yolov8m-nlf-head-detection", "api_call": "YOLO('keremberke/yolov8m-nlf-head-detection')", "performance": {"dataset": "nfl-object-detection", "accuracy": 0.28700000000000003}, "description": "A YOLOv8 model trained for head detection in American football. The model is capable of detecting helmets, blurred helmets, difficult helmets, partial helmets, and sideline helmets.", "model_name": "keremberke/yolov8m-nlf-head-detection"}
{"domain": "Multimodal Text-to-Video", "framework": "Hugging Face", "functionality": "Text-to-Video Generation", "api_name": "redshift-man-skiing", "api_call": "TuneAVideoPipeline.from_pretrained('nitrosocke/redshift-diffusion', unet=UNet3DConditionModel.from_pretrained('Tune-A-Video-library/redshift-man-skiing', subfolder='unet', torch_dtype=torch.float16), torch_dtype=torch.float16)", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "Tune-A-Video - Redshift is a text-to-video generation model based on the nitrosocke/redshift-diffusion model. It generates videos based on textual prompts, such as 'a man is skiing' or '(redshift style) spider man is skiing'.", "model_name": "nitrosocke/redshift-diffusion"}
{"domain": "Natural Language Processing Translation", "framework": "Transformers", "functionality": "Translation", "api_name": "Helsinki-NLP/opus-mt-es-en", "api_call": "pipeline('translation_es_to_en', model='Helsinki-NLP/opus-mt-es-en')", "performance": {"dataset": [{"name": "newssyscomb2009-spaeng.spa.eng", "accuracy": {"BLEU": 30.6, "chr-F": 0.5700000000000001}}, {"name": "news-test2008-spaeng.spa.eng", "accuracy": {"BLEU": 27.9, "chr-F": 0.553}}, {"name": "newstest2009-spaeng.spa.eng", "accuracy": {"BLEU": 30.4, "chr-F": 0.5720000000000001}}, {"name": "newstest2010-spaeng.spa.eng", "accuracy": {"BLEU": 36.1, "chr-F": 0.614}}, {"name": "newstest2011-spaeng.spa.eng", "accuracy": {"BLEU": 34.2, "chr-F": 0.599}}, {"name": "newstest2012-spaeng.spa.eng", "accuracy": {"BLEU": 37.9, "chr-F": 0.624}}, {"name": "newstest2013-spaeng.spa.eng", "accuracy": {"BLEU": 35.3, "chr-F": 0.609}}, {"name": "Tatoeba-test.spa.eng", "accuracy": {"BLEU": 59.6, "chr-F": 0.739}}]}, "description": "Helsinki-NLP/opus-mt-es-en is a machine translation model trained to translate from Spanish to English using the Hugging Face Transformers library. The model is based on the Marian framework and was trained on the OPUS dataset.", "model_name": "Helsinki-NLP/opus-mt-es-en"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Fill-Mask", "api_name": "emilyalsentzer/Bio_ClinicalBERT", "api_call": "AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')", "performance": {"dataset": "MIMIC III", "accuracy": "Not provided"}, "description": "Bio_ClinicalBERT is a model initialized with BioBERT and trained on all MIMIC notes. It can be used for various NLP tasks in the clinical domain, such as Named Entity Recognition (NER) and Natural Language Inference (NLI).", "model_name": "emilyalsentzer/Bio_ClinicalBERT"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Asteroid", "api_name": "DCCRNet_Libri1Mix_enhsingle_16k", "api_call": "AutoModelForAudioToAudio.from_pretrained('JorisCos/DCCRNet_Libri1Mix_enhsingle_16k')", "performance": {"dataset": "Libri1Mix", "accuracy": {"si_sdr": 13.3297673983, "si_sdr_imp": 9.8799860925, "sdr": 13.87279933, "sdr_imp": 10.3701365308, "sir": "Infinity", "sir_imp": "NaN", "sar": 13.87279933, "sar_imp": 10.3701365308, "stoi": 0.9140907016, "stoi_imp": 0.11817087800000001}}, "description": "This model was trained by Joris Cosentino using the librimix recipe in Asteroid. It was trained on the enh_single task of the Libri1Mix dataset.", "model_name": "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "videomae-base-finetuned-RealLifeViolenceSituations-subset", "api_call": "AutoModelForVideoClassification.from_pretrained('dangle124/videomae-base-finetuned-RealLifeViolenceSituations-subset')", "performance": {"dataset": "unknown", "accuracy": 0.9533}, "description": "This model is a fine-tuned version of MCG-NJU/videomae-base on an unknown dataset. It is trained for video classification task, specifically for RealLifeViolenceSituations.", "model_name": "dangle124/videomae-base-finetuned-RealLifeViolenceSituations-subset"}
{"domain": "Natural Language Processing Summarization", "framework": "Transformers", "functionality": "Text Summarization", "api_name": "sshleifer/distilbart-cnn-6-6", "api_call": "BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-6-6')", "performance": {"dataset": {"cnn_dailymail": {"Rouge 2": 20.17, "Rouge-L": 29.7}, "xsum": {"Rouge 2": 20.92, "Rouge-L": 35.73}}}, "description": "DistilBART model for text summarization, trained on the CNN/Daily Mail and XSum datasets. It is a smaller and faster version of BART, suitable for summarizing English text.", "model_name": "sshleifer/distilbart-cnn-6-6"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "cerebras/Cerebras-GPT-111M", "api_call": "AutoModelForCausalLM.from_pretrained('cerebras/Cerebras-GPT-111M')", "performance": {"dataset": "The Pile", "accuracy": {"PILE_test_xent": 2.566, "Hella-Swag": 0.268, "PIQA": 0.594, "Wino-Grande": 0.488, "Lambada": 0.194, "ARC-e": 0.38, "ARC-c": 0.166, "OpenBookQA": 0.11800000000000001, "Downstream_Average": 0.315}}, "description": "Cerebras-GPT-111M is a transformer-based language model with 111M parameters, trained on the Pile dataset using the GPT-3 style architecture. It is intended for use in research and as a foundation model for NLP applications, ethics, and alignment research. The model can be fine-tuned for various tasks and is licensed under Apache 2.0.", "model_name": "cerebras/Cerebras-GPT-111M"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Depth Estimation", "api_name": "glpn-kitti-finetuned-diode", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-kitti-finetuned-diode')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.5845, "Rmse": 0.6175}}, "description": "This model is a fine-tuned version of vinvino02/glpn-kitti on the diode-subset dataset.", "model_name": "sayakpaul/glpn-kitti-finetuned-diode"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Transformers", "functionality": "Zero-Shot Classification", "api_name": "valhalla/distilbart-mnli-12-6", "api_call": "pipeline('zero-shot-classification', model='valhalla/distilbart-mnli-12-6')", "performance": {"dataset": "MNLI", "accuracy": {"matched_acc": "89.19", "mismatched_acc": "89.01"}}, "description": "distilbart-mnli is the distilled version of bart-large-mnli created using the No Teacher Distillation technique proposed for BART summarisation by Huggingface. It is designed for zero-shot classification tasks.", "model_name": "valhalla/distilbart-mnli-12-6"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", "api_call": "pipeline('zero-shot-classification', model='laion/CLIP-ViT-B-32-laion2B-s34B-b79K')", "performance": {"dataset": "ImageNet-1k", "accuracy": 66.6}, "description": "A CLIP ViT-B/32 model trained with the LAION-2B English subset of LAION-5B using OpenCLIP. It enables researchers to better understand and explore zero-shot, arbitrary image classification. The model can be used for zero-shot image classification, image and text retrieval, among others.", "model_name": "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Speech Emotion Recognition", "api_name": "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim", "api_call": "EmotionModel.from_pretrained('audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim')", "performance": {"dataset": "msp-podcast", "accuracy": "Not provided"}, "description": "Model for Dimensional Speech Emotion Recognition based on Wav2vec 2.0. The model expects a raw audio signal as input and outputs predictions for arousal, dominance and valence in a range of approximately 0...1. In addition, it also provides the pooled states of the last transformer layer. The model was created by fine-tuning Wav2Vec2-Large-Robust on MSP-Podcast (v1.7). The model was pruned from 24 to 12 transformer layers before fine-tuning. An ONNX export of the model is available from doi:10.5281/zenodo.6221127. Further details are given in the associated paper and tutorial.", "model_name": "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Named Entity Recognition", "api_name": "Babelscape/wikineural-multilingual-ner", "api_call": "AutoModelForTokenClassification.from_pretrained('Babelscape/wikineural-multilingual-ner')", "performance": {"dataset": "Babelscape/wikineural-multilingual-ner", "accuracy": "span-based F1-score up to 6 points over previous state-of-the-art systems for data creation"}, "description": "A multilingual Named Entity Recognition (NER) model fine-tuned on the WikiNEuRal dataset, supporting 9 languages (de, en, es, fr, it, nl, pl, pt, ru). It is based on the mBERT architecture and trained on all 9 languages jointly. The model can be used with the Hugging Face Transformers pipeline for NER tasks.", "model_name": "Babelscape/wikineural-multilingual-ner"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "facebook/timesformer-base-finetuned-k400", "api_call": "TimesformerForVideoClassification.from_pretrained('facebook/timesformer-base-finetuned-k400')", "performance": {"dataset": "Kinetics-400", "accuracy": "Not provided"}, "description": "TimeSformer is a video classification model pre-trained on Kinetics-400. It was introduced in the paper TimeSformer: Is Space-Time Attention All You Need for Video Understanding? by Bertasius et al. and first released in this repository. The model can be used for video classification into one of the 400 possible Kinetics-400 labels.", "model_name": "facebook/timesformer-base-finetuned-k400"}
{"domain": "Multimodal Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "microsoft/codebert-base", "api_call": "AutoModel.from_pretrained('microsoft/codebert-base')", "performance": {"dataset": "CodeSearchNet", "accuracy": "n/a"}, "description": "Pretrained weights for CodeBERT: A Pre-Trained Model for Programming and Natural Languages. The model is trained on bi-modal data (documents & code) of CodeSearchNet. This model is initialized with Roberta-base and trained with MLM+RTD objective.", "model_name": "microsoft/codebert-base"}
{"domain": "Audio Text-to-Speech", "framework": "Hugging Face Transformers", "functionality": "Text-to-Speech", "api_name": "microsoft/speecht5_tts", "api_call": "SpeechT5ForTextToSpeech.from_pretrained('microsoft/speecht5_tts')", "performance": {"dataset": "LibriTTS", "accuracy": "Not specified"}, "description": "SpeechT5 model fine-tuned for speech synthesis (text-to-speech) on LibriTTS. It is a unified-modal SpeechT5 framework that explores the encoder-decoder pre-training for self-supervised speech/text representation learning. It can be used for a wide variety of spoken language processing tasks, including automatic speech recognition, speech synthesis, speech translation, voice conversion, speech enhancement, and speaker identification.", "model_name": "microsoft/speecht5_tts"}
{"domain": "Natural Language Processing Text Classification", "framework": "Transformers", "functionality": "Sentiment Analysis", "api_name": "siebert/sentiment-roberta-large-english", "api_call": "pipeline('sentiment-analysis', model='siebert/sentiment-roberta-large-english')", "performance": {"dataset": [{"name": "McAuley and Leskovec (2013) (Reviews)", "accuracy": 98.0}, {"name": "McAuley and Leskovec (2013) (Review Titles)", "accuracy": 87.0}, {"name": "Yelp Academic Dataset", "accuracy": 96.5}, {"name": "Maas et al. (2011)", "accuracy": 96.0}, {"name": "Kaggle", "accuracy": 96.0}, {"name": "Pang and Lee (2005)", "accuracy": 91.0}, {"name": "Nakov et al. (2013)", "accuracy": 88.5}, {"name": "Shamma (2009)", "accuracy": 87.0}, {"name": "Blitzer et al. (2007) (Books)", "accuracy": 92.5}, {"name": "Blitzer et al. (2007) (DVDs)", "accuracy": 92.5}, {"name": "Blitzer et al. (2007) (Electronics)", "accuracy": 95.0}, {"name": "Blitzer et al. (2007) (Kitchen devices)", "accuracy": 98.5}, {"name": "Pang et al. (2002)", "accuracy": 95.5}, {"name": "Speriosu et al. (2011)", "accuracy": 85.5}, {"name": "Hartmann et al. (2019)", "accuracy": 98.0}], "average_accuracy": 93.2}, "description": "This model ('SiEBERT', prefix for 'Sentiment in English') is a fine-tuned checkpoint of RoBERTa-large (Liu et al. 2019). It enables reliable binary sentiment analysis for various types of English-language text. For each instance, it predicts either positive (1) or negative (0) sentiment. The model was fine-tuned and evaluated on 15 data sets from diverse text sources to enhance generalization across different types of texts (reviews, tweets, etc.). Consequently, it outperforms models trained on only one type of text (e.g., movie reviews from the popular SST-2 benchmark) when used on new data as shown below.", "model_name": "siebert/sentiment-roberta-large-english"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Speech Enhancement", "api_name": "speechbrain/mtl-mimic-voicebank", "api_call": "WaveformEnhancement.from_hparams('speechbrain/mtl-mimic-voicebank', 'pretrained_models/mtl-mimic-voicebank')", "performance": {"dataset": "Voicebank", "accuracy": {"Test PESQ": 3.05, "Test COVL": 3.74, "Valid WER": 2.89, "Test WER": 2.8}}, "description": "This repository provides all the necessary tools to perform enhancement and robust ASR training (EN) within SpeechBrain. For a better experience we encourage you to learn more about SpeechBrain. The model performance for release 22-06-21 is: Test PESQ 3.05, Test COVL 3.74, Valid WER 2.89, Test WER 2.80. Works with SpeechBrain v0.5.12.", "model_name": "speechbrain/mtl-mimic-voicebank"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "Zixtrauce/JohnBot", "api_call": "AutoModelForCausalLM.from_pretrained('Zixtrauce/JohnBot')", "performance": {"dataset": "", "accuracy": ""}, "description": "JohnBot is a conversational model based on the gpt2 architecture and trained using the Hugging Face Transformers library. It can be used for generating text responses in a chat-based interface.", "model_name": "Zixtrauce/JohnBot"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/wav2vec2-xlsr-53-espeak-cv-ft", "api_call": "Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-xlsr-53-espeak-cv-ft')", "performance": {"dataset": "common_voice", "accuracy": "Not specified"}, "description": "Wav2Vec2-Large-XLSR-53 finetuned on multi-lingual Common Voice for phonetic label recognition in multiple languages. The model outputs a string of phonetic labels, and a dictionary mapping phonetic labels to words has to be used to map the phonetic output labels to output words.", "model_name": "facebook/wav2vec2-xlsr-53-espeak-cv-ft"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "google/pix2struct-textcaps-base", "api_call": "Pix2StructForConditionalGeneration.from_pretrained('google/pix2struct-textcaps-base')", "performance": {"dataset": "TextCaps", "accuracy": "state-of-the-art"}, "description": "Pix2Struct is an image encoder - text decoder model that is trained on image-text pairs for various tasks, including image captioning and visual question answering. It is pretrained by learning to parse masked screenshots of web pages into simplified HTML. The web, with its richness of visual elements cleanly reflected in the HTML structure, provides a large source of pretraining data well suited to the diversity of downstream tasks.", "model_name": "google/pix2struct-textcaps-base"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/speecht5_vc", "api_call": "SpeechT5ForSpeechToSpeech.from_pretrained('microsoft/speecht5_vc')", "performance": {"dataset": "CMU ARCTIC", "accuracy": "Not specified"}, "description": "SpeechT5 model fine-tuned for voice conversion (speech-to-speech) on CMU ARCTIC. The SpeechT5 framework consists of a shared encoder-decoder network and six modal-specific (speech/text) pre/post-nets. It is designed to improve the modeling capability for both speech and text. This model can be used for speech conversion tasks.", "model_name": "microsoft/speecht5_vc"}
{"domain": "Natural Language Processing Text Classification", "framework": "Transformers", "functionality": "Sentiment Analysis", "api_name": "cardiffnlp/twitter-roberta-base-sentiment", "api_call": "AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')", "performance": {"dataset": "tweet_eval", "accuracy": "Not provided"}, "description": "Twitter-roBERTa-base for Sentiment Analysis. This is a roBERTa-base model trained on ~58M tweets and finetuned for sentiment analysis with the TweetEval benchmark. This model is suitable for English.", "model_name": "cardiffnlp/twitter-roberta-base-sentiment"}
{"domain": "Audio Text-to-Speech", "framework": "ESPnet", "functionality": "Text-to-Speech", "api_name": "mio/Artoria", "api_call": "pipeline('text-to-speech', model='mio/Artoria')", "performance": {"dataset": "fate", "accuracy": "Not provided"}, "description": "This model was trained by mio using fate recipe in espnet. It is a text-to-speech model that can convert text input into speech output.", "model_name": "mio/Artoria"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Image-to-Image", "api_name": "GreeneryScenery/SheepsControlV5", "api_call": "pipeline('image-to-image', model='GreeneryScenery/SheepsControlV5')", "performance": {"dataset": "poloclub/diffusiondb", "accuracy": "Not provided"}, "description": "SheepsControlV5 is an image-to-image model trained on the poloclub/diffusiondb dataset. It is designed for transforming input images into a different style or representation.", "model_name": "GreeneryScenery/SheepsControlV5"}
{"domain": "Audio Voice Activity Detection", "framework": "pyannote.audio", "functionality": "Speaker Diarization", "api_name": "philschmid/pyannote-speaker-diarization-endpoint", "api_call": "Pipeline.from_pretrained('philschmid/pyannote-speaker-diarization-endpoint')", "performance": {"dataset": [{"name": "AISHELL-4", "accuracy": {"DER%": 14.61, "FA%": 3.31, "Miss%": 4.35, "Conf%": 6.95}}, {"name": "AMI Mix-Headset only_words", "accuracy": {"DER%": 18.21, "FA%": 3.2800000000000002, "Miss%": 11.07, "Conf%": 3.87}}, {"name": "AMI Array1-01 only_words", "accuracy": {"DER%": 29.0, "FA%": 2.71, "Miss%": 21.61, "Conf%": 4.68}}, {"name": "CALLHOME Part2", "accuracy": {"DER%": 30.24, "FA%": 3.71, "Miss%": 16.86, "Conf%": 9.66}}, {"name": "DIHARD 3 Full", "accuracy": {"DER%": 20.99, "FA%": 4.25, "Miss%": 10.74, "Conf%": 6.0}}, {"name": "REPERE Phase 2", "accuracy": {"DER%": 12.62, "FA%": 1.55, "Miss%": 3.3, "Conf%": 7.76}}, {"name": "VoxConverse v0.0.2", "accuracy": {"DER%": 12.76, "FA%": 3.45, "Miss%": 3.85, "Conf%": 5.46}}]}, "description": "A speaker diarization pipeline that uses pyannote.audio to perform voice activity detection, speaker change detection, and overlapped speech detection. It can handle fully automatic processing with no manual intervention and can be fine-tuned with various hyperparameters.", "model_name": "philschmid/pyannote-speaker-diarization-endpoint"}
{"domain": "Audio Audio-to-Audio", "framework": "SpeechBrain", "functionality": "Audio Source Separation", "api_name": "speechbrain/sepformer-wham", "api_call": "separator.from_hparams(source='speechbrain/sepformer-wham', savedir='pretrained_models/sepformer-wham')", "performance": {"dataset": "WHAM!", "accuracy": "16.3 dB SI-SNRi"}, "description": "This repository provides all the necessary tools to perform audio source separation with a SepFormer model, implemented with SpeechBrain, and pretrained on WHAM! dataset, which is basically a version of WSJ0-Mix dataset with environmental noise.", "model_name": "speechbrain/sepformer-wham"}
{"domain": "Audio Automatic Speech Recognition", "framework": "pyannote.audio", "functionality": "overlapped-speech-detection", "api_name": "pyannote/overlapped-speech-detection", "api_call": "pipeline.from_pretrained('pyannote/overlapped-speech-detection', use_auth_token='ACCESS_TOKEN_GOES_HERE')", "performance": {"dataset": "ami", "accuracy": null}, "description": "Automatic overlapped speech detection using pyannote.audio framework. The model detects when two or more speakers are active in an audio file.", "model_name": "pyannote/overlapped-speech-detection"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image Generation", "api_name": "CompVis/stable-diffusion-v1-4", "api_call": "StableDiffusionPipeline.from_pretrained('CompVis/stable-diffusion-v1-4')", "performance": {"dataset": "COCO2017 validation set", "accuracy": "Not optimized for FID scores"}, "description": "Stable Diffusion is a latent text-to-image diffusion model capable of generating photo-realistic images given any text input. The Stable-Diffusion-v1-4 checkpoint was fine-tuned on 225k steps at resolution 512x512 on laion-aesthetics v2 5+ and 10% dropping of the text-conditioning to improve classifier-free guidance sampling. This model is intended for research purposes and can be used for generating artworks, design, educational or creative tools, and research on generative models.", "model_name": "CompVis/stable-diffusion-v1-4"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "codet5-large-ntp-py", "api_call": "T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-large-ntp-py')", "performance": {"dataset": "APPS benchmark", "accuracy": "See Table 5 of the paper"}, "description": "CodeT5 is a family of encoder-decoder language models for code from the paper: CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation by Yue Wang, Weishi Wang, Shafiq Joty, and Steven C.H. Hoi. The checkpoint included in this repository is denoted as CodeT5-large-ntp-py (770M), which is introduced by the paper: CodeRL: Mastering Code Generation through Pretrained Models and Deep Reinforcement Learning by Hung Le, Yue Wang, Akhilesh Deepak Gotmare, Silvio Savarese, Steven C.H. Hoi.", "model_name": "Salesforce/codet5-large-ntp-py"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "deepset/roberta-large-squad2", "api_call": "pipeline('question-answering', model='deepset/roberta-large-squad2')", "performance": {"dataset": "squad_v2", "accuracy": "Not provided"}, "description": "A pre-trained RoBERTa model for question answering tasks, specifically trained on the SQuAD v2 dataset. It can be used to answer questions based on a given context.", "model_name": "deepset/roberta-large-squad2"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Multilingual Translation", "api_name": "facebook/m2m100_418M", "api_call": "M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_418M')", "performance": {"dataset": "WMT", "accuracy": "Not provided"}, "description": "M2M100 is a multilingual encoder-decoder (seq-to-seq) model trained for Many-to-Many multilingual translation. It can directly translate between the 9,900 directions of 100 languages. To translate into a target language, the target language id is forced as the first generated token.", "model_name": "facebook/m2m100_418M"}
{"domain": "Natural Language Processing Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "dmis-lab/biobert-v1.1", "api_call": "AutoModel.from_pretrained('dmis-lab/biobert-v1.1')", "performance": {"dataset": "", "accuracy": ""}, "description": "BioBERT is a pre-trained biomedical language representation model for biomedical text mining tasks such as biomedical named entity recognition, relation extraction, and question answering.", "model_name": "dmis-lab/biobert-v1.1"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "deepset/roberta-base-squad2", "api_call": "AutoModelForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2')", "performance": {"dataset": "squad_v2", "accuracy": {"exact": 79.8702939442, "f1": 82.9125116958}}, "description": "This is the roberta-base model, fine-tuned using the SQuAD2.0 dataset for the task of Question Answering. It's been trained on question-answer pairs, including unanswerable questions.", "model_name": "deepset/roberta-base-squad2"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "ast-finetuned-speech-commands-v2", "api_call": "AutoModelForAudioClassification.from_pretrained('MIT/ast-finetuned-speech-commands-v2')", "performance": {"dataset": "Speech Commands v2", "accuracy": "98.120"}, "description": "Audio Spectrogram Transformer (AST) model fine-tuned on Speech Commands v2. It was introduced in the paper AST: Audio Spectrogram Transformer by Gong et al. and first released in this repository. The Audio Spectrogram Transformer is equivalent to ViT, but applied on audio. Audio is first turned into an image (as a spectrogram), after which a Vision Transformer is applied. The model gets state-of-the-art results on several audio classification benchmarks.", "model_name": "MIT/ast-finetuned-speech-commands-v2"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "vintedois-diffusion-v0-1", "api_call": "pipeline('text-to-image', model='22h/vintedois-diffusion-v0-1')", "performance": {"dataset": "large amount of high quality images", "accuracy": "not specified"}, "description": "Vintedois (22h) Diffusion model trained by Predogl and piEsposito with open weights, configs and prompts. This model generates beautiful images without a lot of prompt engineering. It can also generate high fidelity faces in a small number of steps.", "model_name": "22h/vintedois-diffusion-v0-1"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "pygmalion-6b", "api_call": "AutoModelForCausalLM.from_pretrained('waifu-workshop/pygmalion-6b')", "performance": {"dataset": "56MB of dialogue data gathered from multiple sources", "accuracy": "Not specified"}, "description": "Pygmalion 6B is a proof-of-concept dialogue model based on EleutherAI's GPT-J-6B. It is fine-tuned on 56MB of dialogue data gathered from multiple sources, which includes both real and partially machine-generated conversations. The model is intended for conversational text generation and can be used to play a character in a dialogue.", "model_name": "waifu-workshop/pygmalion-6b"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "text2text-generation", "api_name": "blip2-opt-6.7b", "api_call": "pipeline('text2text-generation', model='salesforce/blip2-opt-6.7b')", "performance": {"dataset": "LAION", "accuracy": "Not specified"}, "description": "BLIP-2 model, leveraging OPT-6.7b (a large language model with 6.7 billion parameters). It was introduced in the paper BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models by Li et al. and first released in this repository. The goal for the model is to predict the next text token, given the query embeddings and the previous text. This allows the model to be used for tasks like image captioning, visual question answering (VQA), and chat-like conversations by feeding the image and the previous conversation as prompt to the model.", "model_name": "salesforce/blip2-opt-6.7b"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Language model", "api_name": "google/flan-t5-large", "api_call": "T5ForConditionalGeneration.from_pretrained('google/flan-t5-large')", "performance": {"dataset": [{"name": "MMLU", "accuracy": "75.2%"}]}, "description": "FLAN-T5 large is a language model fine-tuned on over 1000 tasks and multiple languages. It achieves state-of-the-art performance on several benchmarks, including 75.2% on five-shot MMLU. The model is based on pretrained T5 and fine-tuned with instructions for better zero-shot and few-shot performance. It can be used for research on language models, zero-shot NLP tasks, in-context few-shot learning NLP tasks, reasoning, question answering, and advancing fairness and safety research.", "model_name": "google/flan-t5-large"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "microsoft/beit-base-patch16-224-pt22k-ft22k", "api_call": "BeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')", "performance": {"dataset": "ImageNet-22k", "accuracy": "Not specified"}, "description": "BEiT model pre-trained in a self-supervised fashion on ImageNet-22k - also called ImageNet-21k (14 million images, 21,841 classes) at resolution 224x224, and fine-tuned on the same dataset at resolution 224x224. It was introduced in the paper BEIT: BERT Pre-Training of Image Transformers by Hangbo Bao, Li Dong and Furu Wei and first released in this repository.", "model_name": "microsoft/beit-base-patch16-224-pt22k-ft22k"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "Helsinki-NLP/opus-mt-en-de", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-en-de')", "performance": {"dataset": "newstest2018-ende.en.de", "accuracy": {"BLEU": 45.2, "chr-F": 0.6900000000000001}}, "description": "The Helsinki-NLP/opus-mt-en-de model is a translation model developed by the Language Technology Research Group at the University of Helsinki. It translates English text to German using the Hugging Face Transformers library. The model is trained on the OPUS dataset and has a BLEU score of 45.2 on the newstest2018-ende.en.de dataset.", "model_name": "Helsinki-NLP/opus-mt-en-de"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "uclanlp/visualbert-vqa", "api_call": "AutoModelForQuestionAnswering.from_pretrained('uclanlp/visualbert-vqa')", "performance": {"dataset": "", "accuracy": ""}, "description": "A VisualBERT model for Visual Question Answering.", "model_name": "uclanlp/visualbert-vqa"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "deepset/bert-base-cased-squad2", "api_call": "AutoModelForQuestionAnswering.from_pretrained('deepset/bert-base-cased-squad2')", "performance": {"dataset": "squad_v2", "accuracy": {"exact_match": 71.152, "f1": 74.671}}, "description": "This is a BERT base cased model trained on SQuAD v2", "model_name": "deepset/bert-base-cased-squad2"}
{"domain": "Audio Text-to-Speech", "framework": "ESPnet", "functionality": "Text-to-Speech", "api_name": "kan-bayashi_jvs_tts_finetune_jvs001_jsut_vits_raw_phn_jaconv_pyopenjta-truncated-178804", "api_call": "AutoModelForCausalLM.from_pretrained('espnet/kan-bayashi_jvs_tts_finetune_jvs001_jsut_vits_raw_phn_jaconv_pyopenjta-truncated-178804')", "performance": {"dataset": "", "accuracy": ""}, "description": "A Japanese text-to-speech model trained using the ESPnet framework. It is designed to convert text input into natural-sounding speech.", "model_name": "espnet/kan-bayashi_jvs_tts_finetune_jvs001_jsut_vits_raw_phn_jaconv_pyopenjta-truncated-178804"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Transformers", "functionality": "Sentence Correction", "api_name": "flexudy/t5-base-multi-sentence-doctor", "api_call": "AutoModelWithLMHead.from_pretrained('flexudy/t5-base-multi-sentence-doctor')", "performance": {"dataset": "tatoeba", "accuracy": "Not specified"}, "description": "Sentence doctor is a T5 model that attempts to correct the errors or mistakes found in sentences. Model works on English, German and French text.", "model_name": "flexudy/t5-base-multi-sentence-doctor"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "microsoft/tapex-large-finetuned-wtq", "api_call": "BartForConditionalGeneration.from_pretrained('microsoft/tapex-large-finetuned-wtq')", "performance": {"dataset": "wikitablequestions", "accuracy": "Not provided"}, "description": "TAPEX (Table Pre-training via Execution) is a conceptually simple and empirically powerful pre-training approach to empower existing models with table reasoning skills. TAPEX realizes table pre-training by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically synthesizing executable SQL queries. TAPEX is based on the BART architecture, the transformer encoder-decoder (seq2seq) model with a bidirectional (BERT-like) encoder and an autoregressive (GPT-like) decoder. This model is the tapex-large model fine-tuned on the WikiTableQuestions dataset.", "model_name": "microsoft/tapex-large-finetuned-wtq"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Image Generation", "api_name": "runwayml/stable-diffusion-inpainting", "api_call": "StableDiffusionInpaintPipeline.from_pretrained('runwayml/stable-diffusion-inpainting', revision=fp16, torch_dtype=torch.float16)", "performance": {"dataset": {"name": "LAION-2B (en)", "accuracy": "Not optimized for FID scores"}}, "description": "Stable Diffusion Inpainting is a latent text-to-image diffusion model capable of generating photo-realistic images given any text input, with the extra capability of inpainting the pictures by using a mask.", "model_name": "runwayml/stable-diffusion-inpainting"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Fill-Mask", "api_name": "uer/albert-base-chinese-cluecorpussmall", "api_call": "AlbertForMaskedLM.from_pretrained('uer/albert-base-chinese-cluecorpussmall')", "performance": {"dataset": "CLUECorpusSmall", "accuracy": "Not provided"}, "description": "This is the set of Chinese ALBERT models pre-trained by UER-py on the CLUECorpusSmall dataset. The model can be used for tasks like text generation and feature extraction.", "model_name": "uer/albert-base-chinese-cluecorpussmall"}
{"domain": "Multimodal Document Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "frizwankhan/entity-linking-model-final", "api_call": "pipeline('question-answering', model='frizwankhan/entity-linking-model-final')", "performance": {"dataset": "", "accuracy": ""}, "description": "A Document Question Answering model based on layoutlmv2", "model_name": "frizwankhan/entity-linking-model-final"}
{"domain": "Audio Text-to-Speech", "framework": "ESPnet", "functionality": "Text-to-Speech", "api_name": "imdanboy/jets", "api_call": "pipeline('text-to-speech', model='imdanboy/jets')", "performance": {"dataset": "ljspeech", "accuracy": null}, "description": "This model was trained by imdanboy using ljspeech recipe in espnet.", "model_name": "imdanboy/jets"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/mask2former-swin-large-cityscapes-semantic", "api_call": "Mask2FormerForUniversalSegmentation.from_pretrained('facebook/mask2former-swin-large-cityscapes-semantic')", "performance": {"dataset": "Cityscapes", "accuracy": "Not specified"}, "description": "Mask2Former model trained on Cityscapes semantic segmentation (large-sized version, Swin backbone). It addresses instance, semantic and panoptic segmentation by predicting a set of masks and corresponding labels. The model outperforms the previous SOTA, MaskFormer, in terms of performance and efficiency.", "model_name": "facebook/mask2former-swin-large-cityscapes-semantic"}
{"domain": "Computer Vision Image-to-Image", "framework": "Diffusers", "functionality": "Text-to-Image Diffusion Models", "api_name": "lllyasviel/control_v11p_sd15_openpose", "api_call": "ControlNetModel.from_pretrained('lllyasviel/control_v11p_sd15_openpose')", "performance": {"dataset": "Not specified", "accuracy": "Not specified"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on openpose images.", "model_name": "lllyasviel/control_v11p_sd15_openpose"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "facebook/detr-resnet-50", "api_call": "DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')", "performance": {"dataset": "COCO 2017 validation", "accuracy": "42.0 AP"}, "description": "DEtection TRansformer (DETR) model trained end-to-end on COCO 2017 object detection (118k annotated images). It was introduced in the paper End-to-End Object Detection with Transformers by Carion et al. and first released in this repository.", "model_name": "facebook/detr-resnet-50"}
{"domain": "Natural Language Processing Token Classification", "framework": "Hugging Face Transformers", "functionality": "Named Entity Recognition", "api_name": "Jean-Baptiste/camembert-ner", "api_call": "AutoModelForTokenClassification.from_pretrained('Jean-Baptiste/camembert-ner')", "performance": {"dataset": "wikiner-fr", "accuracy": {"overall_f1": 0.8914000000000001, "PER_f1": 0.9483, "ORG_f1": 0.8181, "LOC_f1": 0.8955000000000001, "MISC_f1": 0.8146}}, "description": "camembert-ner is a Named Entity Recognition (NER) model fine-tuned from camemBERT on the wikiner-fr dataset. It can recognize entities such as persons, organizations, locations, and miscellaneous entities.", "model_name": "Jean-Baptiste/camembert-ner"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "videomae-base-ssv2", "api_call": "VideoMAEForPreTraining.from_pretrained('MCG-NJU/videomae-base-short-ssv2')", "performance": {"dataset": "Something-Something-v2", "accuracy": ""}, "description": "VideoMAE is an extension of Masked Autoencoders (MAE) to video. The architecture of the model is very similar to that of a standard Vision Transformer (ViT), with a decoder on top for predicting pixel values for masked patches. Videos are presented to the model as a sequence of fixed-size patches (resolution 16x16), which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds fixed sine/cosine position embeddings before feeding the sequence to the layers of the Transformer encoder. By pre-training the model, it learns an inner representation of videos that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled videos for instance, you can train a standard classifier by placing a linear layer on top of the pre-trained encoder. One typically places a linear layer on top of the [CLS] token, as the last hidden state of this token can be seen as a representation of an entire video.", "model_name": "MCG-NJU/videomae-base-short-ssv2"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/all-MiniLM-L12-v1", "api_call": "SentenceTransformer('sentence-transformers/all-MiniLM-L12-v1')", "performance": {"dataset": [{"name": "Sentence Embeddings Benchmark", "url": "https://seb.sbert.net"}], "accuracy": "Not provided"}, "description": "This is a sentence-transformers model that maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/all-MiniLM-L12-v1"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-221215-093747", "api_call": "AutoModel.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-221215-093747')", "performance": {"dataset": "DIODE", "accuracy": ""}, "description": "A depth estimation model fine-tuned on the DIODE dataset.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-221215-093747"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "google/flan-t5-xl", "api_call": "T5ForConditionalGeneration.from_pretrained('google/flan-t5-xl')", "performance": {"dataset": [{"name": "MMLU", "accuracy": "75.2%"}]}, "description": "FLAN-T5 XL is a large-scale language model fine-tuned on more than 1000 tasks covering multiple languages. It achieves state-of-the-art performance on several benchmarks and is designed for research on zero-shot and few-shot NLP tasks, such as reasoning, question answering, and understanding the limitations of current large language models.", "model_name": "google/flan-t5-xl"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Diffusers", "api_name": "Minecraft-Skin-Diffusion", "api_call": "DDPMPipeline.from_pretrained('WiNE-iNEFF/Minecraft-Skin-Diffusion')", "performance": {"dataset": "", "accuracy": ""}, "description": "Unconditional Image Generation model for generating Minecraft skins using diffusion-based methods.", "model_name": "WiNE-iNEFF/Minecraft-Skin-Diffusion"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face", "functionality": "Zero-Shot Image Classification", "api_name": "laion/CLIP-convnext_large_d.laion2B-s26B-b102K-augreg", "api_call": "pipeline('image-classification', model='laion/CLIP-convnext_large_d.laion2B-s26B-b102K-augreg')", "performance": {"dataset": "ImageNet-1k", "accuracy": "75.9%"}, "description": "A series of CLIP ConvNeXt-Large (w/ extra text depth, vision MLP head) models trained on LAION-2B (english), a subset of LAION-5B, using OpenCLIP. The models are trained at 256x256 image resolution and achieve a 75.9 top-1 zero-shot accuracy on ImageNet-1k.", "model_name": "laion/CLIP-convnext_large_d.laion2B-s26B-b102K-augreg"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Hugging Face Transformers", "functionality": "Unconditional Image Generation", "api_name": "utyug1/sd-class-butterflies-32", "api_call": "DDPMPipeline.from_pretrained('utyug1/sd-class-butterflies-32')", "performance": {"dataset": "Not specified", "accuracy": "Not specified"}, "description": "This model is a diffusion model for unconditional image generation of cute butterflies.", "model_name": "utyug1/sd-class-butterflies-32"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Masked Language Modeling", "api_name": "bert-large-cased", "api_call": "pipeline('fill-mask', model='bert-large-cased')", "performance": {"dataset": {"SQUAD 1.1": {"F1": 91.5, "EM": 84.8}, "Multi NLI": {"accuracy": 86.09}}}, "description": "BERT large model (cased) pretrained on English language using a masked language modeling (MLM) objective. It has 24 layers, 1024 hidden dimensions, 16 attention heads, and 336M parameters.", "model_name": "bert-large-cased"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/blenderbot-1B-distill", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('facebook/blenderbot-1B-distill')", "performance": {"dataset": "blended_skill_talk", "accuracy": "Not mentioned"}, "description": "BlenderBot-1B is a large-scale open-domain chatbot model that can engage in conversations, ask and answer questions, and display knowledge, empathy, and personality. This distilled version is smaller and faster than the original 9.4B parameter model, making it more accessible for use.", "model_name": "facebook/blenderbot-1B-distill"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Depth Estimation", "api_name": "glpn-nyu", "api_call": "GLPNForDepthEstimation.from_pretrained('vinvino02/glpn-nyu')", "performance": {"dataset": "NYUv2", "accuracy": "Not provided"}, "description": "Global-Local Path Networks (GLPN) model trained on NYUv2 for monocular depth estimation. It was introduced in the paper Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth by Kim et al. and first released in this repository.", "model_name": "vinvino02/glpn-nyu"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "fxmarty/resnet-tiny-beans", "api_call": "pipeline('image-classification', model='fxmarty/resnet-tiny-beans')", "performance": {"dataset": "beans", "accuracy": "Not provided"}, "description": "A model trained on the beans dataset, just for testing and having a really tiny model.", "model_name": "fxmarty/resnet-tiny-beans"}
{"domain": "Audio Audio-to-Audio", "framework": "Hugging Face Transformers", "functionality": "Asteroid", "api_name": "ConvTasNet_Libri2Mix_sepclean_16k", "api_call": "Asteroid('JorisCos/ConvTasNet_Libri2Mix_sepclean_16k')", "performance": {"dataset": "Libri2Mix", "accuracy": {"si_sdr": 15.2436713569, "si_sdr_imp": 15.2430341785, "sdr": 15.6681089196, "sdr_imp": 15.578229918, "sir": 25.2951007566, "sir_imp": 25.2052199213, "sar": 16.3076825902, "sar_imp": -51.6498996376, "stoi": 0.9394951175, "stoi_imp": 0.2264019274}}, "description": "This model was trained by Joris Cosentino using the librimix recipe in Asteroid. It was trained on the sep_clean task of the Libri2Mix dataset.", "model_name": "JorisCos/ConvTasNet_Libri2Mix_sepclean_16k"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face", "functionality": "Zero-Shot Image Classification", "api_name": "CLIPModel.from_pretrained('laion/CLIP-convnext_base_w-laion2B-s13B-b82K')", "api_call": "CLIPModel.from_pretrained('laion/CLIP-convnext_base_w-laion2B-s13B-b82K')", "performance": {"dataset": "ImageNet-1k", "accuracy": "70.8 - 71.7%"}, "description": "A series of CLIP ConvNeXt-Base (w/ wide embed dim) models trained on subsets of LAION-5B using OpenCLIP. The models achieve between 70.8 and 71.7 zero-shot top-1 accuracy on ImageNet-1k. The models can be used for zero-shot image classification, image and text retrieval, and other related tasks.", "model_name": "laion/CLIP-convnext_base_w-laion2B-s13B-b82K"}
{"domain": "Computer Vision Unconditional Image Generation", "framework": "Transformers", "functionality": "Unconditional Image Generation", "api_name": "ceyda/butterfly_cropped_uniq1K_512", "api_call": "LightweightGAN.from_pretrained('ceyda/butterfly_cropped_uniq1K_512')", "performance": {"dataset": "huggan/smithsonian_butterflies_subset", "accuracy": "FID score on 100 images"}, "description": "Butterfly GAN model based on the paper 'Towards Faster and Stabilized GAN Training for High-fidelity Few-shot Image Synthesis'. The model is intended for fun and learning purposes. It was trained on 1000 images from the huggan/smithsonian_butterflies_subset dataset, with a focus on low data training as mentioned in the paper. The model generates high-quality butterfly images.", "model_name": "ceyda/butterfly_cropped_uniq1K_512"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "facebook/dino-vits8", "api_call": "ViTModel.from_pretrained('facebook/dino-vits8')", "performance": {"dataset": "imagenet-1k", "accuracy": null}, "description": "Vision Transformer (ViT) model trained using the DINO method. It was introduced in the paper Emerging Properties in Self-Supervised Vision Transformers by Mathilde Caron, Hugo Touvron, Ishan Misra, Herv\u00e9 J\u00e9gou, Julien Mairal, Piotr Bojanowski, Armand Joulin and first released in this repository.", "model_name": "facebook/dino-vits8"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": ["Translation", "Summarization", "Question Answering", "Text Classification", "Text Regression"], "api_name": "t5-small", "api_call": "T5Model.from_pretrained('t5-small')", "performance": {"dataset": "c4", "accuracy": "See research paper, Table 14 for full results"}, "description": "T5-Small is a Text-To-Text Transfer Transformer (T5) model with 60 million parameters. It is designed to perform a variety of NLP tasks, including machine translation, document summarization, question answering, and classification tasks. The model is pre-trained on the Colossal Clean Crawled Corpus (C4) and can be fine-tuned for specific tasks.", "model_name": "t5-small"}
{"domain": "Multimodal Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "rasa/LaBSE", "api_call": "AutoModel.from_pretrained('rasa/LaBSE')", "performance": {"dataset": "", "accuracy": ""}, "description": "LaBSE (Language-agnostic BERT Sentence Embedding) model for extracting sentence embeddings in multiple languages.", "model_name": "rasa/LaBSE"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Embeddings", "api_name": "sentence-transformers/distilbert-base-nli-stsb-mean-tokens", "api_call": "SentenceTransformer('sentence-transformers/distilbert-base-nli-stsb-mean-tokens')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Not provided"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/distilbert-base-nli-stsb-mean-tokens"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "hkunlp/instructor-base", "api_call": "INSTRUCTOR('hkunlp/instructor-base')", "performance": {"dataset": "MTEB AmazonCounterfactualClassification (en)", "accuracy": 86.209}, "description": "Instructor is an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) and domains (e.g., science, finance, etc.) by simply providing the task instruction, without any finetuning. Instructor achieves state-of-the-art performance on 70 diverse embedding tasks.", "model_name": "hkunlp/instructor-base"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Conversational", "api_name": "microsoft/GODEL-v1_1-base-seq2seq", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('microsoft/GODEL-v1_1-base-seq2seq')", "performance": {"dataset": "Reddit discussion thread, instruction and knowledge grounded dialogs", "accuracy": "N/A"}, "description": "GODEL is a large-scale pre-trained model for goal-directed dialogs. It is parameterized with a Transformer-based encoder-decoder model and trained for response generation grounded in external text, which allows more effective fine-tuning on dialog tasks that require conditioning the response on information that is external to the current conversation (e.g., a retrieved document). The pre-trained model can be efficiently fine-tuned and adapted to accomplish a new dialog task with a handful of task-specific dialogs. The v1.1 model is trained on 551M multi-turn dialogs from Reddit discussion thread, and 5M instruction and knowledge grounded dialogs.", "model_name": "microsoft/GODEL-v1_1-base-seq2seq"}
{"domain": "Audio Voice Activity Detection", "framework": "pyannote.audio", "functionality": "Speaker diarization", "api_name": "johnislarry/cloned-pyannote-speaker-diarization-endpoint", "api_call": "Pipeline.from_pretrained('pyannote/speaker-diarization@2.1',use_auth_token='ACCESS_TOKEN_GOES_HERE')", "performance": {"dataset": [{"name": "AISHELL-4", "accuracy": {"DER%": 14.61, "FA%": 3.31, "Miss%": 4.35, "Conf%": 6.95}}, {"name": "AMI Mix-Headset only_words", "accuracy": {"DER%": 18.21, "FA%": 3.2800000000000002, "Miss%": 11.07, "Conf%": 3.87}}, {"name": "AMI Array1-01 only_words", "accuracy": {"DER%": 29.0, "FA%": 2.71, "Miss%": 21.61, "Conf%": 4.68}}, {"name": "CALLHOME Part2", "accuracy": {"DER%": 30.24, "FA%": 3.71, "Miss%": 16.86, "Conf%": 9.66}}, {"name": "DIHARD 3 Full", "accuracy": {"DER%": 20.99, "FA%": 4.25, "Miss%": 10.74, "Conf%": 6.0}}, {"name": "REPERE Phase 2", "accuracy": {"DER%": 12.62, "FA%": 1.55, "Miss%": 3.3, "Conf%": 7.76}}, {"name": "VoxConverse v0.0.2", "accuracy": {"DER%": 12.76, "FA%": 3.45, "Miss%": 3.85, "Conf%": 5.46}}]}, "description": "This API provides speaker diarization functionality using the pyannote.audio framework. It is capable of processing audio files and outputting speaker diarization results in RTTM format. The API supports providing the number of speakers, minimum and maximum number of speakers, and adjusting the segmentation onset threshold.", "model_name": "pyannote/speaker-diarization"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "kha-white/manga-ocr-base", "api_call": "pipeline('ocr', model='kha-white/manga-ocr-base')", "performance": {"dataset": "manga109s", "accuracy": ""}, "description": "Optical character recognition for Japanese text, with the main focus being Japanese manga. It uses Vision Encoder Decoder framework. Manga OCR can be used as a general purpose printed Japanese OCR, but its main goal was to provide a high quality text recognition, robust against various scenarios specific to manga: both vertical and horizontal text, text with furigana, text overlaid on images, wide variety of fonts and font styles, and low quality images.", "model_name": "kha-white/manga-ocr-base"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "Zixtrauce/BaekBot", "api_call": "pipeline('conversational', model='Zixtrauce/BaekBot')", "performance": {"dataset": "", "accuracy": ""}, "description": "BaekBot is a conversational model based on the GPT-2 architecture for text generation. It can be used for generating human-like responses in a chat-like environment.", "model_name": "Zixtrauce/BaekBot"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-medium-finetuned-sqa", "api_call": "pipeline('table-question-answering', model='google/tapas-medium-finetuned-sqa')", "performance": {"dataset": "msr_sqa", "accuracy": 0.6561}, "description": "TAPAS medium model fine-tuned on Sequential Question Answering (SQA). This model is pretrained on a large corpus of English data from Wikipedia and uses relative position embeddings. It can be used for answering questions related to a table in a conversational set-up.", "model_name": "google/tapas-medium-finetuned-sqa"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "facebook/opt-66b", "api_call": "AutoModelForCausalLM.from_pretrained('facebook/opt-66b', torch_dtype=torch.float16)", "performance": {"dataset": "GPT-3", "accuracy": "roughly matched"}, "description": "OPT (Open Pre-trained Transformer) is a suite of decoder-only pre-trained transformers ranging from 125M to 175B parameters, designed to enable reproducible and responsible research at scale. OPT models are trained to roughly match the performance and sizes of the GPT-3 class of models, while also applying the latest best practices in data collection and efficient training. The pretrained-only model can be used for prompting for evaluation of downstream tasks as well as text generation.", "model_name": "facebook/opt-66b"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Program Synthesis", "api_name": "Salesforce/codegen-350M-multi", "api_call": "AutoTokenizer.from_pretrained('Salesforce/codegen-350M-multi')", "performance": {"dataset": "HumanEval and MTPB", "accuracy": "Refer to the paper for accuracy details"}, "description": "CodeGen is a family of autoregressive language models for program synthesis. The checkpoint included in this repository is denoted as CodeGen-Multi 350M, where Multi means the model is initialized with CodeGen-NL 350M and further pre-trained on a dataset of multiple programming languages, and 350M refers to the number of trainable parameters. The model is capable of extracting features from given natural language and programming language texts, and calculating the likelihood of them. It is best at program synthesis, generating executable code given English prompts, and can complete partially-generated code as well.", "model_name": "Salesforce/codegen-350M-multi"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "vit_base_patch16_224.augreg2_in21k_ft_in1k", "api_call": "ViTForImageClassification.from_pretrained('timm/vit_base_patch16_224.augreg2_in21k_ft_in1k')", "performance": {"dataset": "", "accuracy": ""}, "description": "A Vision Transformer model for image classification, pretrained on ImageNet-21k and fine-tuned on ImageNet-1k.", "model_name": "timm/vit_base_patch16_224.augreg2_in21k_ft_in1k"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "nikcheerla/nooks-amd-detection-v2-full", "api_call": "SentenceTransformer.from_pretrained('nikcheerla/nooks-amd-detection-v2-full')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Not provided"}, "description": "This is a sentence-transformers model that maps sentences and paragraphs to a 768-dimensional dense vector space. It can be used for tasks like clustering or semantic search.", "model_name": "nikcheerla/nooks-amd-detection-v2-full"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "shi-labs/oneformer_ade20k_swin_tiny", "api_call": "OneFormerForUniversalSegmentation.from_pretrained('shi-labs/oneformer_ade20k_swin_tiny')", "performance": {"dataset": "ADE20k", "accuracy": "Not provided"}, "description": "OneFormer is the first multi-task universal image segmentation framework. It needs to be trained only once with a single universal architecture, a single model, and on a single dataset, to outperform existing specialized models across semantic, instance, and panoptic segmentation tasks. OneFormer uses a task token to condition the model on the task in focus, making the architecture task-guided for training, and task-dynamic for inference, all with a single model.", "model_name": "shi-labs/oneformer_ade20k_swin_tiny"}
{"domain": "Multimodal Image-to-Text", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "google/deplot", "api_call": "Pix2StructForConditionalGeneration.from_pretrained('google/deplot')", "performance": {"dataset": "ChartQA", "accuracy": "24.0% improvement over finetuned SOTA"}, "description": "DePlot is a model that translates the image of a plot or chart to a linearized table. It decomposes the challenge of visual language reasoning into two steps: (1) plot-to-text translation, and (2) reasoning over the translated text. The output of DePlot can then be directly used to prompt a pretrained large language model (LLM), exploiting the few-shot reasoning capabilities of LLMs.", "model_name": "google/deplot"}
{"domain": "Reinforcement Learning", "framework": "Stable-Baselines3", "functionality": "deep-reinforcement-learning", "api_name": "ppo-PongNoFrameskip-v4", "api_call": "load_from_hub(repo_id='sb3/ppo-PongNoFrameskip-v4',filename='{MODEL FILENAME}.zip',)", "performance": {"dataset": "PongNoFrameskip-v4", "accuracy": "21.00 +/- 0.00"}, "description": "This is a trained model of a PPO agent playing PongNoFrameskip-v4 using the stable-baselines3 library and the RL Zoo. The RL Zoo is a training framework for Stable Baselines3 reinforcement learning agents, with hyperparameter optimization and pre-trained agents included.", "model_name": "sb3/ppo-PongNoFrameskip-v4"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Document Question Answering", "api_name": "xhyi/layoutlmv3_docvqa_t11c5000", "api_call": "pipeline('question-answering', model='xhyi/layoutlmv3_docvqa_t11c5000')", "performance": {"dataset": "DocVQA", "accuracy": ""}, "description": "LayoutLMv3 model trained for document question answering task.", "model_name": "xhyi/layoutlmv3_docvqa_t11c5000"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Embeddings", "api_name": "sentence-transformers/bert-base-nli-mean-tokens", "api_call": "SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')", "performance": {"dataset": "https://seb.sbert.net", "accuracy": "Not provided"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/bert-base-nli-mean-tokens"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "joeddav/xlm-roberta-large-xnli", "api_call": "XLMRobertaForSequenceClassification.from_pretrained('joeddav/xlm-roberta-large-xnli')", "performance": {"dataset": {"xnli": "56.6k", "multi_nli": "8.73k"}, "accuracy": "Not specified"}, "description": "This model takes xlm-roberta-large and fine-tunes it on a combination of NLI data in 15 languages. It is intended to be used for zero-shot text classification, such as with the Hugging Face ZeroShotClassificationPipeline.", "model_name": "joeddav/xlm-roberta-large-xnli"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "wav2vec2-xlsr-53-russian-emotion-recognition", "api_call": "Wav2Vec2Model.from_pretrained('facebook/wav2vec2-large-xlsr-53')", "performance": {"dataset": "Russian Emotional Speech Dialogs", "accuracy": "72%"}, "description": "A model trained to recognize emotions in Russian speech using wav2vec2. It can classify emotions such as anger, disgust, enthusiasm, fear, happiness, neutral, and sadness.", "model_name": "facebook/wav2vec2-large-xlsr-53"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "swin2SR-lightweight-x2-64", "api_call": "Swin2SRForConditionalGeneration.from_pretrained('condef/Swin2SR-lightweight-x2-64')", "performance": {"dataset": "", "accuracy": ""}, "description": "Swin2SR model that upscales images x2. It was introduced in the paper Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration by Conde et al. and first released in this repository. This model is intended for lightweight image super resolution.", "model_name": "condef/Swin2SR-lightweight-x2-64"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Diffusion-based text-to-image generation model", "api_name": "lllyasviel/control_v11e_sd15_ip2p", "api_call": "ControlNetModel.from_pretrained('lllyasviel/control_v11e_sd15_ip2p')", "performance": {"dataset": "Stable Diffusion v1-5", "accuracy": "Not provided"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on instruct pix2pix images.", "model_name": "lllyasviel/control_v11e_sd15_ip2p"}
{"domain": "Natural Language Processing Question Answering", "framework": "Transformers", "functionality": "Question Answering", "api_name": "monologg/koelectra-small-v2-distilled-korquad-384", "api_call": "pipeline('question-answering', model='monologg/koelectra-small-v2-distilled-korquad-384')", "performance": {"dataset": "KorQuAD", "accuracy": "Not provided"}, "description": "A Korean Question Answering model based on Electra and trained on the KorQuAD dataset.", "model_name": "monologg/koelectra-small-v2-distilled-korquad-384"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image Generation", "api_name": "runwayml/stable-diffusion-v1-5", "api_call": "StableDiffusionPipeline.from_pretrained('runwayml/stable-diffusion-v1-5', torch_dtype=torch.float16)(prompt).images[0]", "performance": {"dataset": "COCO2017", "accuracy": "Not optimized for FID scores"}, "description": "Stable Diffusion is a latent text-to-image diffusion model capable of generating photo-realistic images given any text input.", "model_name": "runwayml/stable-diffusion-v1-5"}
{"domain": "Natural Language Processing Translation", "framework": "Transformers", "functionality": "Text-to-Text Generation", "api_name": "optimum/t5-small", "api_call": "ORTModelForSeq2SeqLM.from_pretrained('optimum/t5-small')", "performance": {"dataset": "c4", "accuracy": "N/A"}, "description": "T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised and supervised tasks and for which each task is converted into a text-to-text format. It can be used for translation, text-to-text generation, and summarization.", "model_name": "optimum/t5-small"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "superb/hubert-base-superb-er", "api_call": "pipeline('audio-classification', model='superb/hubert-base-superb-er')", "performance": {"dataset": "IEMOCAP", "accuracy": {"session1": 0.6492, "session2": 0.6359}}, "description": "Hubert-Base for Emotion Recognition is a ported version of S3PRL's Hubert for the SUPERB Emotion Recognition task. The base model is hubert-base-ls960, which is pretrained on 16kHz sampled speech audio. The model is used for predicting an emotion class for each utterance, and it is trained and evaluated on the IEMOCAP dataset.", "model_name": "superb/hubert-base-superb-er"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "vit_tiny_patch16_224.augreg_in21k_ft_in1k", "api_call": "timm.create_model('hf_hub:timm/vit_tiny_patch16_224.augreg_in21k_ft_in1k', pretrained=True)", "performance": {"dataset": "", "accuracy": ""}, "description": "A Vision Transformer model for image classification, pretrained on ImageNet-21k and fine-tuned on ImageNet-1k with augmentations and regularization.", "model_name": "timm/vit_tiny_patch16_224.augreg_in21k_ft_in1k"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "ivelin/donut-refexp-combined-v1", "api_call": "pipeline('visual-question-answering', model='ivelin/donut-refexp-combined-v1')", "performance": {"dataset": "ivelin/donut-refexp-combined-v1", "accuracy": "N/A"}, "description": "A visual question answering model that takes an image and a question as input and provides an answer based on the visual content of the image and the context of the question.", "model_name": "ivelin/donut-refexp-combined-v1"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "MCG-NJU/videomae-base-finetuned-kinetics", "api_call": "VideoMAEForVideoClassification.from_pretrained('MCG-NJU/videomae-base-finetuned-kinetics')", "performance": {"dataset": "Kinetics-400", "accuracy": {"top-1": 80.9, "top-5": 94.7}}, "description": "VideoMAE model pre-trained for 1600 epochs in a self-supervised way and fine-tuned in a supervised way on Kinetics-400. It was introduced in the paper VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training by Tong et al. and first released in this repository.", "model_name": "MCG-NJU/videomae-base-finetuned-kinetics"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Document Question Answering", "api_name": "layoutlmv2-base-uncased_finetuned_docvqa", "api_call": "AutoModelForDocumentQuestionAnswering.from_pretrained('tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa')", "performance": {"dataset": "None", "accuracy": {"Loss": 4.3167}}, "description": "This model is a fine-tuned version of microsoft/layoutlmv2-base-uncased on the None dataset.", "model_name": "tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Conversational", "api_name": "allenai/cosmo-xl", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('allenai/cosmo-xl')", "performance": {"dataset": {"allenai/soda": "", "allenai/prosocial-dialog": ""}, "accuracy": ""}, "description": "COSMO is a conversation agent with greater generalizability on both in- and out-of-domain chitchat datasets (e.g., DailyDialog, BlendedSkillTalk). It is trained on two datasets: SODA and ProsocialDialog. COSMO is especially aiming to model natural human conversations. It can accept situation descriptions as well as instructions on what role it should play in the situation.", "model_name": "allenai/cosmo-xl"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Speech Recognition", "api_name": "jonatasgrosman/wav2vec2-large-xlsr-53-japanese", "api_call": "SpeechRecognitionModel('jonatasgrosman/wav2vec2-large-xlsr-53-japanese')", "performance": {"dataset": "common_voice", "accuracy": {"WER": 81.8, "CER": 20.16}}, "description": "Fine-tuned XLSR-53 large model for speech recognition in Japanese. Trained on Common Voice 6.1, CSS10, and JSUT datasets. Make sure your speech input is sampled at 16kHz.", "model_name": "jonatasgrosman/wav2vec2-large-xlsr-53-japanese"}
{"domain": "Natural Language Processing Token Classification", "framework": "Transformers", "functionality": "Token Classification", "api_name": "xlm-roberta-large-finetuned-conll03-english", "api_call": "AutoModelForTokenClassification.from_pretrained('xlm-roberta-large-finetuned-conll03-english')", "performance": {"dataset": "conll2003", "accuracy": "More information needed"}, "description": "The XLM-RoBERTa model is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data. This model is XLM-RoBERTa-large fine-tuned with the conll2003 dataset in English. It can be used for token classification tasks such as Named Entity Recognition (NER) and Part-of-Speech (PoS) tagging.", "model_name": "xlm-roberta-large-finetuned-conll03-english"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "Summarization", "api_name": "google/pegasus-large", "api_call": "pipeline('summarization', model='google/pegasus-large')", "performance": {"dataset": [{"name": "xsum", "accuracy": "47.60/24.83/39.64"}, {"name": "cnn_dailymail", "accuracy": "44.16/21.56/41.30"}, {"name": "newsroom", "accuracy": "45.98/34.20/42.18"}, {"name": "multi_news", "accuracy": "47.65/18.75/24.95"}, {"name": "gigaword", "accuracy": "39.65/20.47/36.76"}, {"name": "wikihow", "accuracy": "46.39/22.12/38.41"}, {"name": "reddit_tifu", "accuracy": "27.99/9.81/22.94"}, {"name": "big_patent", "accuracy": "52.29/33.08/41.66"}, {"name": "arxiv", "accuracy": "44.21/16.95/25.67"}, {"name": "pubmed", "accuracy": "45.97/20.15/28.25"}, {"name": "aeslc", "accuracy": "37.68/21.25/36.51"}, {"name": "billsum", "accuracy": "59.67/41.58/47.59"}]}, "description": "google/pegasus-large is a pre-trained model for abstractive text summarization based on the PEGASUS architecture. It is trained on a mixture of C4 and HugeNews datasets and uses a sentencepiece tokenizer that can encode newline characters. The model has been fine-tuned for various summarization tasks and achieves state-of-the-art performance on multiple benchmarks.", "model_name": "google/pegasus-large"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Conversational", "api_name": "Pi3141/DialoGPT-medium-elon-3", "api_call": "pipeline('text-generation', model='Pi3141/DialoGPT-medium-elon-3')", "performance": {"dataset": "Twitter tweets by Elon Musk", "accuracy": "N/A"}, "description": "DialoGPT model that talks like Elon Musk, trained on Twitter tweets by Elon Musk. This model will spew meaningless shit about 40% of the time. Trained on 8 epochs. But with a larger dataset this time. The AI can now use more emojis, I think.", "model_name": "Pi3141/DialoGPT-medium-elon-3"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "timm/mobilenetv3_large_100.ra_in1k", "api_call": "timm.create_model('mobilenetv3_large_100.ra_in1k', pretrained=True)", "performance": {"dataset": "imagenet-1k", "accuracy": "Not provided"}, "description": "A MobileNet-v3 image classification model. Trained on ImageNet-1k in timm using recipe template described below. Recipe details: RandAugment RA recipe. Inspired by and evolved from EfficientNet RandAugment recipes. Published as B recipe in ResNet Strikes Back. RMSProp (TF 1.0 behaviour) optimizer, EMA weight averaging. Step (exponential decay w/ staircase) LR schedule with warmup.", "model_name": "timm/mobilenetv3_large_100.ra_in1k"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-nyu-finetuned-diode-230131-041708", "api_call": "AutoModelForImageClassification.from_pretrained('sayakpaul/glpn-nyu-finetuned-diode-230131-041708')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.4425, "Mae": 0.427, "Rmse": 0.6196, "Abs_Rel": 0.45430000000000004, "Log_Mae": 0.17320000000000002, "Log_Rmse": 0.2288, "Delta1": 0.37870000000000004, "Delta2": 0.6298, "Delta3": 0.8083}}, "description": "This model is a fine-tuned version of vinvino02/glpn-nyu on the diode-subset dataset. It is used for depth estimation in computer vision tasks.", "model_name": "sayakpaul/glpn-nyu-finetuned-diode-230131-041708"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "keremberke/yolov8m-pothole-segmentation", "api_call": "YOLO('keremberke/yolov8m-pothole-segmentation')", "performance": {"dataset": "pothole-segmentation", "accuracy": {"mAP@0.5(box)": 0.858, "mAP@0.5(mask)": 0.895}}, "description": "A YOLOv8 model for pothole segmentation trained on keremberke/pothole-segmentation dataset. It can detect potholes in images and provide segmentation masks for the detected potholes.", "model_name": "keremberke/yolov8m-pothole-segmentation"}
{"domain": "Multimodal Document Question Answer", "framework": "Transformers", "functionality": "Document Question Answering", "api_name": "tiny-random-LayoutLMv3ForQuestionAnswering", "api_call": "LayoutLMv3ForQuestionAnswering.from_pretrained('hf-tiny-model-private/tiny-random-LayoutLMv3ForQuestionAnswering')", "performance": {"dataset": "", "accuracy": ""}, "description": "A tiny random LayoutLMv3 model for document question answering. Can be used with the Hugging Face Inference API.", "model_name": "hf-tiny-model-private/tiny-random-LayoutLMv3ForQuestionAnswering"}
{"domain": "Natural Language Processing Text Classification", "framework": "Transformers", "functionality": "Text Classification", "api_name": "joeddav/distilbert-base-uncased-go-emotions-student", "api_call": "pipeline('text-classification', model='joeddav/distilbert-base-uncased-go-emotions-student')", "performance": {"dataset": "go_emotions"}, "description": "This model is distilled from the zero-shot classification pipeline on the unlabeled GoEmotions dataset. It is primarily intended as a demo of how an expensive NLI-based zero-shot model can be distilled to a more efficient student, allowing a classifier to be trained with only unlabeled data.", "model_name": "joeddav/distilbert-base-uncased-go-emotions-student"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "tiny-wav2vec2-stable-ln", "api_call": "pipeline('automatic-speech-recognition', model='ybelkada/tiny-wav2vec2-stable-ln')", "performance": {"dataset": null, "accuracy": null}, "description": "A tiny wav2vec2 model for Automatic Speech Recognition", "model_name": "ybelkada/tiny-wav2vec2-stable-ln"}
{"domain": "Reinforcement Learning", "framework": "Stable-Baselines3", "functionality": "deep-reinforcement-learning", "api_name": "ppo-seals-CartPole-v0", "api_call": "load_from_hub(repo_id='HumanCompatibleAI/ppo-seals-CartPole-v0',filename='{MODEL FILENAME}.zip',)", "performance": {"dataset": "seals/CartPole-v0", "accuracy": "500.00 +/- 0.00"}, "description": "This is a trained model of a PPO agent playing seals/CartPole-v0 using the stable-baselines3 library and the RL Zoo. The RL Zoo is a training framework for Stable Baselines3 reinforcement learning agents, with hyperparameter optimization and pre-trained agents included.", "model_name": "HumanCompatibleAI/ppo-seals-CartPole-v0"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "sultan/BioM-ELECTRA-Large-SQuAD2", "api_call": "pipeline('question-answering', model='sultan/BioM-ELECTRA-Large-SQuAD2')", "performance": {"dataset": "SQuAD2.0 Dev", "accuracy": {"exact": 84.3342036554, "f1": 87.4935424189}}, "description": "BioM-ELECTRA-Large-SQuAD2 is a fine-tuned version of BioM-ELECTRA-Large, which was pre-trained on PubMed Abstracts, on the SQuAD2.0 dataset. Fine-tuning the biomedical language model on the SQuAD dataset helps improve the score on the BioASQ challenge. This model is suitable for working with BioASQ or biomedical QA tasks.", "model_name": "sultan/BioM-ELECTRA-Large-SQuAD2"}
{"domain": "Natural Language Processing Translation", "framework": "PyTorch Transformers", "functionality": "text2text-generation", "api_name": "facebook/nllb-200-distilled-600M", "api_call": "pipeline('translation_xx_to_yy', model='facebook/nllb-200-distilled-600M')", "performance": {"dataset": "Flores-200", "accuracy": "BLEU, spBLEU, chrF++"}, "description": "NLLB-200 is a machine translation model primarily intended for research in machine translation, especially for low-resource languages. It allows for single sentence translation among 200 languages. The model was trained on general domain text data and is not intended to be used with domain specific texts, such as medical domain or legal domain. The model is not intended to be used for document translation.", "model_name": "facebook/nllb-200-distilled-600M"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "PyTorch Transformers", "functionality": "Table Question Answering", "api_name": "table-question-answering-tapas", "api_call": "pipeline('table-question-answering', model='Meena/table-question-answering-tapas')", "performance": {"dataset": [{"name": "SQA (Sequential Question Answering by Microsoft)", "accuracy": null}, {"name": "WTQ (Wiki Table Questions by Stanford University)", "accuracy": null}, {"name": "WikiSQL (by Salesforce)", "accuracy": null}]}, "description": "TAPAS, the model learns an inner representation of the English language used in tables and associated texts, which can then be used to extract features useful for downstream tasks such as answering questions about a table, or determining whether a sentence is entailed or refuted by the contents of a table. It is a BERT-based model specifically designed (and pre-trained) for answering questions about tabular data. TAPAS uses relative position embeddings and has 7 token types that encode tabular structure. It is pre-trained on the masked language modeling (MLM) objective on a large dataset comprising millions of tables from English Wikipedia and corresponding texts.", "model_name": "Meena/table-question-answering-tapas"}
{"domain": "Natural Language Processing Text Classification", "framework": "Hugging Face", "functionality": "Sentiment Analysis", "api_name": "cardiffnlp/twitter-xlm-roberta-base-sentiment", "api_call": "pipeline('sentiment-analysis', model='cardiffnlp/twitter-xlm-roberta-base-sentiment')", "performance": {"dataset": "Twitter", "accuracy": "Not provided"}, "description": "This is a multilingual XLM-roBERTa-base model trained on ~198M tweets and finetuned for sentiment analysis. The sentiment fine-tuning was done on 8 languages (Ar, En, Fr, De, Hi, It, Sp, Pt) but it can be used for more languages (see paper for details).", "model_name": "cardiffnlp/twitter-xlm-roberta-base-sentiment"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "glpn-kitti-finetuned-diode-221214-123047", "api_call": "pipeline('depth-estimation', model='sayakpaul/glpn-kitti-finetuned-diode-221214-123047')", "performance": {"dataset": "diode-subset", "accuracy": {"Loss": 0.3497, "Mae": 0.2847, "Rmse": 0.3977, "Abs Rel": 0.3477, "Log Mae": 0.1203, "Log Rmse": 0.1726, "Delta1": 0.5217, "Delta2": 0.8246, "Delta3": 0.9436}}, "description": "This model is a fine-tuned version of vinvino02/glpn-kitti on the diode-subset dataset. It is used for depth estimation in computer vision applications.", "model_name": "sayakpaul/glpn-kitti-finetuned-diode-221214-123047"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Fill-Mask", "api_name": "nlpaueb/legal-bert-small-uncased", "api_call": "AutoModel.from_pretrained('nlpaueb/legal-bert-small-uncased')", "performance": {"dataset": "Legal Corpora", "accuracy": "Comparable to larger models"}, "description": "LEGAL-BERT is a family of BERT models for the legal domain, intended to assist legal NLP research, computational law, and legal technology applications. This is the light-weight version of BERT-BASE (33% the size of BERT-BASE) pre-trained from scratch on legal data, which achieves comparable performance to larger models, while being much more efficient (approximately 4 times faster) with a smaller environmental footprint.", "model_name": "nlpaueb/legal-bert-small-uncased"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "sileod/deberta-v3-base-tasksource-nli", "api_call": "AutoModelForSequenceClassification.from_pretrained('sileod/deberta-v3-base-tasksource-nli')", "performance": {"dataset": ["glue", "piqa", "sciq"], "accuracy": "70% on WNLI"}, "description": "DeBERTa-v3-base fine-tuned with multi-task learning on 520 tasks of the tasksource collection. This checkpoint has strong zero-shot validation performance on many tasks, and can be used for zero-shot NLI pipeline (similar to bart-mnli but better).", "model_name": "sileod/deberta-v3-base-tasksource-nli"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "prompthero/openjourney-v4", "api_call": "pipeline('text-to-image', model='prompthero/openjourney-v4')", "performance": {"dataset": "Midjourney v4 images", "accuracy": "Not provided"}, "description": "Openjourney v4 is trained on +124k Midjourney v4 images by PromptHero. It is used for generating images based on text inputs.", "model_name": "prompthero/openjourney-v4"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Transformers", "functionality": "Zero-Shot Classification", "api_name": "valhalla/distilbart-mnli-12-3", "api_call": "pipeline('zero-shot-classification', model='valhalla/distilbart-mnli-12-3')", "performance": {"dataset": [{"name": "matched acc", "accuracy": 88.1}, {"name": "mismatched acc", "accuracy": 88.19}]}, "description": "distilbart-mnli is the distilled version of bart-large-mnli created using the No Teacher Distillation technique proposed for BART summarisation by Huggingface. It is a simple and effective technique with very little performance drop.", "model_name": "valhalla/distilbart-mnli-12-3"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Classification", "api_name": "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", "api_call": "DeBERTaModel.from_pretrained('MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli')", "performance": {"dataset": {"mnli-m": 0.903, "mnli-mm": 0.903, "fever-nli": 0.777, "anli-all": 0.579, "anli-r3": 0.495}, "accuracy": {"mnli-m": 0.903, "mnli-mm": 0.903, "fever-nli": 0.777, "anli-all": 0.579, "anli-r3": 0.495}}, "description": "This model was trained on the MultiNLI, Fever-NLI and Adversarial-NLI (ANLI) datasets, which comprise 763 913 NLI hypothesis-premise pairs. This base model outperforms almost all large models on the ANLI benchmark. The base model is DeBERTa-v3-base from Microsoft. The v3 variant of DeBERTa substantially outperforms previous versions of the model by including a different pre-training objective, see annex 11 of the original DeBERTa paper.", "model_name": "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/mask2former-swin-small-coco-instance", "api_call": "Mask2FormerForUniversalSegmentation.from_pretrained('facebook/mask2former-swin-small-coco-instance')", "performance": {"dataset": "COCO", "accuracy": "Not provided"}, "description": "Mask2Former model trained on COCO instance segmentation (small-sized version, Swin backbone). It was introduced in the paper Masked-attention Mask Transformer for Universal Image Segmentation and first released in this repository. Mask2Former addresses instance, semantic and panoptic segmentation with the same paradigm: by predicting a set of masks and corresponding labels. Hence, all 3 tasks are treated as if they were instance segmentation. Mask2Former outperforms the previous SOTA, MaskFormer both in terms of performance and efficiency.", "model_name": "facebook/mask2former-swin-small-coco-instance"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Emotion Recognition", "api_name": "superb/hubert-large-superb-er", "api_call": "pipeline('audio-classification', model='superb/hubert-large-superb-er')", "performance": {"dataset": "IEMOCAP", "accuracy": 0.6762}, "description": "This is a ported version of S3PRL's Hubert for the SUPERB Emotion Recognition task. The base model is hubert-large-ll60k, which is pretrained on 16kHz sampled speech audio. When using the model make sure that your speech input is also sampled at 16kHz. For more information refer to SUPERB: Speech processing Universal PERformance Benchmark.", "model_name": "superb/hubert-large-superb-er"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Code Understanding and Generation", "api_name": "Salesforce/codet5-base", "api_call": "T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')", "performance": {"dataset": "code_search_net", "accuracy": "Refer to the paper for evaluation results on several downstream benchmarks"}, "description": "CodeT5 is a unified pre-trained encoder-decoder Transformer model that better leverages the code semantics conveyed from the developer-assigned identifiers. It supports both code understanding and generation tasks and allows for multi-task learning. The model can be used for tasks such as code summarization, code generation, code translation, code refinement, code defect detection, and code clone detection.", "model_name": "Salesforce/codet5-base"}
{"domain": "Computer Vision Image-to-Image", "framework": "Keras", "functionality": "Image Deblurring", "api_name": "google/maxim-s3-deblurring-gopro", "api_call": "from_pretrained_keras('google/maxim-s3-deblurring-gopro')", "performance": {"dataset": "GoPro", "accuracy": {"PSNR": 32.86, "SSIM": 0.961}}, "description": "MAXIM model pre-trained for image deblurring. It was introduced in the paper MAXIM: Multi-Axis MLP for Image Processing by Zhengzhong Tu, Hossein Talebi, Han Zhang, Feng Yang, Peyman Milanfar, Alan Bovik, Yinxiao Li and first released in this repository.", "model_name": "google/maxim-s3-deblurring-gopro"}
{"domain": "Natural Language Processing Sentence Similarity", "framework": "Hugging Face Transformers", "functionality": "Sentence Transformers", "api_name": "sentence-transformers/all-MiniLM-L6-v2", "api_call": "SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')", "performance": {"dataset": "1B sentence pairs dataset", "accuracy": "https://seb.sbert.net"}, "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_name": "sentence-transformers/all-MiniLM-L6-v2"}
{"domain": "Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "mazkooleg/0-9up-unispeech-sat-base-ft", "api_call": "pipeline('audio-classification', model='mazkooleg/0-9up-unispeech-sat-base-ft')", "performance": {"dataset": "mazkooleg/0-9up_google_speech_commands_augmented_raw", "accuracy": 0.9979}, "description": "This model is a fine-tuned version of microsoft/unispeech-sat-base on the None dataset. It achieves the following results on the evaluation set: Loss: 0.0123, Accuracy: 0.9979.", "model_name": "mazkooleg/0-9up-unispeech-sat-base-ft"}
{"domain": "Natural Language Processing Fill-Mask", "framework": "Transformers", "functionality": "Fill-Mask", "api_name": "cl-tohoku/bert-base-japanese", "api_call": "AutoModelForMaskedLM.from_pretrained('cl-tohoku/bert-base-japanese')", "performance": {"dataset": "wikipedia", "accuracy": "N/A"}, "description": "This is a BERT model pretrained on texts in the Japanese language. This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization.", "model_name": "cl-tohoku/bert-base-japanese"}
{"domain": "Natural Language Processing Text2Text Generation", "framework": "Hugging Face Transformers", "functionality": "Language model", "api_name": "google/flan-t5-base", "api_call": "T5ForConditionalGeneration.from_pretrained('google/flan-t5-base')", "performance": {"dataset": [{"name": "MMLU", "accuracy": "75.2%"}]}, "description": "FLAN-T5 is a language model fine-tuned on more than 1000 additional tasks covering multiple languages. It achieves state-of-the-art performance on several benchmarks and is designed for research on zero-shot NLP tasks and in-context few-shot learning NLP tasks, such as reasoning, and question answering.", "model_name": "google/flan-t5-base"}
{"domain": "Computer Vision Object Detection", "framework": "Hugging Face Transformers", "functionality": "Object Detection", "api_name": "keremberke/yolov8m-valorant-detection", "api_call": "YOLO('keremberke/yolov8m-valorant-detection')", "performance": {"dataset": "valorant-object-detection", "accuracy": 0.965}, "description": "A YOLOv8 model for object detection in Valorant game, trained on a custom dataset. It detects dropped spike, enemy, planted spike, and teammate objects.", "model_name": "keremberke/yolov8m-valorant-detection"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face Transformers", "functionality": "Zero-Shot Image Classification", "api_name": "OFA-Sys/chinese-clip-vit-base-patch16", "api_call": "ChineseCLIPModel.from_pretrained('OFA-Sys/chinese-clip-vit-base-patch16')", "performance": {"dataset": {"MUGE Text-to-Image Retrieval": {"accuracy": {"Zero-shot R@1": 63.0, "Zero-shot R@5": 84.1, "Zero-shot R@10": 89.2, "Finetune R@1": 68.9, "Finetune R@5": 88.7, "Finetune R@10": 93.1}}, "Flickr30K-CN Retrieval": {"accuracy": {"Zero-shot Text-to-Image R@1": 71.2, "Zero-shot Text-to-Image R@5": 91.4, "Zero-shot Text-to-Image R@10": 95.5, "Finetune Text-to-Image R@1": 83.8, "Finetune Text-to-Image R@5": 96.9, "Finetune Text-to-Image R@10": 98.6, "Zero-shot Image-to-Text R@1": 81.6, "Zero-shot Image-to-Text R@5": 97.5, "Zero-shot Image-to-Text R@10": 98.8, "Finetune Image-to-Text R@1": 95.3, "Finetune Image-to-Text R@5": 99.7, "Finetune Image-to-Text R@10": 100.0}}, "COCO-CN Retrieval": {"accuracy": {"Zero-shot Text-to-Image R@1": 69.2, "Zero-shot Text-to-Image R@5": 89.9, "Zero-shot Text-to-Image R@10": 96.1, "Finetune Text-to-Image R@1": 81.5, "Finetune Text-to-Image R@5": 96.9, "Finetune Text-to-Image R@10": 99.1, "Zero-shot Image-to-Text R@1": 63.0, "Zero-shot Image-to-Text R@5": 86.6, "Zero-shot Image-to-Text R@10": 92.9, "Finetune Image-to-Text R@1": 83.5, "Finetune Image-to-Text R@5": 97.3, "Finetune Image-to-Text R@10": 99.2}}, "Zero-shot Image Classification": {"accuracy": {"CIFAR10": 96.0, "CIFAR100": 79.7, "DTD": 51.2, "EuroSAT": 52.0, "FER": 55.1, "FGVC": 26.2, "KITTI": 49.9, "MNIST": 79.4, "PC": 63.5, "VOC": 84.9}}}}, "description": "Chinese CLIP is a simple implementation of CLIP on a large-scale dataset of around 200 million Chinese image-text pairs. It uses ViT-B/16 as the image encoder and RoBERTa-wwm-base as the text encoder.", "model_name": "OFA-Sys/chinese-clip-vit-base-patch16"}
{"domain": "Natural Language Processing Text Generation", "framework": "Transformers", "functionality": "Text Generation", "api_name": "facebook/opt-125m", "api_call": "pipeline('text-generation', model='facebook/opt-125m')", "performance": {"dataset": "Various", "accuracy": "Roughly matches GPT-3 performance"}, "description": "OPT (Open Pre-trained Transformers) is a suite of decoder-only pre-trained transformers ranging from 125M to 175B parameters, designed to enable reproducible and responsible research at scale. It was predominantly pretrained with English text, but a small amount of non-English data is present within the training corpus via CommonCrawl. The model was pretrained using a causal language modeling (CLM) objective. OPT can be used for prompting for evaluation of downstream tasks as well as text generation.", "model_name": "facebook/opt-125m"}
{"domain": "Computer Vision Video Classification", "framework": "Hugging Face Transformers", "functionality": "Video Classification", "api_name": "videomae-base-finetuned-ucf101-subset", "api_call": "AutoModelForSequenceClassification.from_pretrained('zahrav/videomae-base-finetuned-ucf101-subset')", "performance": {"dataset": "unknown", "accuracy": 0.8968}, "description": "This model is a fine-tuned version of MCG-NJU/videomae-base on an unknown dataset. It is used for video classification tasks.", "model_name": "zahrav/videomae-base-finetuned-ucf101-subset"}
{"domain": "Natural Language Processing Summarization", "framework": "Hugging Face Transformers", "functionality": "text2text-generation", "api_name": "tuner007/pegasus_summarizer", "api_call": "PegasusForConditionalGeneration.from_pretrained('tuner007/pegasus_summarizer')", "performance": {"dataset": "cnn_dailymail", "accuracy": {"ROUGE-1": 36.604, "ROUGE-2": 14.64, "ROUGE-L": 23.884, "ROUGE-LSUM": 32.902, "loss": 2.576, "gen_len": 76.398}}, "description": "PEGASUS fine-tuned for summarization", "model_name": "tuner007/pegasus_summarizer"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-base-finetuned-wtq", "api_call": "TapasForQuestionAnswering.from_pretrained('google/tapas-base-finetuned-wtq')", "performance": {"dataset": "wikitablequestions", "accuracy": 0.46380000000000005}, "description": "TAPAS base model fine-tuned on WikiTable Questions (WTQ). This model is pretrained on a large corpus of English data from Wikipedia in a self-supervised fashion, and then fine-tuned on SQA, WikiSQL, and finally WTQ. It can be used for answering questions related to a table.", "model_name": "google/tapas-base-finetuned-wtq"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "Helsinki-NLP/opus-mt-en-it", "api_call": "pipeline('translation_en_to_it', model='Helsinki-NLP/opus-mt-en-it')", "performance": {"dataset": "opus", "accuracy": {"newssyscomb2009.en.it": {"BLEU": 30.9, "chr-F": 0.606}, "newstest2009.en.it": {"BLEU": 31.9, "chr-F": 0.604}, "Tatoeba.en.it": {"BLEU": 48.2, "chr-F": 0.6950000000000001}}}, "description": "A Transformer-based English to Italian translation model trained on the OPUS dataset. This model can be used for translation tasks using the Hugging Face Transformers library.", "model_name": "Helsinki-NLP/opus-mt-en-it"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Speech Emotion Recognition", "api_name": "harshit345/xlsr-wav2vec-speech-emotion-recognition", "api_call": "Wav2Vec2ForSpeechClassification.from_pretrained('harshit345/xlsr-wav2vec-speech-emotion-recognition')", "performance": {"dataset": "JTES v1.1", "accuracy": {"anger": 0.8200000000000001, "disgust": 0.85, "fear": 0.78, "happiness": 0.84, "sadness": 0.86, "overall": 0.806}}, "description": "This model is trained on the JTES v1.1 dataset for speech emotion recognition. It uses the Wav2Vec2 architecture for audio classification and can recognize emotions like anger, disgust, fear, happiness, and sadness.", "model_name": "harshit345/xlsr-wav2vec-speech-emotion-recognition"}
{"domain": "Natural Language Processing Question Answering", "framework": "Hugging Face Transformers", "functionality": "Question Answering", "api_name": "valhalla/longformer-base-4096-finetuned-squadv1", "api_call": "AutoModelForQuestionAnswering.from_pretrained('valhalla/longformer-base-4096-finetuned-squadv1')", "performance": {"dataset": "squad_v1", "accuracy": {"Exact Match": 85.1466, "F1": 91.5415}}, "description": "This is longformer-base-4096 model fine-tuned on SQuAD v1 dataset for question answering task. Longformer model created by Iz Beltagy, Matthew E. Peters, Arman Coha from AllenAI. As the paper explains it, Longformer is a BERT-like model for long documents. The pre-trained model can handle sequences with up to 4096 tokens.", "model_name": "valhalla/longformer-base-4096-finetuned-squadv1"}
{"domain": "Natural Language Processing Feature Extraction", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "YituTech/conv-bert-base", "api_call": "AutoModel.from_pretrained('YituTech/conv-bert-base')", "performance": {"dataset": "N/A", "accuracy": "N/A"}, "description": "A pre-trained ConvBERT model for feature extraction provided by YituTech, based on the Hugging Face Transformers library.", "model_name": "YituTech/conv-bert-base"}
{"domain": "Tabular Tabular Classification", "framework": "Scikit-learn", "functionality": "Classification", "api_name": "imodels/figs-compas-recidivism", "api_call": "joblib.load(cached_download(hf_hub_url('imodels/figs-compas-recidivism', 'sklearn_model.joblib')))", "performance": {"dataset": "imodels/compas-recidivism", "accuracy": 0.6759165485}, "description": "A tabular classification model for predicting recidivism using the COMPAS dataset. The model is an imodels.FIGSClassifier trained with Scikit-learn and can be used with the Hugging Face Inference API.", "model_name": "imodels/figs-compas-recidivism"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Document Question Answering", "api_name": "layoutlmv2-base-uncased-finetuned-docvqa", "api_call": "AutoModelForDocumentQuestionAnswering.from_pretrained('tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa')", "performance": {"dataset": "unknown", "accuracy": {"Loss": 1.194}}, "description": "This model is a fine-tuned version of microsoft/layoutlmv2-base-uncased on an unknown dataset.", "model_name": "tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/maskformer-swin-base-coco", "api_call": "MaskFormerForInstanceSegmentation.from_pretrained('facebook/maskformer-swin-base-coco')", "performance": {"dataset": "COCO", "accuracy": "Not provided"}, "description": "MaskFormer model trained on COCO panoptic segmentation (base-sized version, Swin backbone). It was introduced in the paper Per-Pixel Classification is Not All You Need for Semantic Segmentation and first released in this repository.", "model_name": "facebook/maskformer-swin-base-coco"}
{"domain": "Multimodal Document Question Answer", "framework": "Hugging Face Transformers", "functionality": "Document Question Answering", "api_name": "layoutlmv2-base-uncased_finetuned_docvqa", "api_call": "AutoModel.from_pretrained('microsoft/layoutlmv2-base-uncased')", "performance": {"dataset": "None", "accuracy": {"Loss": 4.843}}, "description": "This model is a fine-tuned version of microsoft/layoutlmv2-base-uncased on the None dataset.", "model_name": "microsoft/layoutlmv2-base-uncased"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "superb/wav2vec2-base-superb-sid", "api_call": "pipeline('audio-classification', model='superb/wav2vec2-base-superb-sid')", "performance": {"dataset": "VoxCeleb1", "accuracy": 0.7518}, "description": "This is a ported version of S3PRL's Wav2Vec2 for the SUPERB Speaker Identification task. The base model is wav2vec2-base, which is pretrained on 16kHz sampled speech audio. When using the model make sure that your speech input is also sampled at 16Khz. For more information refer to SUPERB: Speech processing Universal PERformance Benchmark.", "model_name": "superb/wav2vec2-base-superb-sid"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Text Generation", "api_name": "bigscience/bloom-7b1", "api_call": "pipeline('text-generation', model='bigscience/bloom-7b1') should be changed to TextGenerationPipeline(model=Bloom7b1Model.from_pretrained('bigscience/bloom-7b1')).", "performance": {"dataset": "Training Data", "accuracy": {"Training Loss": 2.3, "Validation Loss": 2.9, "Perplexity": 16}}, "description": "BigScience Large Open-science Open-access Multilingual Language Model (BLOOM) is a transformer-based language model designed for text generation and as a pretrained base model for fine-tuning on specific tasks. It supports 48 languages and has 7,069,016,064 parameters. The model is trained on a diverse corpus containing 45 natural languages, 12 programming languages, and 1.5TB of pre-processed text.", "model_name": "bigscience/bloom-7b1"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Neural machine translation", "api_name": "opus-mt-tc-big-en-pt", "api_call": "MarianMTModel.from_pretrained('pytorch-models/opus-mt-tc-big-en-pt')", "performance": {"dataset": [{"name": "flores101-devtest", "accuracy": 50.4}, {"name": "tatoeba-test-v2021-08-07", "accuracy": 49.6}]}, "description": "Neural machine translation model for translating from English (en) to Portuguese (pt). This model is part of the OPUS-MT project, an effort to make neural machine translation models widely available and accessible for many languages in the world.", "model_name": "pytorch-models/opus-mt-tc-big-en-pt"}
{"domain": "Reinforcement Learning", "framework": "Stable-Baselines3", "functionality": "CartPole-v1", "api_name": "sb3/ppo-CartPole-v1", "api_call": "load_from_hub(repo_id='sb3/ppo-CartPole-v1',filename='{MODEL FILENAME}.zip',)", "performance": {"dataset": "CartPole-v1", "accuracy": "500.00 +/- 0.00"}, "description": "This is a trained model of a PPO agent playing CartPole-v1 using the stable-baselines3 library and the RL Zoo. The RL Zoo is a training framework for Stable Baselines3 reinforcement learning agents, with hyperparameter optimization and pre-trained agents included.", "model_name": "sb3/ppo-CartPole-v1"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "microsoft/resnet-50", "api_call": "ResNetForImageClassification.from_pretrained('microsoft/resnet-50')", "performance": {"dataset": "imagenet-1k", "accuracy": "~0.5% top1"}, "description": "ResNet-50 v1.5 is a pre-trained convolutional neural network for image classification on the ImageNet-1k dataset at resolution 224x224. It was introduced in the paper Deep Residual Learning for Image Recognition by He et al. ResNet (Residual Network) democratized the concepts of residual learning and skip connections, enabling the training of much deeper models. ResNet-50 v1.5 differs from the original model in the bottleneck blocks which require downsampling, v1 has stride = 2 in the first 1x1 convolution, whereas v1.5 has stride = 2 in the 3x3 convolution. This difference makes ResNet50 v1.5 slightly more accurate but comes with a small performance drawback.", "model_name": "microsoft/resnet-50"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "text2text-generation", "api_name": "neulab/omnitab-large-1024shot-finetuned-wtq-1024shot", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('neulab/omnitab-large-1024shot-finetuned-wtq-1024shot')", "performance": {"dataset": "wikitablequestions", "accuracy": "Not provided"}, "description": "OmniTab is a table-based QA model proposed in OmniTab: Pretraining with Natural and Synthetic Data for Few-shot Table-based Question Answering. The original Github repository is https://github.com/jzbjyb/OmniTab. neulab/omnitab-large-1024shot-finetuned-wtq-1024shot (based on BART architecture) is initialized with neulab/omnitab-large-1024shot and fine-tuned on WikiTableQuestions in the 1024-shot setting.", "model_name": "neulab/omnitab-large-1024shot-finetuned-wtq-1024shot"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face", "functionality": "Dialogue Response Generation", "api_name": "microsoft/DialoGPT-small", "api_call": "AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-small')", "performance": {"dataset": "Reddit discussion thread", "accuracy": "Comparable to human response quality under a single-turn conversation Turing test"}, "description": "DialoGPT is a state-of-the-art large-scale pretrained dialogue response generation model for multiturn conversations. The model is trained on 147M multi-turn dialogue from Reddit discussion thread.", "model_name": "microsoft/DialoGPT-small"}
{"domain": "Audio Text-to-Speech", "framework": "Fairseq", "functionality": "Text-to-Speech", "api_name": "fastspeech2-en-male1", "api_call": "load_model_ensemble_and_task_from_hf_hub('facebook/fastspeech2-en-200_speaker-cv4',arg_overrides={'vocoder': 'hifigan', 'fp16': False})", "performance": {"dataset": "common_voice", "accuracy": null}, "description": "FastSpeech 2 text-to-speech model from fairseq S^2. English, 200 male/female voices, trained on Common Voice v4.", "model_name": "facebook/fastspeech2-en-200_speaker-cv4"}
{"domain": "Computer Vision Image Classification", "framework": "Hugging Face Transformers", "functionality": "Image Classification", "api_name": "google/mobilenet_v1_0.75_192", "api_call": "AutoModelForImageClassification.from_pretrained('google/mobilenet_v1_0.75_192')", "performance": {"dataset": "imagenet-1k", "accuracy": "Not provided"}, "description": "MobileNet V1 model pre-trained on ImageNet-1k at resolution 192x192. It was introduced in MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications by Howard et al, and first released in this repository. MobileNets are small, low-latency, low-power models parameterized to meet the resource constraints of a variety of use cases. They can be built upon for classification, detection, embeddings and segmentation similar to how other popular large scale models, such as Inception, are used. MobileNets can be run efficiently on mobile devices.", "model_name": "google/mobilenet_v1_0.75_192"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "wav2vec2", "api_name": "facebook/wav2vec2-large-960h-lv60-self", "api_call": "Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-large-960h-lv60-self')", "performance": {"dataset": "librispeech_asr", "accuracy": {"clean": 1.9, "other": 3.9}}, "description": "Facebook's Wav2Vec2 model pretrained and fine-tuned on 960 hours of Libri-Light and Librispeech on 16kHz sampled speech audio. The model was trained with Self-Training objective. The model is used for Automatic Speech Recognition and can be used as a standalone acoustic model.", "model_name": "facebook/wav2vec2-large-960h-lv60-self"}
{"domain": "Natural Language Processing Token Classification", "framework": "Transformers", "functionality": "Part-of-speech tagging", "api_name": "ckiplab/bert-base-chinese-pos", "api_call": "AutoModel.from_pretrained('ckiplab/bert-base-chinese-pos')", "performance": {"dataset": "", "accuracy": ""}, "description": "This project provides traditional Chinese transformers models (including ALBERT, BERT, GPT2) and NLP tools (including word segmentation, part-of-speech tagging, named entity recognition).", "model_name": "ckiplab/bert-base-chinese-pos"}
{"domain": "Multimodal Text-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image", "api_name": "stabilityai/sd-vae-ft-ema", "api_call": "StableDiffusionPipeline.from_pretrained('CompVis/stable-diffusion-v1-4', vae=AutoencoderKL.from_pretrained('stabilityai/sd-vae-ft-ema'))", "performance": {"dataset": {"COCO 2017 (256x256, val, 5000 images)": {"accuracy": {"rFID": 4.42, "PSNR": "23.8 +/- 3.9", "SSIM": "0.69 +/- 0.13", "PSIM": "0.96 +/- 0.27"}}, "LAION-Aesthetics 5+ (256x256, subset, 10000 images)": {"accuracy": {"rFID": 1.77, "PSNR": "26.7 +/- 4.8", "SSIM": "0.82 +/- 0.12", "PSIM": "0.67 +/- 0.34"}}}}, "description": "This is a fine-tuned VAE decoder for the Stable Diffusion Pipeline. It has been fine-tuned on a 1:1 ratio of LAION-Aesthetics and LAION-Humans datasets. The decoder can be used as a drop-in replacement for the existing autoencoder.", "model_name": "CompVis/stable-diffusion-v1-4"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "vilt-finetuned-vqasi", "api_call": "ViltModel.from_pretrained('tufa15nik/vilt-finetuned-vqasi')", "performance": {"dataset": "", "accuracy": ""}, "description": "A Visual Question Answering model fine-tuned on the VQASI dataset by tufa15nik using the ViLT architecture. The model is designed to answer questions based on the content of an input image.", "model_name": "tufa15nik/vilt-finetuned-vqasi"}
{"domain": "Audio Text-to-Speech", "framework": "speechbrain", "functionality": "Text-to-Speech", "api_name": "tts-hifigan-german", "api_call": "HIFIGAN.from_hparams(source='padmalcom/tts-hifigan-german', savedir=tmpdir_vocoder)", "performance": {"dataset": "custom German dataset", "accuracy": "Not specified"}, "description": "A HiFIGAN vocoder trained on a generated German dataset using mp3_to_training_data. The pre-trained model takes in input a spectrogram and produces a waveform in output. Typically, a vocoder is used after a TTS model that converts an input text into a spectrogram.", "model_name": "padmalcom/tts-hifigan-german"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Transformers", "functionality": "Table Question Answering", "api_name": "google/tapas-mini-finetuned-sqa", "api_call": "TapasForQuestionAnswering.from_pretrained('google/tapas-mini-finetuned-sqa')", "performance": {"dataset": "msr_sqa", "accuracy": 0.5148}, "description": "TAPAS mini model fine-tuned on Sequential Question Answering (SQA)", "model_name": "google/tapas-mini-finetuned-sqa"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "facebook/mask2former-swin-tiny-coco-instance", "api_call": "Mask2FormerForUniversalSegmentation.from_pretrained('facebook/mask2former-swin-tiny-coco-instance')", "performance": {"dataset": "COCO", "accuracy": "Not specified"}, "description": "Mask2Former model trained on COCO instance segmentation (tiny-sized version, Swin backbone). It was introduced in the paper Masked-attention Mask Transformer for Universal Image Segmentation and first released in this repository. This model addresses instance, semantic and panoptic segmentation with the same paradigm: by predicting a set of masks and corresponding labels. You can use this particular checkpoint for instance segmentation.", "model_name": "facebook/mask2former-swin-tiny-coco-instance"}
{"domain": "Natural Language Processing Conversational", "framework": "Hugging Face Transformers", "functionality": "text-generation", "api_name": "pygmalion-1.3b", "api_call": "pipeline('text-generation', 'PygmalionAI/pygmalion-1.3b')", "performance": {"dataset": "56MB of dialogue data", "accuracy": "Not provided"}, "description": "Pygmalion 1.3B is a proof-of-concept dialogue model based on EleutherAI's pythia-1.3b-deduped. It is designed for generating conversational responses and can be used with a specific input format that includes character persona, dialogue history, and user input message.", "model_name": "PygmalionAI/pygmalion-1.3b"}
{"domain": "Computer Vision Zero-Shot Image Classification", "framework": "Hugging Face", "functionality": "Zero-Shot Image Classification", "api_name": "laion/CLIP-convnext_base_w-laion_aesthetic-s13B-b82K", "api_call": "pipeline('image-classification', model='laion/CLIP-convnext_base_w-laion_aesthetic-s13B-b82K')", "performance": {"dataset": "ImageNet-1k", "accuracy": "70.8% to 71.7%"}, "description": "A series of CLIP ConvNeXt-Base (w/ wide embed dim) models trained on subsets LAION-5B using OpenCLIP. These models achieve between 70.8 and 71.7 zero-shot top-1 accuracy on ImageNet-1k. They can be used for zero-shot image classification, image and text retrieval, and other tasks.", "model_name": "laion/CLIP-convnext_base_w-laion_aesthetic-s13B-b82K"}
{"domain": "Tabular Tabular Regression", "framework": "Scikit-learn", "functionality": "baseline-trainer", "api_name": "srg/outhimar_64-Close-regression", "api_call": "joblib.load(hf_hub_download('srg/outhimar_64-Close-regression', 'sklearn_model.joblib'))", "performance": {"dataset": "outhimar_64", "accuracy": {"r2": 0.9998579999999999, "neg_mean_squared_error": -1.067685}}, "description": "Baseline Model trained on outhimar_64 to apply regression on Close. Disclaimer: This model is trained with dabl library as a baseline, for better results, use AutoTrain. Logs of training including the models tried in the process can be found in logs.txt.", "model_name": "srg/outhimar_64-Close-regression"}
{"domain": "Computer Vision Image-to-Image", "framework": "Hugging Face", "functionality": "Text-to-Image Diffusion Models", "api_name": "lllyasviel/sd-controlnet-scribble", "api_call": "ControlNetModel.from_pretrained('lllyasviel/sd-controlnet-scribble')", "performance": {"dataset": "500k scribble-image, caption pairs", "accuracy": "Not provided"}, "description": "ControlNet is a neural network structure to control diffusion models by adding extra conditions. This checkpoint corresponds to the ControlNet conditioned on Scribble images. It can be used in combination with Stable Diffusion.", "model_name": "lllyasviel/sd-controlnet-scribble"}
{"domain": "Natural Language Processing Translation", "framework": "Transformers", "functionality": "Translation", "api_name": "Helsinki-NLP/opus-mt-it-en", "api_call": "pipeline('translation_it_to_en', model='Helsinki-NLP/opus-mt-it-en')", "performance": {"dataset": "opus", "accuracy": {"BLEU": {"newssyscomb2009.it.en": 35.3, "newstest2009.it.en": 34.0, "Tatoeba.it.en": 70.9}, "chr-F": {"newssyscomb2009.it.en": 0.6000000000000001, "newstest2009.it.en": 0.594, "Tatoeba.it.en": 0.808}}}, "description": "A transformer model for Italian to English translation trained on the OPUS dataset. It can be used for translating Italian text to English.", "model_name": "Helsinki-NLP/opus-mt-it-en"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Image Segmentation", "api_name": "keremberke/yolov8s-pothole-segmentation", "api_call": "YOLO('keremberke/yolov8s-pothole-segmentation')", "performance": {"dataset": "pothole-segmentation", "accuracy": {"mAP@0.5(box)": 0.928, "mAP@0.5(mask)": 0.928}}, "description": "A YOLOv8 model for pothole segmentation. This model detects potholes in images and outputs bounding boxes and masks for the detected potholes.", "model_name": "keremberke/yolov8s-pothole-segmentation"}
{"domain": "Audio Audio Classification", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "wav2vec2-base-superb-sv", "api_call": "AutoModelForAudioXVector.from_pretrained('anton-l/wav2vec2-base-superb-sv')", "performance": {"dataset": "superb", "accuracy": "More information needed"}, "description": "This is a ported version of S3PRL's Wav2Vec2 for the SUPERB Speaker Verification task. The base model is wav2vec2-large-lv60, which is pretrained on 16kHz sampled speech audio. When using the model make sure that your speech input is also sampled at 16Khz. For more information refer to SUPERB: Speech processing Universal PERformance Benchmark.", "model_name": "anton-l/wav2vec2-base-superb-sv"}
{"domain": "Natural Language Processing Translation", "framework": "Transformers", "functionality": "Translation", "api_name": "opus-mt-fr-en", "api_call": "pipeline('translation_fr_to_en', model='Helsinki-NLP/opus-mt-fr-en')", "performance": {"dataset": "opus", "accuracy": {"BLEU": {"newsdiscussdev2015-enfr.fr.en": 33.1, "newsdiscusstest2015-enfr.fr.en": 38.7, "newssyscomb2009.fr.en": 30.3, "news-test2008.fr.en": 26.2, "newstest2009.fr.en": 30.2, "newstest2010.fr.en": 32.2, "newstest2011.fr.en": 33.0, "newstest2012.fr.en": 32.8, "newstest2013.fr.en": 33.9, "newstest2014-fren.fr.en": 37.8, "Tatoeba.fr.en": 57.5}}}, "description": "Helsinki-NLP/opus-mt-fr-en is a machine translation model trained to translate from French to English. It is based on the Marian NMT framework and trained on the OPUS dataset.", "model_name": "Helsinki-NLP/opus-mt-fr-en"}
{"domain": "Computer Vision Image Segmentation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "shi-labs/oneformer_coco_swin_large", "api_call": "'OneFormerForUniversalSegmentation.from_pretrained(shi-labs/oneformer_coco_swin_large)'", "performance": {"dataset": "ydshieh/coco_dataset_script", "accuracy": "Not provided"}, "description": "OneFormer model trained on the COCO dataset (large-sized version, Swin backbone). It was introduced in the paper OneFormer: One Transformer to Rule Universal Image Segmentation by Jain et al. and first released in this repository. OneFormer is the first multi-task universal image segmentation framework. It needs to be trained only once with a single universal architecture, a single model, and on a single dataset, to outperform existing specialized models across semantic, instance, and panoptic segmentation tasks. OneFormer uses a task token to condition the model on the task in focus, making the architecture task-guided for training, and task-dynamic for inference, all with a single model.", "model_name": "shi-labs/oneformer_coco_swin_large"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "lysandre/tiny-tapas-random-wtq", "api_call": "TapasForQuestionAnswering.from_pretrained('lysandre/tiny-tapas-random-wtq')", "performance": {"dataset": "WTQ", "accuracy": "Not provided"}, "description": "A tiny TAPAS model trained on the WikiTableQuestions dataset for table question answering tasks.", "model_name": "lysandre/tiny-tapas-random-wtq"}
{"domain": "Computer Vision Depth Estimation", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "dpt-large-redesign", "api_call": "AutoModelForDepthEstimation.from_pretrained('nielsr/dpt-large-redesign')", "performance": {"dataset": "", "accuracy": ""}, "description": "A depth estimation model based on the DPT architecture.", "model_name": "nielsr/dpt-large-redesign"}
{"domain": "Natural Language Processing Translation", "framework": "Hugging Face Transformers", "functionality": "Translation", "api_name": "Helsinki-NLP/opus-mt-nl-en", "api_call": "pipeline('translation_nl_to_en', model='Helsinki-NLP/opus-mt-nl-en')", "performance": {"dataset": "Tatoeba.nl.en", "accuracy": {"BLEU": 60.9, "chr-F": 0.749}}, "description": "A Dutch to English translation model based on the OPUS dataset, using a transformer-align architecture with normalization and SentencePiece pre-processing.", "model_name": "Helsinki-NLP/opus-mt-nl-en"}
{"domain": "Natural Language Processing Summarization", "framework": "Transformers", "functionality": "text2text-generation", "api_name": "csebuetnlp/mT5_multilingual_XLSum", "api_call": "AutoModelForSeq2SeqLM.from_pretrained('csebuetnlp/mT5_multilingual_XLSum')", "performance": {"dataset": "xsum", "accuracy": {"ROUGE-1": 36.5, "ROUGE-2": 13.934, "ROUGE-L": 28.988, "ROUGE-LSUM": 28.996, "loss": 2.067, "gen_len": 26.973}}, "description": "This repository contains the mT5 checkpoint finetuned on the 45 languages of XL-Sum dataset. It is a multilingual abstractive summarization model that supports text-to-text generation for 43 languages.", "model_name": "csebuetnlp/mT5_multilingual_XLSum"}
{"domain": "Audio Automatic Speech Recognition", "framework": "Hugging Face Transformers", "functionality": "Transformers", "api_name": "vitouphy/wav2vec2-xls-r-300m-phoneme", "api_call": "Wav2Vec2ForCTC.from_pretrained('vitouphy/wav2vec2-xls-r-300m-phoneme')", "performance": {"dataset": "None", "accuracy": {"Loss": 0.3327, "Cer": 0.1332}}, "description": "This model is a fine-tuned version of facebook/wav2vec2-xls-r-300m on the None dataset. It is designed for Automatic Speech Recognition tasks.", "model_name": "vitouphy/wav2vec2-xls-r-300m-phoneme"}
{"domain": "Natural Language Processing Zero-Shot Classification", "framework": "Hugging Face Transformers", "functionality": "Natural Language Inference", "api_name": "cointegrated/rubert-base-cased-nli-threeway", "api_call": "AutoModelForSequenceClassification.from_pretrained('cointegrated/rubert-base-cased-nli-threeway')", "performance": {"dataset": ["JOCI", "MNLI", "MPE", "SICK", "SNLI", "ANLI", "NLI-style FEVER", "IMPPRES"], "accuracy": {"ROC AUC": {"entailment": 0.91, "contradiction": 0.71, "neutral": 0.79}}}, "description": "This is the DeepPavlov/rubert-base-cased fine-tuned to predict the logical relationship between two short texts: entailment, contradiction, or neutral.", "model_name": "cointegrated/rubert-base-cased-nli-threeway"}
{"domain": "Natural Language Processing Text Generation", "framework": "Hugging Face Transformers", "functionality": "Feature Extraction", "api_name": "sberbank-ai/sbert_large_mt_nlu_ru", "api_call": "AutoModel.from_pretrained('sberbank-ai/sbert_large_mt_nlu_ru')", "performance": {"dataset": "Russian SuperGLUE", "accuracy": "Not provided"}, "description": "BERT large model multitask (cased) for Sentence Embeddings in Russian language.", "model_name": "sberbank-ai/sbert_large_mt_nlu_ru"}
{"domain": "Multimodal Text-to-Video", "framework": "Hugging Face", "functionality": "Text-to-Video", "api_name": "camenduru/text2-video-zero", "api_call": "pipeline('text-to-video', model='camenduru/text2-video-zero')", "performance": {"dataset": "", "accuracy": ""}, "description": "This model is used for generating videos from text inputs. It is based on the Hugging Face framework and can be used with the transformers library. The model is trained on a variety of text and video datasets, and can be used for tasks such as video summarization, video generation from text prompts, and more.", "model_name": "camenduru/text2-video-zero"}
{"domain": "Natural Language Processing Table Question Answering", "framework": "Hugging Face Transformers", "functionality": "Table Question Answering", "api_name": "navteca/tapas-large-finetuned-wtq", "api_call": "AutoModelForTableQuestionAnswering.from_pretrained('navteca/tapas-large-finetuned-wtq')", "performance": {"dataset": "wikisql", "accuracy": "Not provided"}, "description": "TAPAS large model fine-tuned on WikiTable Questions (WTQ). It is a BERT-like transformers model pretrained on a large corpus of English data from Wikipedia in a self-supervised fashion. It can be used for answering questions related to a table.", "model_name": "navteca/tapas-large-finetuned-wtq"}
{"domain": "Multimodal Visual Question Answering", "framework": "Hugging Face", "functionality": "Visual Question Answering", "api_name": "sheldonxxxx/OFA_model_weights", "api_call": "AutoModel.from_pretrained('sheldonxxxx/OFA_model_weights')", "performance": {"dataset": "", "accuracy": ""}, "description": "This is an unoffical mirror of the model weights for use with https://github.com/OFA-Sys/OFA. The original link is too slow when downloading from outside of China.", "model_name": "sheldonxxxx/OFA_model_weights"}
{"domain": "Audio Voice Activity Detection", "framework": "Hugging Face", "functionality": "Voice Activity Detection", "api_name": "d4data/Indian-voice-cloning", "api_call": "pipeline('voice-activity-detection', model='d4data/Indian-voice-cloning')", "performance": {"dataset": "", "accuracy": ""}, "description": "A model for detecting voice activity in Indian languages.", "model_name": "d4data/Indian-voice-cloning"}
{"domain": "Tabular Tabular Classification", "framework": "Hugging Face", "functionality": "Carbon Emissions", "api_name": "Xinhhd/autotrain-zhongxin-contest-49402119333", "api_call": "AutoModel.from_pretrained('Xinhhd/autotrain-zhongxin-contest-49402119333')", "performance": {"dataset": "Xinhhd/autotrain-data-zhongxin-contest", "accuracy": 0.889}, "description": "A multi-class classification model trained with AutoTrain to predict carbon emissions based on input features.", "model_name": "Xinhhd/autotrain-zhongxin-contest-49402119333"}
{"domain": "Audio Text-to-Speech", "framework": "ESPnet", "functionality": "Text-to-Speech", "api_name": "SYSPIN/Telugu_Male_TTS", "api_call": "pipeline('text-to-speech', model='SYSPIN/Telugu_Male_TTS')", "performance": {"dataset": "", "accuracy": ""}, "description": "A Telugu Male Text-to-Speech model using the ESPnet framework, provided by Hugging Face.", "model_name": "SYSPIN/Telugu_Male_TTS"}
