How to get a list of all Hugging Face models using Python?

1.3k Views Asked by At

Is there any way to get a list of the models available on Hugging Face, e.g. for a specific task such as Automatic Speech Recognition (ASR)?

2

There are 2 best solutions below

1
On

Try something like:

import re

import requests
from bs4 import BeautifulSoup


def _parse_count(text):
  """Convert a counter as rendered on the page ('275', '1.2k') to an int.

  An empty string is treated as 0. Text that is neither a plain integer nor
  a number followed by a k/m/b suffix raises ValueError.
  """
  multipliers = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}
  text = text.strip().replace(",", "")
  if not text:
    return 0
  suffix = text[-1].lower()
  if suffix in multipliers:
    # round() rather than int() so float noise (e.g. 1149.999...) does not
    # truncate to the wrong value.
    return round(float(text[:-1]) * multipliers[suffix])
  return int(text)


def list_models(task, sort_by):
  """Returns first page of results from available models on huggingface.co

  Args:
    task: value for the ``pipeline_tag`` query parameter,
      e.g. "automatic-speech-recognition".
    sort_by: value for the ``sort`` query parameter, e.g. "downloads".

  Yields:
    One dict per ``<article>`` element found on the page, with keys:
      "model_name": href of the card's first link without the leading "/".
      "last_updated": ``datetime`` attribute of the card's ``<time>`` tag,
        or None when the card has no ``<time>`` tag.
      "downloaded": third "•"-separated text field of the card, kept as the
        abbreviated string shown on the page (e.g. "13.7M"); "0" if absent.
      "liked": fourth "•"-separated text field converted to int; 0 if absent.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status.
    requests.RequestException: on connection problems or timeout.

  Note: this scrapes rendered HTML, so it depends on the page layout.
  """
  # Let requests build the query string so the values are URL-encoded, and
  # always set a timeout so a stalled connection cannot hang the caller.
  response = requests.get(
      "https://huggingface.co/models",
      params={"pipeline_tag": task, "sort": sort_by},
      timeout=30,
  )
  # Fail loudly instead of silently parsing an error page into zero results.
  response.raise_for_status()
  # Name the parser explicitly: without it BeautifulSoup warns and picks
  # whichever parser happens to be installed.
  soup = BeautifulSoup(response.content.decode('utf8'), "html.parser")

  for model in soup.find_all('article'):
    link = model.find('a')
    if link is None or not link.attrs.get('href'):
      # Not a model card we can identify; skip rather than crash.
      continue

    # Collapse the card text onto one line, then split it on the "•"
    # separators into individual fields.
    flattened = model.text.replace('\n', ' ').replace('\t', ' ').replace('•', '\n')
    flattened = re.sub(r' +', ' ', flattened).strip()
    fields = [line.strip() for line in flattened.split('\n')]

    # Fields are positional (name, updated, downloads, likes); the trailing
    # counters may be missing, so index defensively instead of unpacking.
    downloaded = fields[2] if len(fields) > 2 else "0"
    liked = _parse_count(fields[3]) if len(fields) > 3 else 0

    time_tag = model.find('time')
    timestamp = time_tag.attrs.get('datetime') if time_tag is not None else None

    yield {
        "model_name": link.attrs['href'][1:],  # drop the leading "/"
        "last_updated": timestamp,
        "downloaded": downloaded,
        "liked": liked,
    }



# Example query: speech-recognition models, most-downloaded first.
task, sort_by = "automatic-speech-recognition", "downloads"

# Materialise the generator so the whole first page is shown at once.
list(list_models(task=task, sort_by=sort_by))

[out]:

[{'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-english',
  'last_updated': '2022-12-14T02:02:32',
  'downloaded': '13.7M',
  'liked': 31},
 {'model_name': 'pyannote/voice-activity-detection',
  'last_updated': '2022-10-28T13:46:55',
  'downloaded': '1.06M',
  'liked': 37},
 {'model_name': 'pyannote/speaker-diarization',
  'last_updated': '2022-11-17T13:45:04',
  'downloaded': '855k',
  'liked': 171},
 {'model_name': 'facebook/wav2vec2-large-960h-lv60-self',
  'last_updated': '2022-05-23T16:13:42',
  'downloaded': '636k',
  'liked': 71},
 {'model_name': 'yongjian/wav2vec2-large-a',
  'last_updated': '2022-10-22T07:21:15',
  'downloaded': '272k',
  'liked': 5},
 {'model_name': 'jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli',
  'last_updated': '2022-02-25T19:07:57',
  'downloaded': '248k',
  'liked': 4},
 {'model_name': 'facebook/wav2vec2-base-960h',
  'last_updated': '2022-11-14T21:37:23',
  'downloaded': '222k',
  'liked': 109},
 {'model_name': 'openai/whisper-tiny.en',
  'last_updated': '2023-02-23T15:31:39',
  'downloaded': '66.4k',
  'liked': 35},
 {'model_name': 'facebook/wav2vec2-xlsr-53-espeak-cv-ft',
  'last_updated': '2021-12-10T17:18:39',
  'downloaded': '62.1k',
  'liked': 11},
 {'model_name': 'maxidl/wav2vec2-large-xlsr-german',
  'last_updated': '2021-07-06T12:32:21',
  'downloaded': '60k',
  'liked': 0},
 {'model_name': 'pyannote/overlapped-speech-detection',
  'last_updated': '2022-10-28T13:46:33',
  'downloaded': '45.9k',
  'liked': 4},
 {'model_name': 'openai/whisper-tiny',
  'last_updated': '2023-03-10T17:15:01',
  'downloaded': '43.6k',
  'liked': 41},
 {'model_name': 'ceyda/wav2vec2-base-760-turkish',
  'last_updated': '2021-07-06T00:16:04',
  'downloaded': '42.1k',
  'liked': 2},
 {'model_name': 'openai/whisper-large-v2',
  'last_updated': '2023-03-10T17:15:07',
  'downloaded': '41.7k',
  'liked': 275},
 {'model_name': 'openai/whisper-small',
  'last_updated': '2023-03-10T17:15:13',
  'downloaded': '39.7k',
  'liked': 37},
 {'model_name': 'techiaith/wav2vec2-xlsr-ft-en-cy',
  'last_updated': '2023-03-02T06:30:13',
  'downloaded': '32.5k',
  'liked': 1},
 {'model_name': 'facebook/wav2vec2-xlsr-53-phon-cv-babel-ft',
  'last_updated': '2021-11-10T12:02:20',
  'downloaded': '31.2k',
  'liked': 1},
 {'model_name': 'comodoro/wav2vec2-xls-r-300m-cs-250',
  'last_updated': '2022-03-23T18:26:50',
  'downloaded': '31.1k',
  'liked': 0},
 {'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-russian',
  'last_updated': '2022-12-14T01:58:43',
  'downloaded': '29.3k',
  'liked': 12},
 {'model_name': 'facebook/hubert-large-ls960-ft',
  'last_updated': '2022-05-24T10:43:42',
  'downloaded': '23.8k',
  'liked': 27},
 {'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-japanese',
  'last_updated': '2022-12-14T01:58:09',
  'downloaded': '21.7k',
  'liked': 9},
 {'model_name': 'openai/whisper-base',
  'last_updated': '2023-03-10T17:13:49',
  'downloaded': '20.5k',
  'liked': 52},
 {'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-dutch',
  'last_updated': '2022-12-14T01:58:20',
  'downloaded': '20.1k',
  'liked': 0},
 {'model_name': 'openai/whisper-medium',
  'last_updated': '2023-03-10T17:15:08',
  'downloaded': '20k',
  'liked': 40},
 {'model_name': 'jonatasgrosman/wav2vec2-large-xlsr-53-portuguese',
  'last_updated': '2022-12-14T01:59:47',
  'downloaded': '19.2k',
  'liked': 11},
 {'model_name': 'facebook/wav2vec2-large-960h',
  'last_updated': '2022-04-05T16:40:42',
  'downloaded': '15.5k',
  'liked': 14},
 {'model_name': 'facebook/data2vec-audio-base-960h',
  'last_updated': '2022-05-24T10:41:22',
  'downloaded': '15.4k',
  'liked': 7},
 {'model_name': 'theainerd/Wav2Vec2-large-xlsr-hindi',
  'last_updated': '2023-03-20T05:28:11',
  'downloaded': '15.3k',
  'liked': 2},
 {'model_name': 'tyoc213/wav2vec2-large-xlsr-nahuatl',
  'last_updated': '2021-04-07T02:59:04',
  'downloaded': '12.5k',
  'liked': 1},
 {'model_name': 'openai/whisper-large',
  'last_updated': '2023-03-10T17:15:11',
  'downloaded': '12k',
  'liked': 228}]

I have also posted a feature request for this at https://discuss.huggingface.co/t/feature-request-listing-available-models-datasets-and-metrics/34389

0
On

You can use huggingface_hub with list_models and a ModelFilter:

from huggingface_hub import HfApi, ModelFilter
# Client for the Hugging Face Hub API.
api = HfApi()

# Restrict the listing to models tagged with the ASR task.
asr_filter = ModelFilter(task="automatic-speech-recognition")

# list_models' result is wrapped in list() so it can be measured and indexed.
models = list(api.list_models(filter=asr_filter))

print(len(models))
print(models[0].modelId)

Output:

7195
13048909972/wav2vec2-common_voice-tr-demo