models : add usage comments to the HF convert script (#157)

This commit is contained in:
Georgi Gerganov 2022-11-23 23:22:40 +02:00
parent 5698bddbc9
commit 00f46dbc1d
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
2 changed files with 118 additions and 101 deletions

View File

@ -1,3 +1,20 @@
# Convert Hugging Face fine-tuned models to ggml format
#
# Usage:
#
# git clone https://github.com/openai/whisper
# git clone https://github.com/ggerganov/whisper.cpp
# git clone https://huggingface.co/openai/whisper-medium
#
# python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
#
# This script is similar to "convert-pt-to-ggml.py"
#
# For more info:
#
# https://github.com/ggerganov/whisper.cpp/issues/157
#
import io
import os
import sys

View File

@ -44,107 +44,107 @@ import numpy as np
#from transformers import GPT2TokenizerFast
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
# Mapping of ISO 639-1 language codes to lowercase English language names.
# Copied verbatim from OpenAI Whisper's tokenizer (see the ref URL above):
# the *insertion order* of this dict matters, because Whisper assigns the
# special language tokens (<|en|>, <|zh|>, ...) by position in this table,
# so entries must not be reordered, added, or removed.
# NOTE: "iw" (Hebrew) and "jw" (Javanese) are the legacy/nonstandard codes
# used upstream — keep them as-is to stay compatible with the models.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "iw": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}
#LANGUAGES = {
# "en": "english",
# "zh": "chinese",
# "de": "german",
# "es": "spanish",
# "ru": "russian",
# "ko": "korean",
# "fr": "french",
# "ja": "japanese",
# "pt": "portuguese",
# "tr": "turkish",
# "pl": "polish",
# "ca": "catalan",
# "nl": "dutch",
# "ar": "arabic",
# "sv": "swedish",
# "it": "italian",
# "id": "indonesian",
# "hi": "hindi",
# "fi": "finnish",
# "vi": "vietnamese",
# "iw": "hebrew",
# "uk": "ukrainian",
# "el": "greek",
# "ms": "malay",
# "cs": "czech",
# "ro": "romanian",
# "da": "danish",
# "hu": "hungarian",
# "ta": "tamil",
# "no": "norwegian",
# "th": "thai",
# "ur": "urdu",
# "hr": "croatian",
# "bg": "bulgarian",
# "lt": "lithuanian",
# "la": "latin",
# "mi": "maori",
# "ml": "malayalam",
# "cy": "welsh",
# "sk": "slovak",
# "te": "telugu",
# "fa": "persian",
# "lv": "latvian",
# "bn": "bengali",
# "sr": "serbian",
# "az": "azerbaijani",
# "sl": "slovenian",
# "kn": "kannada",
# "et": "estonian",
# "mk": "macedonian",
# "br": "breton",
# "eu": "basque",
# "is": "icelandic",
# "hy": "armenian",
# "ne": "nepali",
# "mn": "mongolian",
# "bs": "bosnian",
# "kk": "kazakh",
# "sq": "albanian",
# "sw": "swahili",
# "gl": "galician",
# "mr": "marathi",
# "pa": "punjabi",
# "si": "sinhala",
# "km": "khmer",
# "sn": "shona",
# "yo": "yoruba",
# "so": "somali",
# "af": "afrikaans",
# "oc": "occitan",
# "ka": "georgian",
# "be": "belarusian",
# "tg": "tajik",
# "sd": "sindhi",
# "gu": "gujarati",
# "am": "amharic",
# "yi": "yiddish",
# "lo": "lao",
# "uz": "uzbek",
# "fo": "faroese",
# "ht": "haitian creole",
# "ps": "pashto",
# "tk": "turkmen",
# "nn": "nynorsk",
# "mt": "maltese",
# "sa": "sanskrit",
# "lb": "luxembourgish",
# "my": "myanmar",
# "bo": "tibetan",
# "tl": "tagalog",
# "mg": "malagasy",
# "as": "assamese",
# "tt": "tatar",
# "haw": "hawaiian",
# "ln": "lingala",
# "ha": "hausa",
# "ba": "bashkir",
# "jw": "javanese",
# "su": "sundanese",
#}
## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):