models : add usage comments to the HF convert script (#157)

This commit is contained in:
Georgi Gerganov 2022-11-23 23:22:40 +02:00
parent 5698bddbc9
commit 00f46dbc1d
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
2 changed files with 118 additions and 101 deletions

View File

@@ -1,3 +1,20 @@
# Convert Hugging Face fine-tuned models to ggml format
#
# Usage:
#
# git clone https://github.com/openai/whisper
# git clone https://github.com/ggerganov/whisper.cpp
# git clone https://huggingface.co/openai/whisper-medium
#
# python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
#
# This script is similar to "convert-pt-to-ggml.py"
#
# For more info:
#
# https://github.com/ggerganov/whisper.cpp/issues/157
#
import io
import os
import sys

View File

@@ -44,107 +44,107 @@ import numpy as np
#from transformers import GPT2TokenizerFast
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
-LANGUAGES = {
+#LANGUAGES = {
-    "en": "english",
+#    "en": "english",
-    "zh": "chinese",
+#    "zh": "chinese",
-    "de": "german",
+#    "de": "german",
-    "es": "spanish",
+#    "es": "spanish",
-    "ru": "russian",
+#    "ru": "russian",
-    "ko": "korean",
+#    "ko": "korean",
-    "fr": "french",
+#    "fr": "french",
-    "ja": "japanese",
+#    "ja": "japanese",
-    "pt": "portuguese",
+#    "pt": "portuguese",
-    "tr": "turkish",
+#    "tr": "turkish",
-    "pl": "polish",
+#    "pl": "polish",
-    "ca": "catalan",
+#    "ca": "catalan",
-    "nl": "dutch",
+#    "nl": "dutch",
-    "ar": "arabic",
+#    "ar": "arabic",
-    "sv": "swedish",
+#    "sv": "swedish",
-    "it": "italian",
+#    "it": "italian",
-    "id": "indonesian",
+#    "id": "indonesian",
-    "hi": "hindi",
+#    "hi": "hindi",
-    "fi": "finnish",
+#    "fi": "finnish",
-    "vi": "vietnamese",
+#    "vi": "vietnamese",
-    "iw": "hebrew",
+#    "iw": "hebrew",
-    "uk": "ukrainian",
+#    "uk": "ukrainian",
-    "el": "greek",
+#    "el": "greek",
-    "ms": "malay",
+#    "ms": "malay",
-    "cs": "czech",
+#    "cs": "czech",
-    "ro": "romanian",
+#    "ro": "romanian",
-    "da": "danish",
+#    "da": "danish",
-    "hu": "hungarian",
+#    "hu": "hungarian",
-    "ta": "tamil",
+#    "ta": "tamil",
-    "no": "norwegian",
+#    "no": "norwegian",
-    "th": "thai",
+#    "th": "thai",
-    "ur": "urdu",
+#    "ur": "urdu",
-    "hr": "croatian",
+#    "hr": "croatian",
-    "bg": "bulgarian",
+#    "bg": "bulgarian",
-    "lt": "lithuanian",
+#    "lt": "lithuanian",
-    "la": "latin",
+#    "la": "latin",
-    "mi": "maori",
+#    "mi": "maori",
-    "ml": "malayalam",
+#    "ml": "malayalam",
-    "cy": "welsh",
+#    "cy": "welsh",
-    "sk": "slovak",
+#    "sk": "slovak",
-    "te": "telugu",
+#    "te": "telugu",
-    "fa": "persian",
+#    "fa": "persian",
-    "lv": "latvian",
+#    "lv": "latvian",
-    "bn": "bengali",
+#    "bn": "bengali",
-    "sr": "serbian",
+#    "sr": "serbian",
-    "az": "azerbaijani",
+#    "az": "azerbaijani",
-    "sl": "slovenian",
+#    "sl": "slovenian",
-    "kn": "kannada",
+#    "kn": "kannada",
-    "et": "estonian",
+#    "et": "estonian",
-    "mk": "macedonian",
+#    "mk": "macedonian",
-    "br": "breton",
+#    "br": "breton",
-    "eu": "basque",
+#    "eu": "basque",
-    "is": "icelandic",
+#    "is": "icelandic",
-    "hy": "armenian",
+#    "hy": "armenian",
-    "ne": "nepali",
+#    "ne": "nepali",
-    "mn": "mongolian",
+#    "mn": "mongolian",
-    "bs": "bosnian",
+#    "bs": "bosnian",
-    "kk": "kazakh",
+#    "kk": "kazakh",
-    "sq": "albanian",
+#    "sq": "albanian",
-    "sw": "swahili",
+#    "sw": "swahili",
-    "gl": "galician",
+#    "gl": "galician",
-    "mr": "marathi",
+#    "mr": "marathi",
-    "pa": "punjabi",
+#    "pa": "punjabi",
-    "si": "sinhala",
+#    "si": "sinhala",
-    "km": "khmer",
+#    "km": "khmer",
-    "sn": "shona",
+#    "sn": "shona",
-    "yo": "yoruba",
+#    "yo": "yoruba",
-    "so": "somali",
+#    "so": "somali",
-    "af": "afrikaans",
+#    "af": "afrikaans",
-    "oc": "occitan",
+#    "oc": "occitan",
-    "ka": "georgian",
+#    "ka": "georgian",
-    "be": "belarusian",
+#    "be": "belarusian",
-    "tg": "tajik",
+#    "tg": "tajik",
-    "sd": "sindhi",
+#    "sd": "sindhi",
-    "gu": "gujarati",
+#    "gu": "gujarati",
-    "am": "amharic",
+#    "am": "amharic",
-    "yi": "yiddish",
+#    "yi": "yiddish",
-    "lo": "lao",
+#    "lo": "lao",
-    "uz": "uzbek",
+#    "uz": "uzbek",
-    "fo": "faroese",
+#    "fo": "faroese",
-    "ht": "haitian creole",
+#    "ht": "haitian creole",
-    "ps": "pashto",
+#    "ps": "pashto",
-    "tk": "turkmen",
+#    "tk": "turkmen",
-    "nn": "nynorsk",
+#    "nn": "nynorsk",
-    "mt": "maltese",
+#    "mt": "maltese",
-    "sa": "sanskrit",
+#    "sa": "sanskrit",
-    "lb": "luxembourgish",
+#    "lb": "luxembourgish",
-    "my": "myanmar",
+#    "my": "myanmar",
-    "bo": "tibetan",
+#    "bo": "tibetan",
-    "tl": "tagalog",
+#    "tl": "tagalog",
-    "mg": "malagasy",
+#    "mg": "malagasy",
-    "as": "assamese",
+#    "as": "assamese",
-    "tt": "tatar",
+#    "tt": "tatar",
-    "haw": "hawaiian",
+#    "haw": "hawaiian",
-    "ln": "lingala",
+#    "ln": "lingala",
-    "ha": "hausa",
+#    "ha": "hausa",
-    "ba": "bashkir",
+#    "ba": "bashkir",
-    "jw": "javanese",
+#    "jw": "javanese",
-    "su": "sundanese",
+#    "su": "sundanese",
-}
+#}
## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):