From 00f46dbc1db35e98007bdfb4fc69f7777fe78a50 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 23 Nov 2022 23:22:40 +0200 Subject: [PATCH] models : add usage comments to the HF convert script (#157) --- models/convert-h5-to-ggml.py | 17 +++ models/convert-pt-to-ggml.py | 202 +++++++++++++++++------------------ 2 files changed, 118 insertions(+), 101 deletions(-) diff --git a/models/convert-h5-to-ggml.py b/models/convert-h5-to-ggml.py index f236355..b882c4d 100644 --- a/models/convert-h5-to-ggml.py +++ b/models/convert-h5-to-ggml.py @@ -1,3 +1,20 @@ +# Convert Hugging Face fine-tuned models to ggml format +# +# Usage: +# +# git clone https://github.com/openai/whisper +# git clone https://github.com/ggerganov/whisper.cpp +# git clone https://huggingface.co/openai/whisper-medium +# +# python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper . +# +# This script is similar to "convert-pt-to-ggml.py" +# +# For more info: +# +# https://github.com/ggerganov/whisper.cpp/issues/157 +# + import io import os import sys diff --git a/models/convert-pt-to-ggml.py b/models/convert-pt-to-ggml.py index ef4759f..5cf9cf9 100644 --- a/models/convert-pt-to-ggml.py +++ b/models/convert-pt-to-ggml.py @@ -44,107 +44,107 @@ import numpy as np #from transformers import GPT2TokenizerFast # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110 -LANGUAGES = { - "en": "english", - "zh": "chinese", - "de": "german", - "es": "spanish", - "ru": "russian", - "ko": "korean", - "fr": "french", - "ja": "japanese", - "pt": "portuguese", - "tr": "turkish", - "pl": "polish", - "ca": "catalan", - "nl": "dutch", - "ar": "arabic", - "sv": "swedish", - "it": "italian", - "id": "indonesian", - "hi": "hindi", - "fi": "finnish", - "vi": "vietnamese", - "iw": "hebrew", - "uk": "ukrainian", - "el": "greek", - "ms": "malay", - "cs": "czech", - "ro": "romanian", - "da": "danish", - "hu": "hungarian", - "ta": "tamil", - "no": "norwegian", - "th": "thai", - "ur": "urdu", - "hr": "croatian", - "bg": "bulgarian", - "lt": "lithuanian", - "la": "latin", - "mi": "maori", - "ml": "malayalam", - "cy": "welsh", - "sk": "slovak", - "te": "telugu", - "fa": "persian", - "lv": "latvian", - "bn": "bengali", - "sr": "serbian", - "az": "azerbaijani", - "sl": "slovenian", - "kn": "kannada", - "et": "estonian", - "mk": "macedonian", - "br": "breton", - "eu": "basque", - "is": "icelandic", - "hy": "armenian", - "ne": "nepali", - "mn": "mongolian", - "bs": "bosnian", - "kk": "kazakh", - "sq": "albanian", - "sw": "swahili", - "gl": "galician", - "mr": "marathi", - "pa": "punjabi", - "si": "sinhala", - "km": "khmer", - "sn": "shona", - "yo": "yoruba", - "so": "somali", - "af": "afrikaans", - "oc": "occitan", - "ka": "georgian", - "be": "belarusian", - "tg": "tajik", - "sd": "sindhi", - "gu": "gujarati", - "am": "amharic", - "yi": "yiddish", - "lo": "lao", - "uz": "uzbek", - "fo": "faroese", - "ht": "haitian creole", - "ps": "pashto", - "tk": "turkmen", - "nn": "nynorsk", - "mt": "maltese", - "sa": "sanskrit", - "lb": "luxembourgish", - "my": "myanmar", - "bo": "tibetan", - "tl": "tagalog", - "mg": "malagasy", - "as": "assamese", - "tt": "tatar", - "haw": "hawaiian", - "ln": "lingala", - "ha": "hausa", - "ba": "bashkir", - "jw": "javanese", - "su": "sundanese", -} +#LANGUAGES = { +# "en": "english", +# "zh": "chinese", +# "de": "german", +# "es": "spanish", +# "ru": "russian", +# "ko": "korean", +# "fr": "french", +# "ja": "japanese", +# "pt": "portuguese", +# "tr": "turkish", +# "pl": "polish", +# "ca": "catalan", +# "nl": "dutch", +# "ar": "arabic", +# "sv": "swedish", +# "it": "italian", +# "id": "indonesian", +# "hi": "hindi", +# "fi": "finnish", +# "vi": "vietnamese", +# "iw": "hebrew", +# "uk": "ukrainian", +# "el": "greek", +# "ms": "malay", +# "cs": "czech", +# "ro": "romanian", +# "da": "danish", +# "hu": "hungarian", +# "ta": "tamil", +# "no": "norwegian", +# "th": "thai", +# "ur": "urdu", +# "hr": "croatian", +# "bg": "bulgarian", +# "lt": "lithuanian", +# "la": "latin", +# "mi": "maori", +# "ml": "malayalam", +# "cy": "welsh", +# "sk": "slovak", +# "te": "telugu", +# "fa": "persian", +# "lv": "latvian", +# "bn": "bengali", +# "sr": "serbian", +# "az": "azerbaijani", +# "sl": "slovenian", +# "kn": "kannada", +# "et": "estonian", +# "mk": "macedonian", +# "br": "breton", +# "eu": "basque", +# "is": "icelandic", +# "hy": "armenian", +# "ne": "nepali", +# "mn": "mongolian", +# "bs": "bosnian", +# "kk": "kazakh", +# "sq": "albanian", +# "sw": "swahili", +# "gl": "galician", +# "mr": "marathi", +# "pa": "punjabi", +# "si": "sinhala", +# "km": "khmer", +# "sn": "shona", +# "yo": "yoruba", +# "so": "somali", +# "af": "afrikaans", +# "oc": "occitan", +# "ka": "georgian", +# "be": "belarusian", +# "tg": "tajik", +# "sd": "sindhi", +# "gu": "gujarati", +# "am": "amharic", +# "yi": "yiddish", +# "lo": "lao", +# "uz": "uzbek", +# "fo": "faroese", +# "ht": "haitian creole", +# "ps": "pashto", +# "tk": "turkmen", +# "nn": "nynorsk", +# "mt": "maltese", +# "sa": "sanskrit", +# "lb": "luxembourgish", +# "my": "myanmar", +# "bo": "tibetan", +# "tl": "tagalog", +# "mg": "malagasy", +# "as": "assamese", +# "tt": "tatar", +# "haw": "hawaiian", +# "ln": "lingala", +# "ha": "hausa", +# "ba": "bashkir", +# "jw": "javanese", +# "su": "sundanese", +#} ## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292 #def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):