mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-19 04:37:51 +00:00
models : change convert-pt-to-ggml to use .tiktoken tokenizer files (#725)
This commit is contained in:
parent
61128870b8
commit
62b51c3070
@ -39,6 +39,7 @@ import json
|
|||||||
import code
|
import code
|
||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import base64
|
||||||
|
|
||||||
#from transformers import GPTJForCausalLM
|
#from transformers import GPTJForCausalLM
|
||||||
#from transformers import GPT2TokenizerFast
|
#from transformers import GPT2TokenizerFast
|
||||||
@ -224,18 +225,14 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
|
|||||||
#code.interact(local=locals())
|
#code.interact(local=locals())
|
||||||
|
|
||||||
multilingual = hparams["n_vocab"] == 51865
|
multilingual = hparams["n_vocab"] == 51865
|
||||||
dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
|
tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
|
||||||
|
|
||||||
#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
|
|
||||||
#print(tokenizer)
|
|
||||||
#print(tokenizer.name_or_path)
|
|
||||||
#print(len(tokenizer.additional_special_tokens))
|
|
||||||
|
|
||||||
# output in the same directory as the model
|
# output in the same directory as the model
|
||||||
fname_out = dir_out + "/ggml-model.bin"
|
fname_out = dir_out + "/ggml-model.bin"
|
||||||
|
|
||||||
with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f:
|
with open(tokenizer, "rb") as f:
|
||||||
tokens = json.load(f)
|
contents = f.read()
|
||||||
|
tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
|
||||||
|
|
||||||
# use 16-bit or 32-bit floats
|
# use 16-bit or 32-bit floats
|
||||||
use_f16 = True
|
use_f16 = True
|
||||||
@ -271,9 +268,8 @@ byte_decoder = {v:k for k, v in byte_encoder.items()}
|
|||||||
fout.write(struct.pack("i", len(tokens)))
|
fout.write(struct.pack("i", len(tokens)))
|
||||||
|
|
||||||
for key in tokens:
|
for key in tokens:
|
||||||
text = bytearray([byte_decoder[c] for c in key])
|
fout.write(struct.pack("i", len(key)))
|
||||||
fout.write(struct.pack("i", len(text)))
|
fout.write(key)
|
||||||
fout.write(text)
|
|
||||||
|
|
||||||
for name in list_vars.keys():
|
for name in list_vars.keys():
|
||||||
data = list_vars[name].squeeze().numpy()
|
data = list_vars[name].squeeze().numpy()
|
||||||
|
Loading…
Reference in New Issue
Block a user