whisper : add OpenVINO support (#1037)

* openvino: use OpenVINO encoder inference

* openvino: add python script for OpenVINO model generation

* whisper: Fix 'unused' warnings when OpenVINO isn't enabled in build

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* whisper: Fix compilation error

* whisper: revert whisper_get_openvino_path_encoder & whisper_get_openvino_path_cache to non-const func signatures

* cmake: Add openvino-encoder as separate object target

* whisper : minor style fixes

* minor : indentation fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Ryan Metcalfe 2023-07-04 08:56:11 -04:00 committed by GitHub
parent 176d7e4e7b
commit 62b81276e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 367 additions and 3 deletions

View File

@ -54,6 +54,8 @@ option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
option(WHISPER_NO_FMA "whisper: disable FMA" OFF)
option(WHISPER_NO_F16C "whisper: disable F16c" OFF)
option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
if (APPLE)
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
@ -192,6 +194,10 @@ if (WHISPER_CLBLAST)
endif()
endif()
# OpenVINO is only looked up when explicitly requested; configuration fails
# if the Runtime component cannot be found.
if (WHISPER_OPENVINO)
    find_package(OpenVINO REQUIRED COMPONENTS Runtime)
endif()
# compiler flags
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@ -297,6 +303,24 @@ if (WHISPER_COREML)
)
endif()
if (WHISPER_OPENVINO)
    # The OpenVINO encoder wrapper is compiled as a separate OBJECT library.
    set(TARGET whisper.openvino)

    add_library(${TARGET} OBJECT
        openvino/whisper-openvino-encoder.h
        openvino/whisper-openvino-encoder.cpp
        )

    target_include_directories(${TARGET} PUBLIC .)

    # openvino::runtime is the imported target provided by find_package(OpenVINO)
    target_link_libraries(${TARGET} PRIVATE openvino::runtime)

    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)

    # Enables the WHISPER_USE_OPENVINO code paths
    # (presumably WHISPER_EXTRA_FLAGS is applied to the main whisper target - confirm).
    list(APPEND WHISPER_EXTRA_FLAGS -DWHISPER_USE_OPENVINO)
endif()
#
# whisper - this is the main library of the project
#
@ -322,6 +346,10 @@ if (WHISPER_COREML)
target_link_libraries(${TARGET} PRIVATE whisper.coreml)
endif()
# Link the OpenVINO encoder wrapper (an OBJECT library) into ${TARGET}
# (presumably the main whisper library at this point - confirm).
# NOTE(review): linking OBJECT libraries via target_link_libraries needs CMake >= 3.12;
# confirm this matches the project's cmake_minimum_required.
if (WHISPER_OPENVINO)
target_link_libraries(${TARGET} PRIVATE whisper.openvino)
endif()
if (MSVC)
target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})

View File

@ -95,6 +95,8 @@ struct whisper_params {
// [TDRZ] speaker turn string
std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
std::string openvino_encode_device = "CPU";
std::vector<std::string> fname_inp = {};
std::vector<std::string> fname_out = {};
};
@ -155,6 +157,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if ( arg == "--prompt") { params.prompt = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
@ -207,6 +210,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
fprintf(stderr, "\n");
}
@ -809,6 +813,9 @@ int main(int argc, char ** argv) {
return 3;
}
// initialize openvino encoder. This has no effect on whisper.cpp builds that don't have OpenVINO configured.
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];

View File

@ -0,0 +1,53 @@
import argparse
import torch
from whisper import load_model
import os
from openvino.tools import mo
from openvino.runtime import serialize
import shutil
def convert_encoder(hparams, encoder, mname):
    """Export a Whisper encoder to OpenVINO IR.

    The encoder is first exported to ONNX in a temporary ``onnx_encoder`` folder
    next to this script, then converted to OpenVINO IR (FP16-compressed) and
    serialized as ``ggml-<mname>-encoder-openvino.xml`` in the current working
    directory. The temporary folder is removed afterwards, also on failure.

    Args:
        hparams: model dimensions; ``n_mels`` is read from it when present
            (assumes the attribute layout of openai-whisper's model dims - confirm).
        encoder: the torch encoder module to export.
        mname: model name used to build the output file name.
    """
    encoder.eval()

    # Dummy input used only to trace the graph during ONNX export.
    # Fall back to 80 mel channels when hparams does not expose n_mels.
    n_mels = getattr(hparams, "n_mels", 80)
    mel = torch.zeros((1, n_mels, 3000))

    # Directory for the onnx model and other collateral saved during the onnx export
    onnx_folder = os.path.join(os.path.dirname(__file__), "onnx_encoder")
    os.makedirs(onnx_folder, exist_ok=True)

    try:
        onnx_path = os.path.join(onnx_folder, "whisper_encoder.onnx")

        torch.onnx.export(
            encoder,
            mel,
            onnx_path,
            input_names=["mel"],
            output_names=["output_features"]
        )

        # use model optimizer to convert onnx to OpenVINO IR format
        encoder_model = mo.convert_model(onnx_path, compress_to_fp16=True)
        serialize(encoder_model, xml_path='ggml-' + mname + '-encoder-openvino.xml')
    finally:
        # cleanup, even when the export or the conversion failed
        if os.path.isdir(onnx_folder):
            shutil.rmtree(onnx_folder)
if __name__ == "__main__":
    # Single source of truth for the accepted model names; argparse rejects anything
    # else with a usage message instead of a traceback.
    supported_models = ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, choices=supported_models, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
    args = parser.parse_args()

    # Load the model on CPU; only its encoder is exported.
    whisper = load_model(args.model).cpu()
    hparams = whisper.dims

    encoder = whisper.encoder

    # Convert encoder to onnx, then to OpenVINO IR
    convert_encoder(hparams, encoder, args.model)

View File

@ -0,0 +1,2 @@
openvino-dev[pytorch,onnx]
openai-whisper

View File

@ -0,0 +1,108 @@
#include "openvino/whisper-openvino-encoder.h"
#include "ggml.h"
#include <openvino/openvino.hpp>
#include <iostream>
// State of the OpenVINO encoder: the infer request created by
// whisper_openvino_init() and executed by whisper_openvino_encode().
struct whisper_openvino_context {
ov::InferRequest inferRequest;
};
struct whisper_openvino_context * whisper_openvino_init(const char* path_model,
const char* device,
const char* cache_dir)
{
if (!path_model || !device) {
fprintf(stderr, "%s: path_model and/or device is null\n", __func__);
return nullptr;
}
fprintf(stderr, "%s: path_model = %s, device = %s, cache_dir = %s\n",
__func__, path_model, device, cache_dir ? cache_dir : "(not set)");
whisper_openvino_context *context = new whisper_openvino_context;
try {
ov::Core core;
if (cache_dir) {
// enables caching of device-specific 'blobs' during core.compile_model
// routine. This speeds up calls to compile_model for successive runs.
core.set_property(ov::cache_dir(cache_dir));
}
//Read the OpenVINO encoder IR (.xml/.bin) from disk, producing an ov::Model object.
std::shared_ptr<ov::Model> model = core.read_model(path_model);
// Produce a compiled-model object, given the device ("CPU", "GPU", etc.)
auto compiledModel = core.compile_model(model, device);
// From the compiled model object, create an infer request. This is the thing that we
// we will use later on to trigger inference execution.
context->inferRequest = compiledModel.create_infer_request();
}
catch (const std::exception& error) {
std::cout << "in openvino encoder compile routine: exception: " << error.what() << std::endl;
delete context;
context = nullptr;
}
return context;
}
// Release a context obtained from whisper_openvino_init(). Passing null is allowed.
void whisper_openvino_free(struct whisper_openvino_context * ctx) {
    // deleting a null pointer is a no-op, so no explicit check is needed
    delete ctx;
}
// Run the OpenVINO encoder.
//
//   ctx: context returned by whisper_openvino_init()
//   mel: 2-dimensional F32 input tensor
//   out: 2-dimensional F32 tensor that receives the encoder output
//
// The ggml buffers are wrapped in place (no copy) as OpenVINO tensors.
// Returns 1 on success, 0 on failure. No exception leaves this function.
int whisper_openvino_encode(
    whisper_openvino_context* ctx,
    ggml_tensor* mel,
    ggml_tensor* out) {

    if (!ctx || !mel || !out) {
        fprintf(stderr, "%s: Error! ctx / mel / out is null\n", __func__);
        return 0;
    }

    if (mel->n_dims != 2) {
        fprintf(stderr, "%s: Error! mel ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
            __func__, mel->n_dims);
        return 0;
    }

    if (out->n_dims != 2) {
        fprintf(stderr, "%s: Error! out ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
            __func__, out->n_dims);
        return 0;
    }

    // both buffers are handed to OpenVINO as f32 below, so any other type would be misread
    if (mel->type != GGML_TYPE_F32 || out->type != GGML_TYPE_F32) {
        fprintf(stderr, "%s: Error! mel / out ggml_tensor expected to be of type F32\n", __func__);
        return 0;
    }

    try {
        // wrap the passed-in mel ggml_tensor as an OpenVINO Tensor object, and set as input tensor to infer request
        {
            // note, we populate shape & stride dimensions in opposite order from how they are listed in ne / nb arrays
            ov::Shape input_shape = { 1, static_cast<size_t>(mel->ne[1]), static_cast<size_t>(mel->ne[0]) };
            ov::Strides input_strides = { mel->nb[2], mel->nb[1], mel->nb[0] };
            ov::Tensor input_tensor(ov::element::f32, input_shape, mel->data, input_strides);
            ctx->inferRequest.set_input_tensor(input_tensor);
        }

        // wrap the passed-in out ggml_tensor as an OpenVINO Tensor object, and set as output tensor to infer request
        {
            // note, we populate shape & stride dimensions in opposite order from how they are listed in ne / nb arrays
            ov::Shape output_shape = { 1, static_cast<size_t>(out->ne[1]), static_cast<size_t>(out->ne[0]) };
            ov::Strides output_strides = { out->nb[2], out->nb[1], out->nb[0] };
            ov::Tensor out_tensor(ov::element::f32, output_shape, out->data, output_strides);
            ctx->inferRequest.set_output_tensor(out_tensor);
        }

        // run inference
        ctx->inferRequest.infer();
    }
    catch (const std::exception& error) {
        // diagnostics go to stderr, like the checks above, so that stdout stays clean
        fprintf(stderr, "%s: in openvino encode inference execution routine: exception: %s\n", __func__, error.what());
        return 0;
    }
    catch (...) {
        fprintf(stderr, "%s: in openvino encode inference execution routine: unknown exception\n", __func__);
        return 0;
    }

    return 1;
}

View File

@ -0,0 +1,31 @@
// Wrapper of the OpenVINO Whisper Encoder model
//
// All declarations use explicit 'struct' tags so that this header can be
// included from both C and C++ translation units.

#ifdef __cplusplus
extern "C" {
#endif

struct whisper_openvino_context;

// initialize openvino encoder, given path to model xml, device ("CPU", "GPU", etc.), and
// path to cache_dir. Returns null upon failure.
struct whisper_openvino_context * whisper_openvino_init(const char * path_model,
                                                        const char * device,
                                                        const char * cache_dir);

// clean up a ctx previously returned from whisper_openvino_init()
void whisper_openvino_free(struct whisper_openvino_context * ctx);

struct ggml_tensor;

// Perform encode using OpenVINO.
// Returns 1 on success
// Returns 0 on failure
int whisper_openvino_encode(
    struct whisper_openvino_context * ctx,
    struct ggml_tensor * mel,
    struct ggml_tensor * out);

#ifdef __cplusplus
}
#endif

View File

@ -3,6 +3,10 @@
#include "coreml/whisper-encoder.h"
#endif
#if WHISPER_USE_OPENVINO
#include "openvino/whisper-openvino-encoder.h"
#endif
#include "ggml.h"
#include <algorithm>
@ -660,6 +664,10 @@ struct whisper_state {
whisper_coreml_context * ctx_coreml = nullptr;
#endif
#ifdef WHISPER_USE_OPENVINO
whisper_openvino_context * ctx_openvino = nullptr;
#endif
// [EXPERIMENTAL] token-level timestamps data
int64_t t_beg = 0;
int64_t t_last = 0;
@ -1478,7 +1486,13 @@ static bool whisper_encode_internal(
const bool use_coreml = wstate.ctx_coreml != nullptr;
#endif
if (!use_coreml) {
#ifndef WHISPER_USE_OPENVINO
const bool use_openvino = false;
#else
const bool use_openvino = wstate.ctx_openvino != nullptr;
#endif
if (!use_coreml && !use_openvino) {
// convolution + gelu
{
wstate.use_buf(ctx0, 1);
@ -1777,8 +1791,7 @@ static bool whisper_encode_internal(
}
}
#ifdef WHISPER_USE_COREML
else
{
else if (use_coreml) {
wstate.use_buf(ctx0, -1);
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
@ -1786,6 +1799,17 @@ static bool whisper_encode_internal(
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
}
#endif
#ifdef WHISPER_USE_OPENVINO
else if (use_openvino) {
wstate.use_buf(ctx0, -1);
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
return false;
}
}
#endif
// cur
//{
@ -2628,6 +2652,31 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
}
#endif
#ifdef WHISPER_USE_OPENVINO
// Replace the file extension (e.g. ".bin") with "-encoder-openvino.xml".
// Only a dot in the file name itself counts as an extension separator; dots in
// directory names are left untouched. Without an extension the suffix is appended.
static std::string whisper_get_openvino_path_encoder(std::string path_bin) {
    const auto sep = path_bin.find_last_of("/\\");
    const auto pos = path_bin.rfind('.');
    if (pos != std::string::npos && (sep == std::string::npos || pos > sep)) {
        path_bin.erase(pos);
    }

    path_bin += "-encoder-openvino.xml";

    return path_bin;
}
// Replace the file extension (e.g. ".bin") with "-encoder-openvino-cache".
// Only a dot in the file name itself counts as an extension separator; dots in
// directory names are left untouched. Without an extension the suffix is appended.
static std::string whisper_get_openvino_path_cache(std::string path_bin) {
    const auto sep = path_bin.find_last_of("/\\");
    const auto pos = path_bin.rfind('.');
    if (pos != std::string::npos && (sep == std::string::npos || pos > sep)) {
        path_bin.erase(pos);
    }

    path_bin += "-encoder-openvino-cache";

    return path_bin;
}
#endif
struct whisper_state * whisper_init_state(whisper_context * ctx) {
whisper_state * state = new whisper_state;
@ -2694,6 +2743,58 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
return state;
}
// Enable OpenVINO encode inference for the state owned by ctx.
//
//   openvino_model_path: path to the OpenVINO encoder IR; when nullptr it is derived
//                        from the ggml model path stored in ctx
//   openvino_device:     OpenVINO device name ("CPU", "GPU", etc.)
//   openvino_cache_dir:  cache directory; when nullptr it is derived from the ggml model
//                        path stored in ctx, and caching is left disabled when ctx has
//                        no model path
//
// Returns 1 on success and 0 on failure. Always returns 0 when the build does not
// have OpenVINO enabled.
int whisper_ctx_init_openvino_encoder(struct whisper_context* ctx,
    const char* openvino_model_path,
    const char* openvino_device,
    const char* openvino_cache_dir)
{
#ifndef WHISPER_USE_OPENVINO
    (void)(ctx);
    (void)(openvino_model_path);
    (void)(openvino_device);
    (void)(openvino_cache_dir);
    return 0;
#else
    // the encoder is attached to ctx->state, which does not exist for contexts
    // that were created without a state
    if (!ctx || !ctx->state) {
        fprintf(stderr, "%s: ctx is nullptr, or ctx has no state.\n", __func__);
        return 0;
    }

    if (!openvino_model_path && ctx->path_model.empty())
    {
        fprintf(stderr, "%s: openvino_model_path is nullptr, and ctx has no model_path set.\n", __func__);
        return 0;
    }

    std::string path_openvino;
    if (!openvino_model_path) {
        //if openvino_model_path is not set, attempt to find it in the same directory as ggml-<model>.bin model
        path_openvino = whisper_get_openvino_path_encoder(ctx->path_model);
    }
    else {
        path_openvino = openvino_model_path;
    }

    std::string path_openvino_cache_dir;
    if (openvino_cache_dir) {
        path_openvino_cache_dir = openvino_cache_dir;
    }
    else if (!ctx->path_model.empty()) {
        //if openvino_cache_dir is not set, set it as a dir residing next to ggml-<model>.bin
        path_openvino_cache_dir = whisper_get_openvino_path_cache(ctx->path_model);
    }

    // without any usable location, do not request a cache directory at all
    const char * cache_dir = path_openvino_cache_dir.empty() ? nullptr : path_openvino_cache_dir.c_str();

    // release an encoder left over from a previous call, so that it is not leaked
    if (ctx->state->ctx_openvino) {
        whisper_openvino_free(ctx->state->ctx_openvino);
        ctx->state->ctx_openvino = nullptr;
    }

    fprintf(stderr, "%s: loading OpenVINO model from '%s'\n", __func__, path_openvino.c_str());
    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);

    ctx->state->ctx_openvino = whisper_openvino_init(path_openvino.c_str(), openvino_device, cache_dir);
    if (!ctx->state->ctx_openvino) {
        fprintf(stderr, "%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_openvino.c_str());
        return 0;
    }

    fprintf(stderr, "%s: OpenVINO model loaded\n", __func__);

    return 1;
#endif
}
struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
@ -2848,6 +2949,13 @@ void whisper_free_state(struct whisper_state * state)
}
#endif
#ifdef WHISPER_USE_OPENVINO
if (state->ctx_openvino != nullptr) {
whisper_openvino_free(state->ctx_openvino);
state->ctx_openvino = nullptr;
}
#endif
delete state;
}
}
@ -3287,6 +3395,14 @@ static int whisper_has_coreml(void) {
#endif
}
// Reports whether this build was compiled with OpenVINO support: 1 if so, 0 otherwise.
static int whisper_has_openvino(void) {
#if defined(WHISPER_USE_OPENVINO)
    const int has_openvino = 1;
#else
    const int has_openvino = 0;
#endif
    return has_openvino;
}
const char * whisper_print_system_info(void) {
static std::string s;
@ -3304,6 +3420,7 @@ const char * whisper_print_system_info(void) {
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
return s.c_str();
}

View File

@ -110,6 +110,24 @@ extern "C" {
WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
// Given a context, enable use of OpenVINO for encode inference.
// The OpenVINO encoder is attached to the state owned by the context.
// model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
//             the path will be generated from the ggml model path that was passed
//             in to whisper_init_from_file. For example, if 'path_model' was
//             "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
//             assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
// device:     OpenVINO device to run inference on ("CPU", "GPU", etc.)
// cache_dir:  Optional cache directory that can speed up init time, especially for
//             GPU, by caching compiled 'blobs' there.
//             If set to nullptr, the cache directory is derived from the ggml model
//             path (e.g. "/path/to/ggml-base.en-encoder-openvino-cache").
// Returns 1 on success and 0 on failure. If OpenVINO is not enabled in build, this
// simply returns 0.
WHISPER_API int whisper_ctx_init_openvino_encoder(
struct whisper_context * ctx,
const char * model_path,
const char * device,
const char * cache_dir);
// Frees all allocated memory
WHISPER_API void whisper_free (struct whisper_context * ctx);
WHISPER_API void whisper_free_state(struct whisper_state * state);