whisper.cpp/openvino/whisper-openvino-encoder.cpp

#include "openvino/whisper-openvino-encoder.h"
#include "ggml.h"
#include <openvino/openvino.hpp>
#include <cstdio>
#include <exception>
#include <iostream>
#include <memory>

// State carried between whisper_openvino_init() and whisper_openvino_encode().
struct whisper_openvino_context {
    // Infer request created from the compiled encoder model in whisper_openvino_init();
    // whisper_openvino_encode() binds its input/output tensors to it and calls infer() on it.
    ov::InferRequest inferRequest;
};

struct whisper_openvino_context * whisper_openvino_init(const char* path_model,
    const char* device,
    const char* cache_dir)
{
    if (!path_model || !device) {
        fprintf(stderr, "%s: path_model and/or device is null\n", __func__);
        return nullptr;
    }

    fprintf(stderr, "%s: path_model = %s, device = %s, cache_dir = %s\n",
        __func__, path_model, device, cache_dir ? cache_dir : "(not set)");

	whisper_openvino_context *context = new whisper_openvino_context;
    try {
        ov::Core core;

        if (cache_dir) {
            // enables caching of device-specific 'blobs' during core.compile_model
            // routine. This speeds up calls to compile_model for successive runs.
            core.set_property(ov::cache_dir(cache_dir));
        }

        //Read the OpenVINO encoder IR (.xml/.bin) from disk, producing an ov::Model object.
        std::shared_ptr<ov::Model> model = core.read_model(path_model);

        // Produce a compiled-model object, given the device ("CPU", "GPU", etc.)
        auto compiledModel = core.compile_model(model, device);

        // From the compiled model object, create an infer request. This is the thing that we
        //  we will use later on to trigger inference execution.
        context->inferRequest = compiledModel.create_infer_request();
    }
    catch (const std::exception& error) {
        std::cout << "in openvino encoder compile routine: exception: " << error.what() << std::endl;
        delete context;
        context = nullptr;
    }

    return context;
}

// Destroys a context previously returned by whisper_openvino_init().
// Passing a null pointer is allowed and does nothing.
void whisper_openvino_free(struct whisper_openvino_context * ctx) {
    // applying delete to a null pointer is a no-op, so no explicit guard is required
    delete ctx;
}

int whisper_openvino_encode(
    whisper_openvino_context* ctx,
    ggml_tensor* mel,
    ggml_tensor* out) {

    if (!ctx || !mel || !out) {
        fprintf(stderr, "%s: Error! ctx / mel / out is null\n", __func__);
        return 0;
    }

    if (mel->n_dims != 2) {
        fprintf(stderr, "%s: Error! mel ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
            __func__, mel->n_dims);
        return 0;
    }

    if (out->n_dims != 2) {
        fprintf(stderr, "%s: Error! out ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
            __func__, out->n_dims);
        return 0;
    }

    try {

        //wrap the passed-in mel ggml_tensor as an OpenVINO Tensor object, and set as input tensor to infer request
        {
            // note, we populate shape & stride dimensions in opposite order from how they are listed in ne / nb arrays
            ov::Shape input_shape = { 1, (unsigned long long)mel->ne[1], (unsigned long long)mel->ne[0] };
            ov::Strides input_strides = { mel->nb[2], mel->nb[1], mel->nb[0] };
            ov::Tensor input_tensor(ov::element::f32, input_shape, mel->data, input_strides);
            ctx->inferRequest.set_input_tensor(input_tensor);
        }

        //wrap the passed-in out ggml_tensor as an OpenVINO Tensor object, and set as output tensor to infer request
        {
            // note, we populate shape & stride dimensions in opposite order from how they are listed in ne / nb arrays
            ov::Shape output_shape = { 1, (unsigned long long)out->ne[1], (unsigned long long)out->ne[0] };
            ov::Strides output_strides = { out->nb[2], out->nb[1], out->nb[0] };
            ov::Tensor out_tensor(ov::element::f32, output_shape, out->data, output_strides);
            ctx->inferRequest.set_output_tensor(out_tensor);
        }

        //run inference
        ctx->inferRequest.infer();
    }
    catch (const std::exception& error) {
        std::cout << "in openvino encode inference execution routine: exception: " << error.what() << std::endl;
        return 0;
    }

    return 1;
}
whisper : add OpenVINO support (#1037) * openvino: use OpenVINO encoder inference * openvino: add python script for OpenVINO model generation * whisper: Fix 'unused' warnings when OpenVINO isn't enabled in build * Apply suggestions from code review Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * whisper: Fix compilation error * whisper: revert whisper_get_openvino_path_encoder & whisper_get_openvino_path_cache to non-const func signatures * cmake: Add openvino-encoder as separate object target * whisper : minor style fixes * minor : indentation fixes --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-04 12:56:11 +00:00			`#include "openvino/whisper-openvino-encoder.h"`
			`#include "ggml.h"`
			`#include <openvino/openvino.hpp>`
			`#include <iostream>`

			`struct whisper_openvino_context {`
			`ov::InferRequest inferRequest;`
			`};`

			`struct whisper_openvino_context * whisper_openvino_init(const char* path_model,`
			`const char* device,`
			`const char* cache_dir)`
			`{`
			`if (!path_model \|\| !device) {`
			`fprintf(stderr, "%s: path_model and/or device is null\n", __func__);`
			`return nullptr;`
			`}`

			`fprintf(stderr, "%s: path_model = %s, device = %s, cache_dir = %s\n",`
			`__func__, path_model, device, cache_dir ? cache_dir : "(not set)");`

			`whisper_openvino_context *context = new whisper_openvino_context;`
			`try {`
			`ov::Core core;`

			`if (cache_dir) {`
			`// enables caching of device-specific 'blobs' during core.compile_model`
			`// routine. This speeds up calls to compile_model for successive runs.`
			`core.set_property(ov::cache_dir(cache_dir));`
			`}`

			`//Read the OpenVINO encoder IR (.xml/.bin) from disk, producing an ov::Model object.`
			`std::shared_ptr<ov::Model> model = core.read_model(path_model);`

			`// Produce a compiled-model object, given the device ("CPU", "GPU", etc.)`
			`auto compiledModel = core.compile_model(model, device);`

			`// From the compiled model object, create an infer request. This is the thing that we`
			`// we will use later on to trigger inference execution.`
			`context->inferRequest = compiledModel.create_infer_request();`
			`}`
			`catch (const std::exception& error) {`
			`std::cout << "in openvino encoder compile routine: exception: " << error.what() << std::endl;`
			`delete context;`
			`context = nullptr;`
			`}`

			`return context;`
			`}`

			`void whisper_openvino_free(struct whisper_openvino_context * ctx) {`
			`if( ctx ) {`
			`delete ctx;`
			`}`
			`}`

			`int whisper_openvino_encode(`
			`whisper_openvino_context* ctx,`
			`ggml_tensor* mel,`
			`ggml_tensor* out) {`

			`if (!ctx \|\| !mel \|\| !out) {`
			`fprintf(stderr, "%s: Error! ctx / mel / out is null\n", __func__);`
			`return 0;`
			`}`

			`if (mel->n_dims != 2) {`
			`fprintf(stderr, "%s: Error! mel ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",`
			`__func__, mel->n_dims);`
			`return 0;`
			`}`

			`if (out->n_dims != 2) {`
			`fprintf(stderr, "%s: Error! out ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",`
			`__func__, out->n_dims);`
			`return 0;`
			`}`

			`try {`

			`//wrap the passed-in mel ggml_tensor as an OpenVINO Tensor object, and set as input tensor to infer request`
			`{`
			`// note, we populate shape & stride dimensions in opposite order from how they are listed in ne / nb arrays`
			`ov::Shape input_shape = { 1, (unsigned long long)mel->ne[1], (unsigned long long)mel->ne[0] };`
			`ov::Strides input_strides = { mel->nb[2], mel->nb[1], mel->nb[0] };`
			`ov::Tensor input_tensor(ov::element::f32, input_shape, mel->data, input_strides);`
			`ctx->inferRequest.set_input_tensor(input_tensor);`
			`}`

			`//wrap the passed-in out ggml_tensor as an OpenVINO Tensor object, and set as output tensor to infer request`
			`{`
			`// note, we populate shape & stride dimensions in opposite order from how they are listed in ne / nb arrays`
			`ov::Shape output_shape = { 1, (unsigned long long)out->ne[1], (unsigned long long)out->ne[0] };`
			`ov::Strides output_strides = { out->nb[2], out->nb[1], out->nb[0] };`
			`ov::Tensor out_tensor(ov::element::f32, output_shape, out->data, output_strides);`
			`ctx->inferRequest.set_output_tensor(out_tensor);`
			`}`

			`//run inference`
			`ctx->inferRequest.infer();`
			`}`
			`catch (const std::exception& error) {`
			`std::cout << "in openvino encode inference execution routine: exception: " << error.what() << std::endl;`
			`return 0;`
			`}`

			`return 1;`
			`}`