From c306a7fd89357866898c3bf709f362180d01b1c9 Mon Sep 17 00:00:00 2001
From: Qianhe Chen <54462604+chenqianhe@users.noreply.github.com>
Date: Sat, 4 Feb 2023 15:10:25 +0800
Subject: [PATCH] addon.node : using whisper as a Node.js addon (#443)

* addon: implement node addon call whisper through cpp
* addon: modify the license to MIT
* addon: remove iostream
* addon: rename dir
* addon: fix typo
* addon: configure cmake to build when cmake-js is used
---
 examples/CMakeLists.txt            |   2 +
 examples/addon.node/.gitignore     |   3 +
 examples/addon.node/CMakeLists.txt |  26 ++
 examples/addon.node/README.md      |  37 +++
 examples/addon.node/addon.cpp      | 421 +++++++++++++++++++++++++++++
 examples/addon.node/index.js       |  27 ++
 examples/addon.node/package.json   |  12 +
 7 files changed, 528 insertions(+)
 create mode 100644 examples/addon.node/.gitignore
 create mode 100644 examples/addon.node/CMakeLists.txt
 create mode 100644 examples/addon.node/README.md
 create mode 100644 examples/addon.node/addon.cpp
 create mode 100644 examples/addon.node/index.js
 create mode 100644 examples/addon.node/package.json

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 30c6a24a..01006c1a 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -24,6 +24,8 @@ if (EMSCRIPTEN)
     add_subdirectory(command.wasm)
     add_subdirectory(talk.wasm)
     add_subdirectory(bench.wasm)
+elseif(CMAKE_JS_VERSION)
+    add_subdirectory(addon.node)
 else()
     add_subdirectory(main)
     add_subdirectory(stream)
diff --git a/examples/addon.node/.gitignore b/examples/addon.node/.gitignore
new file mode 100644
index 00000000..b456cad9
--- /dev/null
+++ b/examples/addon.node/.gitignore
@@ -0,0 +1,3 @@
+.idea
+node_modules
+build
diff --git a/examples/addon.node/CMakeLists.txt b/examples/addon.node/CMakeLists.txt
new file mode 100644
index 00000000..142d5eda
--- /dev/null
+++ b/examples/addon.node/CMakeLists.txt
@@ -0,0 +1,26 @@
+set(TARGET whisper-addon)
+
+# Base settings
+#==================================================================
+# env var supported by cmake-js
+add_definitions(-DNAPI_VERSION=4)
+include_directories(${CMAKE_JS_INC})
+#==================================================================
+
+add_library(${TARGET} SHARED ${CMAKE_JS_SRC} addon.cpp)
+set_target_properties(${TARGET} PROPERTIES PREFIX "" SUFFIX ".node")
+
+include(DefaultTargetOptions)
+
+# Include N-API wrappers
+#==================================================================
+execute_process(COMMAND node -p "require('node-addon-api').include"
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE NODE_ADDON_API_DIR
+        )
+string(REPLACE "\n" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR})
+string(REPLACE "\"" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR})
+target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
+#==================================================================
+
+target_link_libraries(${TARGET} ${CMAKE_JS_LIB} whisper ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/addon.node/README.md b/examples/addon.node/README.md
new file mode 100644
index 00000000..d14dde41
--- /dev/null
+++ b/examples/addon.node/README.md
@@ -0,0 +1,37 @@
+# addon
+
+This is an addon demo that can **run whisper model inference in `node` and `electron` environments**, based on [cmake-js](https://github.com/cmake-js/cmake-js).
+It can be used as a reference for integrating whisper.cpp into other Node.js projects.
+
+## Install
+
+```shell
+npm install
+```
+
+## Compile
+
+Make sure you are in the project root directory and compile with cmake-js:
+
+```shell
+npx cmake-js compile -T whisper-addon
+```
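+
+After compiling, cmake-js places the addon at `build/Release/whisper-addon.node` in the project root (this is the path that `index.js` loads). A minimal sketch of loading and calling it directly from JavaScript (the model and audio paths below are placeholders):
+
+```js
+const path = require("path");
+const { whisper } = require(path.join(__dirname, "../../build/Release/whisper-addon"));
+
+// whisper() is synchronous and blocks until the transcription finishes
+const segments = whisper({
+  language: "en",
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+});
+
+// each segment is an array of strings: [ start, end, text ],
+// e.g. [ "00:00:00,000", "00:00:05,000", " ..." ]
+console.log(segments);
+```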
+
+For building an Electron addon and for other cmake-js options, see [cmake-js](https://github.com/cmake-js/cmake-js); only a few configuration changes are needed.
+
+> For example, to use a specific cmake binary:
+> ```shell
+> npx cmake-js compile -c 'xxx/cmake' -T whisper-addon
+> ```
+
+## Run
+
+```shell
+cd examples/addon.node
+
+node index.js --language='language' --model='model-path' --fname_inp='file-path'
+```
+
+Because this is a simple demo, only the above parameters can be set from the Node.js side.
+
+Other whisper parameters can be exposed in the same way.
diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp
new file mode 100644
index 00000000..053fb3a9
--- /dev/null
+++ b/examples/addon.node/addon.cpp
@@ -0,0 +1,421 @@
+#include <cmath>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "napi.h"
+
+#define DR_WAV_IMPLEMENTATION
+#include "dr_wav.h"
+
+#include "whisper.h"
+
+struct whisper_params {
+    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_processors = 1;
+    int32_t offset_t_ms  = 0;
+    int32_t offset_n     = 0;
+    int32_t duration_ms  = 0;
+    int32_t max_context  = -1;
+    int32_t max_len      = 0;
+    int32_t best_of      = 5;
+    int32_t beam_size    = -1;
+
+    float word_thold    = 0.01f;
+    float entropy_thold = 2.4f;
+    float logprob_thold = -1.0f;
+
+    bool speed_up       = false;
+    bool translate      = false;
+    bool diarize        = false;
+    bool output_txt     = false;
+    bool output_vtt     = false;
+    bool output_srt     = false;
+    bool output_wts     = false;
+    bool output_csv     = false;
+    bool print_special  = false;
+    bool print_colors   = false;
+    bool print_progress = false;
+    bool no_timestamps  = false;
+
+    std::string language = "en";
+    std::string prompt;
+    std::string model    = "../../ggml-large.bin";
+
+    std::vector<std::string> fname_inp  = {};
+    std::vector<std::string> fname_outp = {};
+};
+
+struct whisper_print_user_data {
+    const whisper_params * params;
+
+    const std::vector<std::vector<float>> * pcmf32s;
+};
+
+//  500 -> 00:05.000
+// 6000 -> 01:00.000
+std::string to_timestamp(int64_t t, bool comma = false) {
+    int64_t msec = t * 10;
+    int64_t hr = msec / (1000 * 60 * 60);
+    msec = msec - hr * (1000 * 60 * 60);
+    int64_t min = msec / (1000 * 60);
+    msec = msec - min * (1000 * 60);
+    int64_t sec = msec / 1000;
+    msec = msec - sec * 1000;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
+
+    return std::string(buf);
+}
+
+int timestamp_to_sample(int64_t t, int n_samples) {
+    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
+}
+
+void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
+    const auto & params  = *((whisper_print_user_data *) user_data)->params;
+    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
+
+    const int n_segments = whisper_full_n_segments(ctx);
+
+    std::string speaker = "";
+
+    int64_t t0;
+    int64_t t1;
+
+    // print the last n_new segments
+    const int s0 = n_segments - n_new;
+
+    if (s0 == 0) {
+        printf("\n");
+    }
+
+    for (int i = s0; i < n_segments; i++) {
+        if (!params.no_timestamps || params.diarize) {
+            t0 = whisper_full_get_segment_t0(ctx, i);
+            t1 = whisper_full_get_segment_t1(ctx, i);
+        }
+
+        if (!params.no_timestamps) {
+            printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
+        }
+
+        if (params.diarize && pcmf32s.size() == 2) {
+            const int64_t n_samples = pcmf32s[0].size();
+
+            const int64_t is0 = timestamp_to_sample(t0, n_samples);
+            const int64_t is1 = timestamp_to_sample(t1, n_samples);
+
+            double energy0 = 0.0f;
+            double energy1 = 0.0f;
+
+            for (int64_t j = is0; j < is1; j++) {
+                energy0 += fabs(pcmf32s[0][j]);
+                energy1 += fabs(pcmf32s[1][j]);
+            }
+
+            if (energy0 > 1.1*energy1) {
+                speaker = "(speaker 0)";
+            } else if (energy1 > 1.1*energy0) {
+                speaker = "(speaker 1)";
+            } else {
+                speaker = "(speaker ?)";
+            }
+
+            //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
+        }
+
+        // colorful print bug
+        //
+        const char * text = whisper_full_get_segment_text(ctx, i);
+        printf("%s%s", speaker.c_str(), text);
+
+        // with timestamps or speakers: each segment on new line
+        if (!params.no_timestamps || params.diarize) {
+            printf("\n");
+        }
+
+        fflush(stdout);
+    }
+}
+
+int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
+
+    if (params.fname_inp.empty()) {
+        fprintf(stderr, "error: no input files specified\n");
+        return 2;
+    }
+
+    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
+        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+        exit(0);
+    }
+
+    // whisper init
+
+    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
+
+    if (ctx == nullptr) {
+        fprintf(stderr, "error: failed to initialize whisper context\n");
+        return 3;
+    }
+
+    // initial prompt
+    std::vector<whisper_token> prompt_tokens;
+
+    if (!params.prompt.empty()) {
+        prompt_tokens.resize(1024);
+        prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));
+
+        fprintf(stderr, "\n");
+        fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
+        fprintf(stderr, "initial tokens: [ ");
+        for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
+            fprintf(stderr, "%d ", prompt_tokens[i]);
+        }
+        fprintf(stderr, "]\n");
+    }
+
+    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
+        const auto fname_inp = params.fname_inp[f];
+        const auto fname_outp = f < (int) params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
+
+        std::vector<float> pcmf32;               // mono-channel F32 PCM
+        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+
+        // WAV input
+        {
+            drwav wav;
+            std::vector<uint8_t> wav_data; // used for pipe input from stdin
+
+            if (fname_inp == "-") {
+                {
+                    uint8_t buf[1024];
+                    while (true)
+                    {
+                        const size_t n = fread(buf, 1, sizeof(buf), stdin);
+                        if (n == 0) {
+                            break;
+                        }
+                        wav_data.insert(wav_data.end(), buf, buf + n);
+                    }
+                }
+
+                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
+                    fprintf(stderr, "error: failed to open WAV file from stdin\n");
+                    return 4;
+                }
+
+                fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
+            }
+            else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
+                fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
+                return 5;
+            }
+
+            if (wav.channels != 1 && wav.channels != 2) {
+                fprintf(stderr, "error: WAV file '%s' must be mono or stereo\n", fname_inp.c_str());
+                return 6;
+            }
+
+            if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
+                fprintf(stderr, "error: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str());
+                return 6;
+            }
+
+            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
+                fprintf(stderr, "error: WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
+                return 8;
+            }
+
+            if (wav.bitsPerSample != 16) {
+                fprintf(stderr, "error: WAV file '%s' must be 16-bit\n", fname_inp.c_str());
+                return 9;
+            }
+
+            const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
+
+            std::vector<int16_t> pcm16;
+            pcm16.resize(n*wav.channels);
+            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
+            drwav_uninit(&wav);
+
+            // convert to mono, float
+            pcmf32.resize(n);
+            if (wav.channels == 1) {
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32[i] = float(pcm16[i])/32768.0f;
+                }
+            } else {
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
+                }
+            }
+
+            if (params.diarize) {
+                // convert to stereo, float
+                pcmf32s.resize(2);
+
+                pcmf32s[0].resize(n);
+                pcmf32s[1].resize(n);
+                for (uint64_t i = 0; i < n; i++) {
+                    pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
+                    pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
+                }
+            }
+        }
+
+        // print system information
+        {
+            fprintf(stderr, "\n");
+            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
+        }
+
+        // print some info about the processing
+        {
+            fprintf(stderr, "\n");
+            if (!whisper_is_multilingual(ctx)) {
+                if (params.language != "en" || params.translate) {
+                    params.language = "en";
+                    params.translate = false;
+                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+                }
+            }
+            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
+                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
+                    params.n_threads, params.n_processors,
+                    params.language.c_str(),
+                    params.translate ? "translate" : "transcribe",
+                    params.no_timestamps ? 0 : 1);
+
+            fprintf(stderr, "\n");
+        }
+
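+        // map the addon's whisper_params onto whisper_full_params:
+        //  - greedy sampling by default, beam search when beam_size > 1
+        //  - token-level timestamps only when word-level output is requested (output_wts / max_len)
+        //  - each new segment is printed through whisper_print_segment_callback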
+        // run the inference
+        {
+            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+            wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
+
+            wparams.print_realtime   = false;
+            wparams.print_progress   = params.print_progress;
+            wparams.print_timestamps = !params.no_timestamps;
+            wparams.print_special    = params.print_special;
+            wparams.translate        = params.translate;
+            wparams.language         = params.language.c_str();
+            wparams.n_threads        = params.n_threads;
+            wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
+            wparams.offset_ms        = params.offset_t_ms;
+            wparams.duration_ms      = params.duration_ms;
+
+            wparams.token_timestamps = params.output_wts || params.max_len > 0;
+            wparams.thold_pt         = params.word_thold;
+            wparams.entropy_thold    = params.entropy_thold;
+            wparams.logprob_thold    = params.logprob_thold;
+            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
+
+            wparams.speed_up         = params.speed_up;
+
+            wparams.greedy.best_of        = params.best_of;
+            wparams.beam_search.beam_size = params.beam_size;
+
+            wparams.prompt_tokens   = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
+            wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
+
+            whisper_print_user_data user_data = { &params, &pcmf32s };
+
+            // this callback is called on each new segment
+            if (!wparams.print_realtime) {
+                wparams.new_segment_callback           = whisper_print_segment_callback;
+                wparams.new_segment_callback_user_data = &user_data;
+            }
+
+            // example for abort mechanism
+            // in this example, we do not abort the processing, but we could if the flag is set to true
+            // the callback is called before every encoder run - if it returns false, the processing is aborted
+            {
+                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
+
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
+                    bool is_aborted = *(bool*)user_data;
+                    return !is_aborted;
+                };
+                wparams.encoder_begin_callback_user_data = &is_aborted;
+            }
+
+            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
+                fprintf(stderr, "failed to process audio\n");
+                return 10;
+            }
+        }
+    }
+
+    const int n_segments = whisper_full_n_segments(ctx);
+    result.resize(n_segments);
+    for (int i = 0; i < n_segments; ++i) {
+        const char * text = whisper_full_get_segment_text(ctx, i);
+        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+
+        result[i].emplace_back(to_timestamp(t0, true));
+        result[i].emplace_back(to_timestamp(t1, true));
+        result[i].emplace_back(text);
+    }
+
+    whisper_print_timings(ctx);
+    whisper_free(ctx);
+
+    return 0;
+}
+
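+// N-API entry point, exposed to JavaScript as whisper(params):
+//  - expects an object with "language", "model" and "fname_inp" string properties
+//  - runs the transcription synchronously (the call blocks until run() returns)
+//  - returns an array of segments, each one an array of [ t0, t1, text ] strings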
+Napi::Object whisper(const Napi::CallbackInfo& info) {
+    Napi::Env env = info.Env();
+    if (info.Length() <= 0 || !info[0].IsObject()) {
+        Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
+    }
+    whisper_params params;
+    std::vector<std::vector<std::string>> result;
+
+    Napi::Object whisper_params = info[0].As<Napi::Object>();
+    std::string language = whisper_params.Get("language").As<Napi::String>();
+    std::string model    = whisper_params.Get("model").As<Napi::String>();
+    std::string input    = whisper_params.Get("fname_inp").As<Napi::String>();
+
+    params.language = language;
+    params.model    = model;
+    params.fname_inp.emplace_back(input);
+
+    // run model
+    run(params, result);
+
+    fprintf(stderr, "RESULT:\n");
+    for (auto sentence : result) {
+        fprintf(stderr, "t0: %s, t1: %s, content: %s \n",
+                sentence[0].c_str(), sentence[1].c_str(), sentence[2].c_str());
+    }
+
+    Napi::Object res = Napi::Array::New(env, result.size());
+    for (u_int32_t i = 0; i < result.size(); ++i) {
+        Napi::Object tmp = Napi::Array::New(env, 3);
+        for (u_int32_t j = 0; j < 3; ++j) {
+            tmp[j] = Napi::String::New(env, result[i][j]);
+        }
+        res[i] = tmp;
+    }
+
+    return res;
+}
+
+Napi::Object Init(Napi::Env env, Napi::Object exports) {
+    exports.Set(
+        Napi::String::New(env, "whisper"),
+        Napi::Function::New(env, whisper)
+    );
+    return exports;
+}
+
+NODE_API_MODULE(whisper, Init);
diff --git a/examples/addon.node/index.js b/examples/addon.node/index.js
new file mode 100644
index 00000000..c9038fae
--- /dev/null
+++ b/examples/addon.node/index.js
@@ -0,0 +1,27 @@
+const path = require('path');
+const { whisper } = require(path.join(__dirname, '../../build/Release/whisper-addon'));
+
+const whisperParams = {
+  language: 'en',
+  model: path.join(__dirname, '../../models/ggml-base.en.bin'),
+  fname_inp: '',
+};
+
+const arguments = process.argv.slice(2);
+const params = Object.fromEntries(
+  arguments.reduce((pre, item) => {
+    if (item.startsWith("--")) {
+      return [...pre, item.slice(2).split("=")];
+    }
+    return pre;
+  }, []),
+);
+
+for (const key in params) {
+  if (whisperParams.hasOwnProperty(key)) {
+    whisperParams[key] = params[key];
+  }
+}
+
+console.log('whisperParams =', whisperParams);
+console.log(whisper(whisperParams));
diff --git a/examples/addon.node/package.json b/examples/addon.node/package.json
new file mode 100644
index 00000000..eaad69ce
--- /dev/null
+++ b/examples/addon.node/package.json
@@ -0,0 +1,12 @@
+{
+  "name": "whisper-addon",
+  "version": "0.0.0",
+  "description": "",
+  "main": "index.js",
+  "author": "Qanhe Chen",
+  "license": "MIT",
+  "devDependencies": {
+    "cmake-js": "^7.1.1",
+    "node-addon-api": "^5.0.0"
+  }
+}
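
Usage note (not part of the patch): the timestamps returned by the addon use the SRT comma format, so the result of `whisper()` can be written out as an `.srt` transcript with a few lines of JavaScript. A sketch, assuming the addon was built as described in the README and the script runs from `examples/addon.node` like `index.js` (model and audio paths are placeholders):

```js
const fs = require("fs");
const path = require("path");
const { whisper } = require(path.join(__dirname, "../../build/Release/whisper-addon"));

const segments = whisper({
  language: "en",
  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
});

// SRT format: sequence number, "start --> end", text, blank line
const srt = segments
  .map(([t0, t1, text], i) => `${i + 1}\n${t0} --> ${t1}\n${text.trim()}\n`)
  .join("\n");

fs.writeFileSync("output.srt", srt);
```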