Compare commits

...

7 Commits

8 changed files with 38 additions and 13 deletions

.gitignore

@@ -10,6 +10,7 @@ build-em/
 build-debug/
 build-release/
 build-static/
+build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

CMakeLists.txt

@@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.0)

-project(whisper.cpp VERSION 1.2.0)
+project(whisper.cpp VERSION 1.2.1)

 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

Makefile

@@ -141,6 +141,8 @@ ifdef WHISPER_GPROF
 	CXXFLAGS += -pg
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
+	CFLAGS   += -mcpu=native
+	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
 	# Raspberry Pi 1, 2, 3

README.md

@@ -4,7 +4,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.2.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@@ -469,7 +469,9 @@ in [models](models).
 - [X] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
   - [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
   - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
-- [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)
+- [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
+  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
+  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)

 ## Examples

package.json

@@ -1,6 +1,6 @@
 {
   "name": "whisper.cpp",
-  "version": "1.2.0",
+  "version": "1.2.1",
   "description": "Whisper speech recognition",
   "main": "whisper.js",
   "scripts": {

whisper.cpp

@@ -592,16 +592,16 @@ struct whisper_context {
     mutable std::mt19937 rng; // used for sampling at t > 0.0

-    int lang_id;
+    int lang_id = 0; // english by default

     // [EXPERIMENTAL] token-level timestamps data
-    int64_t t_beg;
-    int64_t t_last;
+    int64_t t_beg  = 0;
+    int64_t t_last = 0;
     whisper_token tid_last;
     std::vector<float> energy; // PCM signal energy

     // [EXPERIMENTAL] speed-up techniques
-    int32_t exp_n_audio_ctx; // 0 - use default
+    int32_t exp_n_audio_ctx = 0; // 0 - use default

     void use_buf(struct ggml_context * ctx, int i) {
 #if defined(WHISPER_USE_SCRATCH)
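
These in-class default initializers are the substance of the fix: before this change, `lang_id` was only assigned when language auto-detection ran, so `whisper_full_lang_id()` could return an indeterminate value. A minimal standalone sketch of the idiom (illustrative only, not whisper.cpp code):

```cpp
#include <cstdio>

struct ctx_before { int lang_id;     }; // indeterminate until explicitly assigned
struct ctx_after  { int lang_id = 0; }; // default member initializer: always 0 ("en")

int main() {
    ctx_after ctx; // no constructor needed; lang_id is well-defined
    std::printf("lang_id = %d\n", ctx.lang_id); // prints: lang_id = 0
    return 0;
}
```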
@@ -805,7 +805,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
           MEM_REQ_SCRATCH3.at (model.type) +
     scale*MEM_REQ_MODEL.at   (model.type) +
     scale*MEM_REQ_KV_CROSS.at(model.type) +
-    scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
+    scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));

     // this is the memory required by one decoder
     const size_t mem_required_decoder =
@@ -2962,6 +2962,9 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.encoder_begin_callback           =*/ nullptr,
         /*.encoder_begin_callback_user_data =*/ nullptr,
+
+        /*.logits_filter_callback           =*/ nullptr,
+        /*.logits_filter_callback_user_data =*/ nullptr,
     };

     switch (strategy) {
@@ -3089,7 +3092,7 @@ static const std::vector<std::string> non_speech_tokens = {
 // - applies logit filters
 // - computes logprobs and probs
 static void whisper_process_logits(
-        const struct whisper_context & ctx,
+              struct whisper_context & ctx,
         const struct whisper_full_params params,
               struct whisper_decoder & decoder,
               float temperature) {
@@ -3145,6 +3148,9 @@ static void whisper_process_logits(
     logits[vocab.token_translate]  = -INFINITY;
     logits[vocab.token_transcribe] = -INFINITY;

+    if (params.logits_filter_callback) {
+        params.logits_filter_callback(&ctx, tokens_cur.data(), tokens_cur.size(), logits.data(), params.logits_filter_callback_user_data);
+    }

     // suppress non-speech tokens
     // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
@@ -3848,7 +3854,7 @@ int whisper_full(
             return a.sequence.sum_logprobs_all > b.sequence.sum_logprobs_all;
         });

-        unsigned int cur_c = 0;
+        uint32_t cur_c = 0;

         for (int j = 0; j < n_decoders_cur; ++j) {
             auto & decoder = ctx->decoders[j];
@@ -4333,7 +4339,7 @@ int whisper_full_n_segments(struct whisper_context * ctx) {
 }

 int whisper_full_lang_id(struct whisper_context * ctx) {
-    return ctx->lang_id;
+    return ctx->lang_id;
 }

 int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
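
For concreteness, here is a minimal sketch of a filter matching the call site added above: it receives the context, the tokens decoded so far, and the mutable logits array, and bans a caller-chosen set of token ids by setting their logits to -INFINITY, the same convention the surrounding code uses for special tokens. The `ban_list` type and `ban_tokens_filter` name are hypothetical, not part of the library:

```cpp
#include <cmath>  // INFINITY
#include <vector>

#include "whisper.h"

// hypothetical payload handed to the callback through user_data
struct ban_list {
    std::vector<whisper_token> tokens; // token ids to suppress
};

static void ban_tokens_filter(
        struct whisper_context * /*ctx*/,
        const whisper_token_data * /*tokens*/,
        int /*n_tokens*/,
        float * logits,
        void * user_data) {
    const auto * bans = static_cast<const ban_list *>(user_data);
    for (const whisper_token t : bans->tokens) {
        logits[t] = -INFINITY; // a -INFINITY logit can never be sampled
    }
}
```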

whisper.h

@@ -243,6 +243,16 @@ extern "C" {
     // If it returns false, the computation is aborted
     typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);

+    // Logits filter callback
+    // Can be used to modify the logits before sampling
+    // If not NULL, called after applying temperature to logits
+    typedef void (*whisper_logits_filter_callback)(
+            struct whisper_context * ctx,
+            const whisper_token_data * tokens,
+            int n_tokens,
+            float * logits,
+            void * user_data);

     // Parameters for the whisper_full() function
     // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
     // whisper_full_default_params()
@@ -315,6 +325,10 @@ extern "C" {
         // called each time before the encoder starts
         whisper_encoder_begin_callback encoder_begin_callback;
         void * encoder_begin_callback_user_data;
+
+        // called by each decoder to filter obtained logits
+        whisper_logits_filter_callback logits_filter_callback;
+        void * logits_filter_callback_user_data;
     };
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
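
Wiring the two new fields together, a hedged usage sketch (it assumes a loaded `whisper_context * ctx`, mono 16 kHz float samples in `pcm`, and the hypothetical `ban_tokens_filter`/`ban_list` from the sketch above):

```cpp
ban_list bans;
bans.tokens = { /* application-specific token ids */ };

struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.logits_filter_callback           = ban_tokens_filter;
wparams.logits_filter_callback_user_data = &bans;

// whisper_process_logits() invokes the filter once per decoder step,
// after temperature is applied to the logits and before sampling
if (whisper_full(ctx, wparams, pcm.data(), (int) pcm.size()) != 0) {
    fprintf(stderr, "whisper_full() failed\n");
}
```

Since the filter runs on every decoder step, it should be cheap; heavy work inside it slows decoding directly.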