wip ignore

yt-wsp.sh : add unique filename generation (#495 )
Co-authored-by: genevera <genevera@noreply.users.github.com>
2025-06-27 18:28:42 +00:00 · 2023-02-15 19:11:12 +02:00 · 2023-02-14 20:12:51 +02:00 · 2023-02-14 20:04:03 +02:00 · 2023-02-11 17:35:33 +02:00 · 2023-02-11 09:13:32 +02:00
63 changed files with 5409 additions and 2006 deletions
--- a/.github/workflows/bindings.yml
+++ b/.github/workflows/bindings.yml
@ -0,0 +1,22 @@
 name: Bindings Tests
 on:
  push:
    paths:
      - bindings/go/**
      - whisper.h
  pull_request:
    paths:
      - bindings/go/**
      - whisper.h
 jobs:
  ubuntu-latest:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/setup-go@v3
        with:
          go-version: '^1.19'
      - uses: actions/checkout@v1
      - run: |
          cd bindings/go
          make test
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -1,267 +1,267 @@
 name: CI
-on: [push]
+on: [push, pull_request]
 jobs:
-    ubuntu-latest:
+  ubuntu-latest:
-        runs-on: ubuntu-latest
+    runs-on: ubuntu-latest
-        steps:
+    steps:
-            - name: Clone
+      - name: Clone
-              uses: actions/checkout@v1
+        uses: actions/checkout@v1
-            - name: Dependencies
+      - name: Dependencies
-              run: |
+        run: |
-                  sudo apt-get update
+          sudo apt-get update
-                  sudo apt-get install build-essential
+          sudo apt-get install build-essential
-                  sudo apt-get install libsdl2-dev
+          sudo apt-get install libsdl2-dev
-            - name: Build
+      - name: Build
-              run: |
+        run: |
-                make
+          make
-                make stream
+          make stream
-    macOS-latest:
+  macOS-latest:
-        runs-on: macOS-latest
+    runs-on: macOS-latest
-        steps:
+    steps:
-            - name: Clone
+      - name: Clone
-              uses: actions/checkout@v1
+        uses: actions/checkout@v1
-            - name: Dependencies
+      - name: Dependencies
-              run: |
+        run: |
-                  brew update
+          brew update
-                  brew install sdl2
+          brew install sdl2
-            - name: Build
+      - name: Build
-              run: |
+        run: |
-                make
+          make
-                make stream
+          make stream
-    ubuntu-latest-gcc:
+  ubuntu-latest-gcc:
-        runs-on: ubuntu-latest
+    runs-on: ubuntu-latest
-        strategy:
+    strategy:
-            matrix:
+      matrix:
-                build: [Debug, Release]
+        build: [Debug, Release]
-        steps:
+    steps:
-            - name: Clone
+      - name: Clone
-              uses: actions/checkout@v1
+        uses: actions/checkout@v1
-            - name: Dependencies
+      - name: Dependencies
-              run: |
+        run: |
-                  sudo apt-get update
+          sudo apt-get update
-                  sudo apt-get install build-essential
+          sudo apt-get install build-essential
-                  sudo apt-get install cmake
+          sudo apt-get install cmake
-                  sudo apt-get install libsdl2-dev
+          sudo apt-get install libsdl2-dev
-            - name: Configure
+      - name: Configure
-              run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+        run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-            - name: Build
+      - name: Build
-              run: |
+        run: |
-                make
+          make
-                ctest -L gh --output-on-failure
+          ctest -L gh --output-on-failure
-    ubuntu-latest-clang:
+  ubuntu-latest-clang:
-        runs-on: ubuntu-latest
+    runs-on: ubuntu-latest
-        strategy:
+    strategy:
-            matrix:
+      matrix:
-                build: [Debug, Release]
+        build: [Debug, Release]
-        steps:
+    steps:
-            - name: Clone
+      - name: Clone
-              uses: actions/checkout@v1
+        uses: actions/checkout@v1
-            - name: Dependencies
+      - name: Dependencies
-              run: |
+        run: |
-                  sudo apt-get update
+          sudo apt-get update
-                  sudo apt-get install build-essential
+          sudo apt-get install build-essential
-                  sudo apt-get install cmake
+          sudo apt-get install cmake
-                  sudo apt-get install libsdl2-dev
+          sudo apt-get install libsdl2-dev
-            - name: Configure
+      - name: Configure
-              run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
+        run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
-            - name: Build
+      - name: Build
-              run: |
+        run: |
-                make
+          make
-                ctest -L gh --output-on-failure
+          ctest -L gh --output-on-failure
-    ubuntu-latest-gcc-sanitized:
+  ubuntu-latest-gcc-sanitized:
-        runs-on: ubuntu-latest
+    runs-on: ubuntu-latest
-        strategy:
+    strategy:
-            matrix:
+      matrix:
-                sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        steps:
+    steps:
-            - name: Clone
+      - name: Clone
-              uses: actions/checkout@v1
+        uses: actions/checkout@v1
-            - name: Dependencies
+      - name: Dependencies
-              run: |
+        run: |
-                  sudo apt-get update
+          sudo apt-get update
-                  sudo apt-get install build-essential
+          sudo apt-get install build-essential
-                  sudo apt-get install cmake
+          sudo apt-get install cmake
-            - name: Configure
+      - name: Configure
-              run: cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
+        run: cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
-            - name: Build
+      - name: Build
-              run: |
+        run: |
-                make
+          make
-                ctest -L gh --output-on-failure
+          ctest -L gh --output-on-failure
-    windows:
+  windows:
-        runs-on: windows-latest
+    runs-on: windows-latest
-        strategy:
+    strategy:
-            matrix:
+      matrix:
-                build: [Release]
+        build: [Release]
-                arch: [Win32, x64]
+        arch: [Win32, x64]
-                sdl2: [ON]
+        sdl2: [ON]
-                include:
+        include:
-                  - arch: Win32
+          - arch: Win32
-                    s2arc: x86
+            s2arc: x86
-                  - arch: x64
+          - arch: x64
-                    s2arc: x64
+            s2arc: x64
-                  - sdl2: ON
+          - sdl2: ON
-                    s2ver: 2.26.0
+            s2ver: 2.26.0
-        steps:
+    steps:
-            - name: Clone
+      - name: Clone
-              uses: actions/checkout@v1
+        uses: actions/checkout@v1
-            - name: Add msbuild to PATH
+      - name: Add msbuild to PATH
-              uses: microsoft/setup-msbuild@v1
+        uses: microsoft/setup-msbuild@v1
-            - name: Fetch SDL2 and set SDL2_DIR
+      - name: Fetch SDL2 and set SDL2_DIR
-              if: matrix.sdl2 == 'ON'
+        if: matrix.sdl2 == 'ON'
-              run: |
+        run: |
-                C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
+          C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
-                7z x sdl2.zip
+          7z x sdl2.zip
-                echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
+          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
-            - name: Configure
+      - name: Configure
-              run: >
+        run: >
-                cmake -S . -B ./build -A ${{ matrix.arch }}
+          cmake -S . -B ./build -A ${{ matrix.arch }}
-                -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-                -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
+          -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
-            - name: Build
+      - name: Build
-              run: |
+        run: |
-                cd ./build
+          cd ./build
-                msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
+          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
-            - name: Copy SDL2.dll
+      - name: Copy SDL2.dll
-              if: matrix.sdl2 == 'ON'
+        if: matrix.sdl2 == 'ON'
-              run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
+        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
-            - name: Upload binaries
+      - name: Upload binaries
-              if: matrix.sdl2 == 'ON'
+        if: matrix.sdl2 == 'ON'
-              uses: actions/upload-artifact@v1
+        uses: actions/upload-artifact@v1
-              with:
+        with:
-                name: whisper-bin-${{ matrix.arch }}
+          name: whisper-bin-${{ matrix.arch }}
-                path: build/bin/${{ matrix.build }}
+          path: build/bin/${{ matrix.build }}
-    windows-blas:
+  windows-blas:
-        runs-on: windows-latest
+    runs-on: windows-latest
-        strategy:
+    strategy:
-            matrix:
+      matrix:
-                build: [Release]
+        build: [Release]
-                arch: [Win32, x64]
+        arch: [Win32, x64]
-                blas: [ON]
+        blas: [ON]
-                sdl2: [ON]
+        sdl2: [ON]
-                include:
+        include:
-                  - arch: Win32
+          - arch: Win32
-                    obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
+            obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
-                    s2arc: x86
+            s2arc: x86
-                  - arch: x64
+          - arch: x64
-                    obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
+            obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
-                    s2arc: x64
+            s2arc: x64
-                  - sdl2: ON
+          - sdl2: ON
-                    s2ver: 2.26.0
+            s2ver: 2.26.0
-        steps:
+    steps:
-            - name: Clone
+      - name: Clone
-              uses: actions/checkout@v1
+        uses: actions/checkout@v1
-            - name: Add msbuild to PATH
+      - name: Add msbuild to PATH
-              uses: microsoft/setup-msbuild@v1
+        uses: microsoft/setup-msbuild@v1
-            - name: Fetch OpenBLAS
+      - name: Fetch OpenBLAS
-              if: matrix.blas == 'ON'
+        if: matrix.blas == 'ON'
-              run: |
+        run: |
-                C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
+          C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
-                7z x blas.zip -oblas -y
+          7z x blas.zip -oblas -y
-                copy blas/include/cblas.h .
+          copy blas/include/cblas.h .
-                copy blas/include/openblas_config.h .
+          copy blas/include/openblas_config.h .
-                echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
+          echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
-            - name: Fetch SDL2 and set SDL2_DIR
+      - name: Fetch SDL2 and set SDL2_DIR
-              if: matrix.sdl2 == 'ON'
+        if: matrix.sdl2 == 'ON'
-              run: |
+        run: |
-                C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
+          C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
-                7z x sdl2.zip
+          7z x sdl2.zip
-                echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
+          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
-            - name: Configure
+      - name: Configure
-              run: >
+        run: >
-                cmake -S . -B ./build -A ${{ matrix.arch }}
+          cmake -S . -B ./build -A ${{ matrix.arch }}
-                -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-                -DWHISPER_SUPPORT_OPENBLAS=${{ matrix.blas }}
+          -DWHISPER_SUPPORT_OPENBLAS=${{ matrix.blas }}
-                -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
+          -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
-                -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
+          -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
-            - name: Build
+      - name: Build
-              run: |
+        run: |
-                cd ./build
+          cd ./build
-                msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
+          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
-            - name: Copy libopenblas.dll
+      - name: Copy libopenblas.dll
-              if: matrix.blas == 'ON'
+        if: matrix.blas == 'ON'
-              run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
+        run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
-            - name: Copy SDL2.dll
+      - name: Copy SDL2.dll
-              if: matrix.sdl2 == 'ON'
+        if: matrix.sdl2 == 'ON'
-              run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
+        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
-            - name: Upload binaries
+      - name: Upload binaries
-              if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
+        if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
-              uses: actions/upload-artifact@v1
+        uses: actions/upload-artifact@v1
-              with:
+        with:
-                name: whisper-blas-bin-${{ matrix.arch }}
+          name: whisper-blas-bin-${{ matrix.arch }}
-                path: build/bin/${{ matrix.build }}
+          path: build/bin/${{ matrix.build }}
-    emscripten:
+  emscripten:
-        runs-on: ubuntu-latest
+    runs-on: ubuntu-latest
-        strategy:
+    strategy:
-            matrix:
+      matrix:
-                build: [Release]
+        build: [Release]
-        steps:
+    steps:
-            - name: Clone
+      - name: Clone
-              uses: actions/checkout@v1
+        uses: actions/checkout@v1
-            - name: Dependencies
+      - name: Dependencies
-              run: |
+        run: |
-                wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
+          wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
-                tar -xvf master.tar.gz
+          tar -xvf master.tar.gz
-                emsdk-master/emsdk update
+          emsdk-master/emsdk update
-                emsdk-master/emsdk install latest
+          emsdk-master/emsdk install latest
-                emsdk-master/emsdk activate latest
+          emsdk-master/emsdk activate latest
-            - name: Configure
+      - name: Configure
-              run: echo "tmp"
+        run: echo "tmp"
-            - name: Build
+      - name: Build
-              run: |
+        run: |
-                pushd emsdk-master
+          pushd emsdk-master
-                source ./emsdk_env.sh
+          source ./emsdk_env.sh
-                popd
+          popd
-                emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+          emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-                make
+          make
--- a/.github/workflows/examples.yml
+++ b/.github/workflows/examples.yml
@ -0,0 +1,48 @@
 name: Examples Tests
 on:
  push:
    paths:
      - examples/addon.node/**
      - whisper.h
  pull_request:
    paths:
      - examples/addon.node/**
      - whisper.h
 jobs:
  addon_node-ubuntu-latest:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        node-version: [ 16.x, 18.x ]
    steps:
      - name: Clone
        uses: actions/checkout@v1
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential
          sudo apt-get install cmake
          sudo apt-get install libsdl2-dev
      - name: Use Node.js ${{ matrix.node-version }}
        uses: actions/setup-node@v1
        with:
          node-version: ${{ matrix.node-version }}
          cache: 'npm'
      - name: Install package.json dependencies
        working-directory: ./examples/addon.node
        run: npm install
      - name: Compile addon.node
        run: npx cmake-js compile -T whisper-addon -B Release
      - name: Download test model
        run: |
          bash ./models/download-ggml-model.sh base.en
      - name: Test
        run: |
          cd examples/addon.node
          npm run test
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,5 @@
 *.o
 *.a
 .cache/
 .vs/
 .vscode/
@ -8,6 +9,7 @@ build/
 build-em/
 build-debug/
 build-release/
 build-static/
 build-sanitize-addr/
 build-sanitize-thread/
@ -17,7 +19,9 @@ build-sanitize-thread/
 /talk
 /bench
 arm_neon.h
 sync.sh
 libwhisper.a
 libwhisper.so
 compile_commands.json
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.0)
-project(whisper.cpp VERSION 1.0.4)
+project(whisper.cpp VERSION 1.2.0)
 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@ -226,10 +226,13 @@ target_compile_definitions(${TARGET} PUBLIC
    ${WHISPER_EXTRA_FLAGS}
    )
 set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "whisper.h")
 install(TARGETS ${TARGET}
    LIBRARY DESTINATION lib
    ARCHIVE DESTINATION lib/static
    RUNTIME DESTINATION bin
    PUBLIC_HEADER DESTINATION include
    )
 #
@ -242,7 +245,7 @@ add_subdirectory(bindings)
 # programs, examples and tests
 #
-if (WHISPER_BUILD_TESTS)
+if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
    enable_testing()
    add_subdirectory(tests)
 endif ()
--- a/Makefile
+++ b/Makefile
@ -115,11 +115,15 @@ endif
 ifeq ($(UNAME_M),amd64)
 	CFLAGS += -mavx -mavx2 -mfma -mf16c
 endif
-ifeq ($(UNAME_M),ppc64le)
+ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
 		CFLAGS += -mpower9-vector
 	endif
 	# Require c++23's std::byteswap for big-endian support.
 	ifeq ($(UNAME_M),ppc64)
 		CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
 	endif
 endif
 ifndef WHISPER_NO_ACCELERATE
 	# Mac M1 - include Accelerate framework
@ -133,8 +137,8 @@ ifdef WHISPER_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 ifdef WHISPER_GPROF
-	CFLAGS  += -pg
+	CFLAGS   += -pg
-	CXXFLAGS  += -pg
+	CXXFLAGS += -pg
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 endif
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
-[Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.2.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@ -13,7 +13,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - AVX intrinsics support for x86 architectures
 - VSX intrinsics support for POWER architectures
 - Mixed F16 / F32 precision
-- Low memory usage (Flash Attention + Flash Forward)
+- Low memory usage (Flash Attention)
 - Zero memory allocations at runtime
 - Runs on the CPU
 - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
@ -71,7 +71,7 @@ Now build the [main](examples/main) example and transcribe an audio file like th
 make
 # transcribe an audio file
-./main -f input.wav
+./main -f samples/jfk.wav
 ```
 ---
@ -89,27 +89,38 @@ c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o gg
 usage: ./main [options] file0.wav file1.wav ...
 options:
-  -h,       --help          [default] show this help message and exit
+  -h,        --help              [default] show this help message and exit
-  -t N,     --threads N     [4      ] number of threads to use during computation
+  -t N,      --threads N         [4      ] number of threads to use during computation
-  -p N,     --processors N  [1      ] number of processors to use during computation
+  -p N,      --processors N      [1      ] number of processors to use during computation
-  -ot N,    --offset-t N    [0      ] time offset in milliseconds
+  -ot N,     --offset-t N        [0      ] time offset in milliseconds
-  -on N,    --offset-n N    [0      ] segment index offset
+  -on N,     --offset-n N        [0      ] segment index offset
-  -d  N,    --duration N    [0      ] duration of audio to process in milliseconds
+  -d  N,     --duration N        [0      ] duration of audio to process in milliseconds
-  -mc N,    --max-context N [-1     ] maximum number of text context tokens to store
+  -mc N,     --max-context N     [-1     ] maximum number of text context tokens to store
-  -ml N,    --max-len N     [0      ] maximum segment length in characters
+  -ml N,     --max-len N         [0      ] maximum segment length in characters
-  -wt N,    --word-thold N  [0.01   ] word timestamp probability threshold
+  -bo N,     --best-of N         [5      ] number of best candidates to keep
-  -su,      --speed-up      [false  ] speed up audio by x2 (reduced accuracy)
+  -bs N,     --beam-size N       [-1     ] beam size for beam search
-  -tr,      --translate     [false  ] translate from source language to english
+  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
-  -otxt,    --output-txt    [false  ] output result in a text file
+  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
-  -ovtt,    --output-vtt    [false  ] output result in a vtt file
+  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
-  -osrt,    --output-srt    [false  ] output result in a srt file
+  -su,       --speed-up          [false  ] speed up audio by x2 (reduced accuracy)
-  -owts,    --output-words  [false  ] output script for generating karaoke video
+  -tr,       --translate         [false  ] translate from source language to english
-  -ps,      --print-special [false  ] print special tokens
+  -di,       --diarize           [false  ] stereo audio diarization
-  -pc,      --print-colors  [false  ] print colors
+  -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
-  -nt,      --no-timestamps [true   ] do not print timestamps
+  -otxt,     --output-txt        [false  ] output result in a text file
-  -l LANG,  --language LANG [en     ] spoken language
+  -ovtt,     --output-vtt        [false  ] output result in a vtt file
-  -m FNAME, --model FNAME   [models/ggml-base.en.bin] model path
+  -osrt,     --output-srt        [false  ] output result in a srt file
-  -f FNAME, --file FNAME    [       ] input WAV file path
+  -owts,     --output-words      [false  ] output script for generating karaoke video
  -ocsv,     --output-csv        [false  ] output result in a CSV file
  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
  -ps,       --print-special     [false  ] print special tokens
  -pc,       --print-colors      [false  ] print colors
  -pp,       --print-progress    [false  ] print progress
  -nt,       --no-timestamps     [true   ] do not print timestamps
  -l LANG,   --language LANG     [en     ] spoken language ('auto' for auto-detect)
             --prompt PROMPT     [       ] initial prompt
  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
  -f FNAME,  --file FNAME        [       ] input WAV file path
 bash ./models/download-ggml-model.sh base.en
 Downloading ggml model base.en ...
@ -128,7 +139,8 @@ Running base.en on all samples in ./samples ...
 [+] Running base.en on samples/jfk.wav ... (run 'ffplay samples/jfk.wav' to listen)
 ----------------------------------------------
-whisper_model_load: loading model from 'models/ggml-base.en.bin'
+whisper_init_from_file: loading model from 'models/ggml-base.en.bin'
 whisper_model_load: loading model
 whisper_model_load: n_vocab       = 51864
 whisper_model_load: n_audio_ctx   = 1500
 whisper_model_load: n_audio_state = 512
@ -141,13 +153,14 @@ whisper_model_load: n_text_layer  = 6
 whisper_model_load: n_mels        = 80
 whisper_model_load: f16           = 1
 whisper_model_load: type          = 2
 whisper_model_load: mem required  =  215.00 MB (+    6.00 MB per decoder)
 whisper_model_load: kv self size  =    5.25 MB
 whisper_model_load: kv cross size =   17.58 MB
 whisper_model_load: adding 1607 extra tokens
-whisper_model_load: mem_required  =  506.00 MB
+whisper_model_load: model ctx     =  140.60 MB
 whisper_model_load: ggml ctx size =  140.60 MB
 whisper_model_load: memory size   =   22.83 MB
 whisper_model_load: model size    =  140.54 MB
-system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
+system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
 main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
@ -155,12 +168,13 @@ main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 proc
 [00:00:00.000 --> 00:00:11.000]   And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
-whisper_print_timings:     load time =   105.91 ms
+whisper_print_timings:     fallbacks =   0 p /   0 h
-whisper_print_timings:      mel time =    24.62 ms
+whisper_print_timings:     load time =   113.81 ms
-whisper_print_timings:   sample time =     3.63 ms
+whisper_print_timings:      mel time =    15.40 ms
-whisper_print_timings:   encode time =   324.71 ms / 54.12 ms per layer
+whisper_print_timings:   sample time =    11.58 ms /    27 runs (    0.43 ms per run)
-whisper_print_timings:   decode time =    83.58 ms / 13.93 ms per layer
+whisper_print_timings:   encode time =   266.60 ms /     1 runs (  266.60 ms per run)
-whisper_print_timings:    total time =   542.81 ms
+whisper_print_timings:   decode time =    66.11 ms /    27 runs (    2.45 ms per run)
 whisper_print_timings:    total time =   476.31 ms
 ```
 The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
@ -203,26 +217,16 @@ make large
 | Model  | Disk   | Mem     | SHA                                        |
 | ---    | ---    | ---     | ---                                        |
-| tiny   |  75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
+| tiny   |  75 MB | ~125 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
-| base   | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
+| base   | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
-| small  | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
+| small  | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
-| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
+| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
-| large  | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large  | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
 ## Limitations
 - Inference only
-- No GPU support
+- No GPU support (yet)
-- Very basic greedy sampling scheme - always pick up the token with highest probability.
-  This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
-  from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
-  to run the python code with the following parameters:
-  ```
-  whisper --best_of None --beam_size None ...
-  ```
-  In the future, `whisper.cpp` will support more sampling strategies.
 ## Another example
@ -235,7 +239,8 @@ in about half a minute on a MacBook M1 Pro, using `medium.en` model:
 ```java
 $ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8
-whisper_model_load: loading model from 'models/ggml-medium.en.bin'
+whisper_init_from_file: loading model from 'models/ggml-medium.en.bin'
 whisper_model_load: loading model
 whisper_model_load: n_vocab       = 51864
 whisper_model_load: n_audio_ctx   = 1500
 whisper_model_load: n_audio_state = 1024
@ -248,55 +253,60 @@ whisper_model_load: n_text_layer  = 24
 whisper_model_load: n_mels        = 80
 whisper_model_load: f16           = 1
 whisper_model_load: type          = 4
-whisper_model_load: mem_required  = 2610.00 MB
+whisper_model_load: mem required  = 1720.00 MB (+   43.00 MB per decoder)
 whisper_model_load: kv self size  =   42.00 MB
 whisper_model_load: kv cross size =  140.62 MB
 whisper_model_load: adding 1607 extra tokens
-whisper_model_load: ggml ctx size = 1644.97 MB
+whisper_model_load: model ctx     = 1462.35 MB
-whisper_model_load: memory size =   182.62 MB
+whisper_model_load: model size    = 1462.12 MB
 whisper_model_load: model size  =  1462.12 MB
-main: processing 'samples/gb1.wav' (3179750 samples, 198.7 sec), 8 threads, lang = en, task = transcribe, timestamps = 1 ...
+system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
-[00:00.000 --> 00:08.000]   My fellow Americans, this day has brought terrible news and great sadness to our country.
+main: processing 'samples/gb1.wav' (3179750 samples, 198.7 sec), 8 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
 [00:08.000 --> 00:17.000]   At nine o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
 [00:17.000 --> 00:23.000]   A short time later, debris was seen falling from the skies above Texas.
 [00:23.000 --> 00:29.000]   The Columbia's lost. There are no survivors.
 [00:29.000 --> 00:32.000]   On board was a crew of seven.
 [00:32.000 --> 00:39.000]   Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark,
 [00:39.000 --> 00:48.000]   Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon,
 [00:48.000 --> 00:52.000]   a colonel in the Israeli Air Force.
 [00:52.000 --> 00:58.000]   These men and women assumed great risk in the service to all humanity.
 [00:58.000 --> 01:03.000]   In an age when space flight has come to seem almost routine,
 [01:03.000 --> 01:07.000]   it is easy to overlook the dangers of travel by rocket
 [01:07.000 --> 01:12.000]   and the difficulties of navigating the fierce outer atmosphere of the Earth.
 [01:12.000 --> 01:18.000]   These astronauts knew the dangers, and they faced them willingly,
 [01:18.000 --> 01:23.000]   knowing they had a high and noble purpose in life.
 [01:23.000 --> 01:31.000]   Because of their courage and daring and idealism, we will miss them all the more.
 [01:31.000 --> 01:36.000]   All Americans today are thinking as well of the families of these men and women
 [01:36.000 --> 01:40.000]   who have been given this sudden shock and grief.
 [01:40.000 --> 01:45.000]   You're not alone. Our entire nation grieves with you,
 [01:45.000 --> 01:52.000]   and those you love will always have the respect and gratitude of this country.
 [01:52.000 --> 01:56.000]   The cause in which they died will continue.
 [01:56.000 --> 02:04.000]   Mankind is led into the darkness beyond our world by the inspiration of discovery
 [02:04.000 --> 02:11.000]   and the longing to understand. Our journey into space will go on.
 [02:11.000 --> 02:16.000]   In the skies today, we saw destruction and tragedy.
 [02:16.000 --> 02:22.000]   Yet farther than we can see, there is comfort and hope.
 [02:22.000 --> 02:29.000]   In the words of the prophet Isaiah, "Lift your eyes and look to the heavens
 [02:29.000 --> 02:35.000]   who created all these. He who brings out the starry hosts one by one
 [02:35.000 --> 02:39.000]   and calls them each by name."
 [02:39.000 --> 02:46.000]   Because of His great power and mighty strength, not one of them is missing.
 [02:46.000 --> 02:55.000]   The same Creator who names the stars also knows the names of the seven souls we mourn today.
 [02:55.000 --> 03:01.000]   The crew of the shuttle Columbia did not return safely to earth,
 [03:01.000 --> 03:05.000]   yet we can pray that all are safely home.
 [03:05.000 --> 03:13.000]   May God bless the grieving families, and may God continue to bless America.
 [03:13.000 --> 03:41.000]   Audio
-whisper_print_timings:     load time =   575.92 ms
+[00:00:00.000 --> 00:00:08.000]   My fellow Americans, this day has brought terrible news and great sadness to our country.
-whisper_print_timings:      mel time =   230.60 ms
+[00:00:08.000 --> 00:00:17.000]   At nine o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
-whisper_print_timings:   sample time =    73.19 ms
+[00:00:17.000 --> 00:00:23.000]   A short time later, debris was seen falling from the skies above Texas.
-whisper_print_timings:   encode time = 19552.61 ms / 814.69 ms per layer
+[00:00:23.000 --> 00:00:29.000]   The Columbia's lost. There are no survivors.
-whisper_print_timings:   decode time = 13249.96 ms / 552.08 ms per layer
+[00:00:29.000 --> 00:00:32.000]   On board was a crew of seven.
-whisper_print_timings:    total time = 33686.27 ms
+[00:00:32.000 --> 00:00:39.000]   Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark,
 [00:00:39.000 --> 00:00:48.000]   Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon,
 [00:00:48.000 --> 00:00:52.000]   a colonel in the Israeli Air Force.
 [00:00:52.000 --> 00:00:58.000]   These men and women assumed great risk in the service to all humanity.
 [00:00:58.000 --> 00:01:03.000]   In an age when space flight has come to seem almost routine,
 [00:01:03.000 --> 00:01:07.000]   it is easy to overlook the dangers of travel by rocket
 [00:01:07.000 --> 00:01:12.000]   and the difficulties of navigating the fierce outer atmosphere of the Earth.
 [00:01:12.000 --> 00:01:18.000]   These astronauts knew the dangers, and they faced them willingly,
 [00:01:18.000 --> 00:01:23.000]   knowing they had a high and noble purpose in life.
 [00:01:23.000 --> 00:01:31.000]   Because of their courage and daring and idealism, we will miss them all the more.
 [00:01:31.000 --> 00:01:36.000]   All Americans today are thinking as well of the families of these men and women
 [00:01:36.000 --> 00:01:40.000]   who have been given this sudden shock and grief.
 [00:01:40.000 --> 00:01:45.000]   You're not alone. Our entire nation grieves with you,
 [00:01:45.000 --> 00:01:52.000]   and those you love will always have the respect and gratitude of this country.
 [00:01:52.000 --> 00:01:56.000]   The cause in which they died will continue.
 [00:01:56.000 --> 00:02:04.000]   Mankind is led into the darkness beyond our world by the inspiration of discovery
 [00:02:04.000 --> 00:02:11.000]   and the longing to understand. Our journey into space will go on.
 [00:02:11.000 --> 00:02:16.000]   In the skies today, we saw destruction and tragedy.
 [00:02:16.000 --> 00:02:22.000]   Yet farther than we can see, there is comfort and hope.
 [00:02:22.000 --> 00:02:29.000]   In the words of the prophet Isaiah, "Lift your eyes and look to the heavens
 [00:02:29.000 --> 00:02:35.000]   who created all these. He who brings out the starry hosts one by one
 [00:02:35.000 --> 00:02:39.000]   and calls them each by name."
 [00:02:39.000 --> 00:02:46.000]   Because of His great power and mighty strength, not one of them is missing.
 [00:02:46.000 --> 00:02:55.000]   The same Creator who names the stars also knows the names of the seven souls we mourn today.
 [00:02:55.000 --> 00:03:01.000]   The crew of the shuttle Columbia did not return safely to earth,
 [00:03:01.000 --> 00:03:05.000]   yet we can pray that all are safely home.
 [00:03:05.000 --> 00:03:13.000]   May God bless the grieving families, and may God continue to bless America.
 [00:03:13.000 --> 00:03:19.000]   [Silence]
 whisper_print_timings:     fallbacks =   1 p /   0 h
 whisper_print_timings:     load time =   569.03 ms
 whisper_print_timings:      mel time =   146.85 ms
 whisper_print_timings:   sample time =   238.66 ms /   553 runs (    0.43 ms per run)
 whisper_print_timings:   encode time = 18665.10 ms /     9 runs ( 2073.90 ms per run)
 whisper_print_timings:   decode time = 13090.93 ms /   549 runs (   23.85 ms per run)
 whisper_print_timings:    total time = 32733.52 ms
 ```
 </details>
@ -307,6 +317,7 @@ The [stream](examples/stream) tool samples the audio every half a second and run
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
 ```java
 make stream
 ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
 ```
@ -321,14 +332,14 @@ to highlight words with high or low confidence:
 ## Controlling the length of the generated text segments (experimental)
-For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`: 
+For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
 ```java
 ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
 whisper_model_load: loading model from './models/ggml-base.en.bin'
 ...
-system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | 
+system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
 main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
@ -352,7 +363,7 @@ The `--max-len` argument can be used to obtain word-level timestamps. Simply use
 whisper_model_load: loading model from './models/ggml-base.en.bin'
 ...
-system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | 
+system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
 main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
@ -454,6 +465,9 @@ in [models](models).
 - [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
 - [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
 - [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
 - [X] .NET:
  - [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
  - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
 - [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)
 ## Examples
--- a/bindings/go/examples/go-whisper/color.go
+++ b/bindings/go/examples/go-whisper/color.go
@ -0,0 +1,22 @@
 package main
 import "fmt"
 ///////////////////////////////////////////////////////////////////////////////
 // CONSTANTS
 // ANSI escape sequence fragments that Colorize concatenates around the text.
 const (
 	Reset     = "\033[0m"
 	RGBPrefix = "\033[38;5;" // 8-bit foreground color prefix; followed by a single palette index in decimal
 	RGBSuffix = "m"
 )
 ///////////////////////////////////////////////////////////////////////////////
 // PUBLIC METHODS
 // Colorize wraps text in an 8-bit ANSI foreground color sequence and appends
 // Reset. v selects one of 24 grayscale shades; it is reduced with v%24 before
 // use, so 24 produces the same shade as 0.
 func Colorize(text string, v int) string {
 	// https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit
 	// The grayscale ramp occupies palette indices 232-255
 	shade := v%24 + 232
 	return fmt.Sprintf("%s%d%s%s%s", RGBPrefix, shade, RGBSuffix, text, Reset)
 }
--- a/bindings/go/examples/go-whisper/flags.go
+++ b/bindings/go/examples/go-whisper/flags.go
@ -2,6 +2,12 @@ package main
 import (
 	"flag"
 	"fmt"
 	"strings"
 	"time"
 	// Packages
 	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 )
 ///////////////////////////////////////////////////////////////////////////////
@ -42,6 +48,26 @@ func (flags *Flags) GetLanguage() string {
 	return flags.Lookup("language").Value.String()
 }
 // IsTranslate returns the boolean value of the "translate" flag.
 func (flags *Flags) IsTranslate() bool {
 	getter := flags.Lookup("translate").Value.(flag.Getter)
 	return getter.Get().(bool)
 }
 // GetOffset returns the value of the "offset" flag as a time.Duration.
 func (flags *Flags) GetOffset() time.Duration {
 	getter := flags.Lookup("offset").Value.(flag.Getter)
 	return getter.Get().(time.Duration)
 }
 // GetDuration returns the value of the "duration" flag as a time.Duration.
 func (flags *Flags) GetDuration() time.Duration {
 	getter := flags.Lookup("duration").Value.(flag.Getter)
 	return getter.Get().(time.Duration)
 }
 // GetThreads returns the value of the "threads" flag.
 func (flags *Flags) GetThreads() uint {
 	getter := flags.Lookup("threads").Value.(flag.Getter)
 	return getter.Get().(uint)
 }
 // GetOut returns the value of the "out" flag converted to lower case.
 func (flags *Flags) GetOut() string {
 	raw := flags.Lookup("out").Value.String()
 	return strings.ToLower(raw)
 }
 // IsSpeedup reports whether the "speedup" flag's string form is "true".
 func (flags *Flags) IsSpeedup() bool {
 	value := flags.Lookup("speedup").Value
 	return value.String() == "true"
 }
@ -50,12 +76,81 @@ func (flags *Flags) IsTokens() bool {
 	return flags.Lookup("tokens").Value.String() == "true"
 }
 // IsColorize reports whether the "colorize" flag's string form is "true".
 func (flags *Flags) IsColorize() bool {
 	value := flags.Lookup("colorize").Value
 	return value.String() == "true"
 }
 // GetMaxLen returns the value of the "max-len" flag.
 func (flags *Flags) GetMaxLen() uint {
 	getter := flags.Lookup("max-len").Value.(flag.Getter)
 	return getter.Get().(uint)
 }
 // GetMaxTokens returns the value of the "max-tokens" flag.
 func (flags *Flags) GetMaxTokens() uint {
 	getter := flags.Lookup("max-tokens").Value.(flag.Getter)
 	return getter.Get().(uint)
 }
 // GetWordThreshold returns the float64 value of the "word-thold" flag
 // narrowed to float32.
 func (flags *Flags) GetWordThreshold() float32 {
 	threshold := flags.Lookup("word-thold").Value.(flag.Getter).Get().(float64)
 	return float32(threshold)
 }
 // SetParams applies the parsed flag values to context. Each option is only
 // applied when it differs from its zero value, and a line describing the
 // change is written to flags.Output() first. The only error returned is the
 // one reported by context.SetLanguage.
 func (flags *Flags) SetParams(context whisper.Context) error {
 	// An empty language or "auto" leaves the context's language untouched
 	switch language := flags.GetLanguage(); language {
 	case "", "auto":
 	default:
 		fmt.Fprintf(flags.Output(), "Setting language to %q\n", language)
 		if err := context.SetLanguage(language); err != nil {
 			return err
 		}
 	}
 	// Translation is only requested when the context reports a multilingual model
 	if flags.IsTranslate() && context.IsMultilingual() {
 		fmt.Fprintln(flags.Output(), "Setting translate to true")
 		context.SetTranslate(true)
 	}
 	offset := flags.GetOffset()
 	if offset != 0 {
 		fmt.Fprintf(flags.Output(), "Setting offset to %v\n", offset)
 		context.SetOffset(offset)
 	}
 	duration := flags.GetDuration()
 	if duration != 0 {
 		fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
 		context.SetDuration(duration)
 	}
 	if flags.IsSpeedup() {
 		fmt.Fprintln(flags.Output(), "Setting speedup to true")
 		context.SetSpeedup(true)
 	}
 	threads := flags.GetThreads()
 	if threads != 0 {
 		fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
 		context.SetThreads(threads)
 	}
 	maxLen := flags.GetMaxLen()
 	if maxLen != 0 {
 		fmt.Fprintf(flags.Output(), "Setting max_segment_length to %d\n", maxLen)
 		context.SetMaxSegmentLength(maxLen)
 	}
 	maxTokens := flags.GetMaxTokens()
 	if maxTokens != 0 {
 		fmt.Fprintf(flags.Output(), "Setting max_tokens to %d\n", maxTokens)
 		context.SetMaxTokensPerSegment(maxTokens)
 	}
 	// The word threshold flag is forwarded as the token threshold
 	wordThreshold := flags.GetWordThreshold()
 	if wordThreshold != 0 {
 		fmt.Fprintf(flags.Output(), "Setting word_threshold to %f\n", wordThreshold)
 		context.SetTokenThreshold(wordThreshold)
 	}
 	// Return success
 	return nil
 }
 ///////////////////////////////////////////////////////////////////////////////
 // PRIVATE METHODS
 func registerFlags(flag *Flags) {
 	flag.String("model", "", "Path to the model file")
-	flag.String("language", "", "Language")
+	flag.String("language", "", "Spoken language")
 	flag.Bool("translate", false, "Translate from source language to english")
 	flag.Duration("offset", 0, "Time offset")
 	flag.Duration("duration", 0, "Duration of audio to process")
 	flag.Uint("threads", 0, "Number of threads to use")
 	flag.Bool("speedup", false, "Enable speedup")
 	flag.Uint("max-len", 0, "Maximum segment length in characters")
 	flag.Uint("max-tokens", 0, "Maximum tokens per segment")
 	flag.Float64("word-thold", 0, "Maximum segment score")
 	flag.Bool("tokens", false, "Display tokens")
 	flag.Bool("colorize", false, "Colorize tokens")
 	flag.String("out", "", "Output format (srt, none or leave as empty string)")
 }
--- a/bindings/go/examples/go-whisper/main.go
+++ b/bindings/go/examples/go-whisper/main.go
@ -35,8 +35,7 @@ func main() {
 	// Process files
 	for _, filename := range flags.Args() {
-		fmt.Println("Processing", filename)
+		if err := Process(model, filename, flags); err != nil {
 		if err := Process(model, filename, flags.GetLanguage(), flags.IsSpeedup(), flags.IsTokens()); err != nil {
 			fmt.Fprintln(os.Stderr, err)
 			continue
 		}
--- a/bindings/go/examples/go-whisper/process.go
+++ b/bindings/go/examples/go-whisper/process.go
@ -11,7 +11,7 @@ import (
 	wav "github.com/go-audio/wav"
 )
-func Process(model whisper.Model, path string, lang string, speedup, tokens bool) error {
+func Process(model whisper.Model, path string, flags *Flags) error {
 	var data []float32
 	// Create processing context
@ -20,14 +20,22 @@ func Process(model whisper.Model, path string, lang string, speedup, tokens bool
 		return err
 	}
 	// Set the parameters
 	if err := flags.SetParams(context); err != nil {
 		return err
 	}
 	fmt.Printf("\n%s\n", context.SystemInfo())
 	// Open the file
 	fmt.Fprintf(flags.Output(), "Loading %q\n", path)
 	fh, err := os.Open(path)
 	if err != nil {
 		return err
 	}
 	defer fh.Close()
-	// Decode the WAV file
+	// Decode the WAV file - load the full buffer
 	dec := wav.NewDecoder(fh)
 	if buf, err := dec.FullPCMBuffer(); err != nil {
 		return err
@ -39,42 +47,86 @@ func Process(model whisper.Model, path string, lang string, speedup, tokens bool
 		data = buf.AsFloat32Buffer().Data
 	}
-	// Set the parameters
+	// Segment callback when -tokens is specified
 	var cb whisper.SegmentCallback
-	if lang != "" {
+	if flags.IsTokens() {
 		if err := context.SetLanguage(lang); err != nil {
 			return err
 		}
 	}
 	if speedup {
 		context.SetSpeedup(true)
 	}
 	if tokens {
 		cb = func(segment whisper.Segment) {
-			fmt.Printf("%02d [%6s->%6s] ", segment.Num, segment.Start.Truncate(time.Millisecond), segment.End.Truncate(time.Millisecond))
+			fmt.Fprintf(flags.Output(), "%02d [%6s->%6s] ", segment.Num, segment.Start.Truncate(time.Millisecond), segment.End.Truncate(time.Millisecond))
 			for _, token := range segment.Tokens {
-				fmt.Printf("%q ", token.Text)
+				if flags.IsColorize() && context.IsText(token) {
 					fmt.Fprint(flags.Output(), Colorize(token.Text, int(token.P*24.0)), " ")
 				} else {
 					fmt.Fprint(flags.Output(), token.Text, " ")
 				}
 			}
-			fmt.Println("")
+			fmt.Fprintln(flags.Output(), "")
 			fmt.Fprintln(flags.Output(), "")
 		}
 	}
 	// Process the data
 	fmt.Fprintf(flags.Output(), "  ...processing %q\n", path)
 	context.ResetTimings()
 	if err := context.Process(data, cb); err != nil {
 		return err
 	}
 	context.PrintTimings()
 	// Print out the results
 	switch {
 	case flags.GetOut() == "srt":
 		return OutputSRT(os.Stdout, context)
 	case flags.GetOut() == "none":
 		return nil
 	default:
 		return Output(os.Stdout, context, flags.IsColorize())
 	}
 }
 // Output text as SRT file
 func OutputSRT(w io.Writer, context whisper.Context) error {
 	n := 1
 	for {
 		segment, err := context.NextSegment()
 		if err == io.EOF {
-			break
+			return nil
 		} else if err != nil {
 			return err
 		}
-		fmt.Printf("[%6s->%6s] %s\n", segment.Start.Truncate(time.Millisecond), segment.End.Truncate(time.Millisecond), segment.Text)
+		fmt.Fprintln(w, n)
 		fmt.Fprintln(w, srtTimestamp(segment.Start), " --> ", srtTimestamp(segment.End))
 		fmt.Fprintln(w, segment.Text)
 		fmt.Fprintln(w, "")
 		n++
 	}
-
+}
-	// Return success
+
-	return nil
+// Output text to terminal
 // Output writes every remaining segment of context to w, one line per
 // segment, prefixed with its start and end times truncated to milliseconds.
 // With colorize set, the text tokens are written individually through
 // Colorize using the token probability; otherwise the segment text is
 // written as-is. It returns nil once NextSegment reports io.EOF, or the
 // first other error encountered.
 func Output(w io.Writer, context whisper.Context, colorize bool) error {
 	for {
 		seg, err := context.NextSegment()
 		switch {
 		case err == io.EOF:
 			return nil
 		case err != nil:
 			return err
 		}
 		start := seg.Start.Truncate(time.Millisecond)
 		end := seg.End.Truncate(time.Millisecond)
 		fmt.Fprintf(w, "[%6s->%6s]", start, end)
 		if !colorize {
 			fmt.Fprintln(w, " ", seg.Text)
 			continue
 		}
 		// Only text tokens are printed; everything else is skipped
 		for _, tok := range seg.Tokens {
 			if context.IsText(tok) {
 				fmt.Fprint(w, " ", Colorize(tok.Text, int(tok.P*24.0)))
 			}
 		}
 		fmt.Fprint(w, "\n")
 	}
 }
 // Return srtTimestamp
 func srtTimestamp(t time.Duration) string {
 	return fmt.Sprintf("%02d:%02d:%02d,%03d", t/time.Hour, (t%time.Hour)/time.Minute, (t%time.Minute)/time.Second, (t%time.Second)/time.Millisecond)
 }
--- a/bindings/go/params.go
+++ b/bindings/go/params.go
@ -47,7 +47,12 @@ func (p *Params) SetSpeedup(v bool) {
 	p.speed_up = toBool(v)
 }
 // Set language id
 func (p *Params) SetLanguage(lang int) error {
 	if lang == -1 {
 		p.language = nil
 		return nil
 	}
 	str := C.whisper_lang_str(C.int(lang))
 	if str == nil {
 		return ErrInvalidLanguage
@ -57,6 +62,7 @@ func (p *Params) SetLanguage(lang int) error {
 	return nil
 }
 // Get language id
 func (p *Params) Language() int {
 	if p.language == nil {
 		return -1
@ -64,18 +70,46 @@ func (p *Params) Language() int {
 	return int(C.whisper_lang_id(p.language))
 }
 // Threads returns the thread count currently stored in the params (n_threads)
 func (p *Params) Threads() int {
 	return int(p.n_threads)
 }
 // SetThreads sets the number of threads to use (stored in n_threads)
 func (p *Params) SetThreads(threads int) {
 	p.n_threads = C.int(threads)
 }
 // SetOffset sets the start offset in milliseconds (stored in offset_ms)
 func (p *Params) SetOffset(offset_ms int) {
 	p.offset_ms = C.int(offset_ms)
 }
 // SetDuration sets the audio duration to process in milliseconds (stored in duration_ms)
 func (p *Params) SetDuration(duration_ms int) {
 	p.duration_ms = C.int(duration_ms)
 }
 // SetTokenThreshold sets the timestamp token probability threshold (~0.01),
 // stored in thold_pt
 func (p *Params) SetTokenThreshold(t float32) {
 	p.thold_pt = C.float(t)
 }
 // SetTokenSumThreshold sets the timestamp token sum probability threshold
 // (~0.01), stored in thold_ptsum
 func (p *Params) SetTokenSumThreshold(t float32) {
 	p.thold_ptsum = C.float(t)
 }
 // SetMaxSegmentLength sets the max segment length in characters (stored in max_len)
 func (p *Params) SetMaxSegmentLength(n int) {
 	p.max_len = C.int(n)
 }
 // SetMaxTokensPerSegment sets the max tokens per segment (0 = no limit),
 // stored in max_tokens
 func (p *Params) SetMaxTokensPerSegment(n int) {
 	p.max_tokens = C.int(n)
 }
 ///////////////////////////////////////////////////////////////////////////////
 // PRIVATE METHODS
--- a/bindings/go/pkg/whisper/consts.go
+++ b/bindings/go/pkg/whisper/consts.go
@ -11,10 +11,11 @@ import (
 // ERRORS
 var (
-	ErrUnableToLoadModel   = errors.New("unable to load model")
+	ErrUnableToLoadModel    = errors.New("unable to load model")
-	ErrInternalAppError    = errors.New("internal application error")
+	ErrInternalAppError     = errors.New("internal application error")
-	ErrProcessingFailed    = errors.New("processing failed")
+	ErrProcessingFailed     = errors.New("processing failed")
-	ErrUnsupportedLanguage = errors.New("unsupported language")
+	ErrUnsupportedLanguage  = errors.New("unsupported language")
 	ErrModelNotMultilingual = errors.New("model is not multilingual")
 )
 ///////////////////////////////////////////////////////////////////////////////
--- a/bindings/go/pkg/whisper/context.go
+++ b/bindings/go/pkg/whisper/context.go
@ -1,7 +1,9 @@
 package whisper
 import (
 	"fmt"
 	"io"
 	"runtime"
 	"strings"
 	"time"
@ -24,7 +26,7 @@ var _ Context = (*context)(nil)
 ///////////////////////////////////////////////////////////////////////////////
 // LIFECYCLE
-func NewContext(model *model, params whisper.Params) (Context, error) {
+func newContext(model *model, params whisper.Params) (Context, error) {
 	context := new(context)
 	context.model = model
 	context.params = params
@ -41,7 +43,13 @@ func (context *context) SetLanguage(lang string) error {
 	if context.model.ctx == nil {
 		return ErrInternalAppError
 	}
-	if id := context.model.ctx.Whisper_lang_id(lang); id < 0 {
+	if !context.model.IsMultilingual() {
 		return ErrModelNotMultilingual
 	}
 	if lang == "auto" {
 		context.params.SetLanguage(-1)
 	} else if id := context.model.ctx.Whisper_lang_id(lang); id < 0 {
 		return ErrUnsupportedLanguage
 	} else if err := context.params.SetLanguage(id); err != nil {
 		return err
@ -50,16 +58,94 @@ func (context *context) SetLanguage(lang string) error {
 	return nil
 }
 // IsMultilingual reports whether the context's model is multilingual, as
 // answered by the model itself
 func (context *context) IsMultilingual() bool {
 	return context.model.IsMultilingual()
 }
 // Language returns the language stored in the params as a string, or "auto"
 // when the stored language id is -1
 func (context *context) Language() string {
 	switch context.params.Language() {
 	case -1:
 		return "auto"
 	default:
 		return whisper.Whisper_lang_str(context.params.Language())
 	}
 }
 // SetTranslate forwards the translate flag to the context's params
 func (context *context) SetTranslate(v bool) {
 	context.params.SetTranslate(v)
 }
 // SetSpeedup forwards the speedup flag to the context's params
 func (context *context) SetSpeedup(v bool) {
 	context.params.SetSpeedup(v)
 }
 // SetThreads sets the number of threads to use, converting the unsigned
 // count to the int expected by the params
 func (context *context) SetThreads(v uint) {
 	threads := int(v)
 	context.params.SetThreads(threads)
 }
 // SetOffset sets the time offset; the duration is handed to the params in
 // whole milliseconds
 func (context *context) SetOffset(v time.Duration) {
 	offsetMs := int(v.Milliseconds())
 	context.params.SetOffset(offsetMs)
 }
 // SetDuration sets the duration of audio to process; the value is handed to
 // the params in whole milliseconds
 func (context *context) SetDuration(v time.Duration) {
 	// Must go through the params' SetDuration: routing this value through
 	// SetOffset would overwrite the start offset and leave the duration unset
 	context.params.SetDuration(int(v.Milliseconds()))
 }
 // SetTokenThreshold forwards the timestamp token probability threshold
 // (~0.01) to the context's params
 func (context *context) SetTokenThreshold(t float32) {
 	context.params.SetTokenThreshold(t)
 }
 // SetTokenSumThreshold forwards the timestamp token sum probability
 // threshold (~0.01) to the context's params
 func (context *context) SetTokenSumThreshold(t float32) {
 	context.params.SetTokenSumThreshold(t)
 }
 // SetMaxSegmentLength sets the max segment length in characters, converting
 // the unsigned value to the int expected by the params
 func (context *context) SetMaxSegmentLength(n uint) {
 	maxLen := int(n)
 	context.params.SetMaxSegmentLength(maxLen)
 }
 // SetMaxTokensPerSegment sets the max tokens per segment (0 = no limit),
 // converting the unsigned value to the int expected by the params
 func (context *context) SetMaxTokensPerSegment(n uint) {
 	maxTokens := int(n)
 	context.params.SetMaxTokensPerSegment(maxTokens)
 }
 // ResetTimings resets the model timings. Should be called before processing
 func (context *context) ResetTimings() {
 	context.model.ctx.Whisper_reset_timings()
 }
 // PrintTimings prints the model timings by calling Whisper_print_timings on
 // the model's underlying context.
 // NOTE(review): the output stream is chosen by the underlying library and is
 // not visible here - confirm whether it is stdout or stderr.
 func (context *context) PrintTimings() {
 	context.model.ctx.Whisper_print_timings()
 }
 // SystemInfo returns a single newline-terminated line containing the
 // configured thread count, the number of CPUs reported by the Go runtime and
 // the system information string provided by the whisper bindings
 func (context *context) SystemInfo() string {
 	threads := context.params.Threads()
 	cpus := runtime.NumCPU()
 	features := whisper.Whisper_print_system_info()
 	return fmt.Sprintf("system_info: n_threads = %d / %d | %s\n", threads, cpus, features)
 }
 // WhisperLangAutoDetect uses the mel data at offset_ms to try to auto-detect
 // the spoken language. Make sure to call whisper_pcm_to_mel() or
 // whisper_set_mel() first.
 // On success it returns the probabilities of all languages; on failure it
 // returns a nil slice together with the error.
 func (context *context) WhisperLangAutoDetect(offset_ms int, n_threads int) ([]float32, error) {
 	probs, err := context.model.ctx.Whisper_lang_auto_detect(offset_ms, n_threads)
 	if err == nil {
 		return probs, nil
 	}
 	return nil, err
 }
 // Process new sample data and return any errors
 func (context *context) Process(data []float32, cb SegmentCallback) error {
 	if context.model.ctx == nil {
@ -119,6 +205,65 @@ func (context *context) NextSegment() (Segment, error) {
 	return result, nil
 }
 // IsText reports whether t is a text token: it is not one of the BEG, SOT,
 // PREV, SOLM or NOT tokens and its id is below the EOT token id
 func (context *context) IsText(t Token) bool {
 	// Checks run in the same order as listed and stop at the first match
 	special := context.IsBEG(t) ||
 		context.IsSOT(t) ||
 		whisper.Token(t.Id) >= context.model.ctx.Whisper_token_eot() ||
 		context.IsPREV(t) ||
 		context.IsSOLM(t) ||
 		context.IsNOT(t)
 	return !special
 }
 // IsBEG reports whether t is the model's "begin" token
 func (context *context) IsBEG(t Token) bool {
 	id := whisper.Token(t.Id)
 	return id == context.model.ctx.Whisper_token_beg()
 }
 // IsSOT reports whether t is the model's "start of transcription" token
 func (context *context) IsSOT(t Token) bool {
 	id := whisper.Token(t.Id)
 	return id == context.model.ctx.Whisper_token_sot()
 }
 // IsEOT reports whether t is the model's "end of transcription" token
 func (context *context) IsEOT(t Token) bool {
 	id := whisper.Token(t.Id)
 	return id == context.model.ctx.Whisper_token_eot()
 }
 // IsPREV reports whether t is the model's "start of prev" token
 func (context *context) IsPREV(t Token) bool {
 	id := whisper.Token(t.Id)
 	return id == context.model.ctx.Whisper_token_prev()
 }
 // IsSOLM reports whether t is the model's "start of lm" token
 func (context *context) IsSOLM(t Token) bool {
 	id := whisper.Token(t.Id)
 	return id == context.model.ctx.Whisper_token_solm()
 }
 // IsNOT reports whether t is the model's "No timestamps" token
 func (context *context) IsNOT(t Token) bool {
 	id := whisper.Token(t.Id)
 	return id == context.model.ctx.Whisper_token_not()
 }
 // IsLANG reports whether t is the token associated with the language lang.
 // It returns false when the language lookup yields a negative id.
 func (context *context) IsLANG(t Token, lang string) bool {
 	id := context.model.ctx.Whisper_lang_id(lang)
 	if id < 0 {
 		return false
 	}
 	return whisper.Token(t.Id) == context.model.ctx.Whisper_token_lang(id)
 }
 ///////////////////////////////////////////////////////////////////////////////
 // PRIVATE METHODS
--- a/bindings/go/pkg/whisper/interface.go
+++ b/bindings/go/pkg/whisper/interface.go
@ -20,15 +20,28 @@ type Model interface {
 	// Return a new speech-to-text context.
 	NewContext() (Context, error)
 	// Return true if the model is multilingual.
 	IsMultilingual() bool
 	// Return all languages supported.
 	Languages() []string
 }
 // Context is the speech recognition context.
 type Context interface {
-	SetLanguage(string) error // Set the language to use for speech recognition.
+	SetLanguage(string) error // Set the language to use for speech recognition, use "auto" for auto detect language.
 	SetTranslate(bool)        // Set translate flag
 	IsMultilingual() bool     // Return true if the model is multilingual.
 	Language() string         // Get language
-	SetSpeedup(bool)          // Set speedup flag
+
 	SetOffset(time.Duration)      // Set offset
 	SetDuration(time.Duration)    // Set duration
 	SetThreads(uint)              // Set number of threads to use
 	SetSpeedup(bool)              // Set speedup flag
 	SetTokenThreshold(float32)    // Set timestamp token probability threshold
 	SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
 	SetMaxSegmentLength(uint)     // Set max segment length in characters
 	SetMaxTokensPerSegment(uint)  // Set max tokens per segment (0 = no limit)
 	// Process mono audio data and return any errors.
 	// If defined, newly generated segments are passed to the
@ -38,6 +51,21 @@ type Context interface {
 	// After process is called, return segments until the end of the stream
 	// is reached, when io.EOF is returned.
 	NextSegment() (Segment, error)
 	IsBEG(Token) bool          // Test for "begin" token
 	IsSOT(Token) bool          // Test for "start of transcription" token
 	IsEOT(Token) bool          // Test for "end of transcription" token
 	IsPREV(Token) bool         // Test for "start of prev" token
 	IsSOLM(Token) bool         // Test for "start of lm" token
 	IsNOT(Token) bool          // Test for "No timestamps" token
 	IsLANG(Token, string) bool // Test for token associated with a specific language
 	IsText(Token) bool         // Test for text token
 	// Timings
 	PrintTimings()
 	ResetTimings()
 	SystemInfo() string
 }
 // Segment is the text result of a speech recognition.
--- a/bindings/go/pkg/whisper/model.go
+++ b/bindings/go/pkg/whisper/model.go
@ -23,7 +23,7 @@ var _ Model = (*model)(nil)
 ///////////////////////////////////////////////////////////////////////////////
 // LIFECYCLE
-func New(path string) (*model, error) {
+func New(path string) (Model, error) {
 	model := new(model)
 	if _, err := os.Stat(path); err != nil {
 		return nil, err
@ -64,6 +64,11 @@ func (model *model) String() string {
 ///////////////////////////////////////////////////////////////////////////////
 // PUBLIC METHODS
 // IsMultilingual returns true if the model is multilingual (language and
 // translation options are supported), i.e. the underlying context reports a
 // non-zero multilingual value
 func (model *model) IsMultilingual() bool {
 	multilingual := model.ctx.Whisper_is_multilingual()
 	return multilingual != 0
 }
 // Return all recognized languages. Initially it is set to auto-detect
 func (model *model) Languages() []string {
 	result := make([]string, 0, whisper.Whisper_lang_max_id())
@ -91,5 +96,5 @@ func (model *model) NewContext() (Context, error) {
 	params.SetThreads(runtime.NumCPU())
 	// Return new context
-	return NewContext(model, params)
+	return newContext(model, params)
 }
--- a/bindings/go/whisper.go
+++ b/bindings/go/whisper.go
@ -91,7 +91,7 @@ var (
 func Whisper_init(path string) *Context {
 	cPath := C.CString(path)
 	defer C.free(unsafe.Pointer(cPath))
-	if ctx := C.whisper_init(cPath); ctx != nil {
+	if ctx := C.whisper_init_from_file(cPath); ctx != nil {
 		return (*Context)(ctx)
 	} else {
 		return nil
@ -147,16 +147,6 @@ func (ctx *Context) Whisper_decode(tokens []Token, past, threads int) error {
 	}
 }
 // whisper_sample_best() returns the token with the highest probability
 func (ctx *Context) Whisper_sample_best() TokenData {
 	return TokenData(C.whisper_sample_best((*C.struct_whisper_context)(ctx)))
 }
 // whisper_sample_timestamp() returns the most probable timestamp token
 func (ctx *Context) Whisper_sample_timestamp(is_initial bool) TokenData {
 	return TokenData(C.whisper_sample_timestamp((*C.struct_whisper_context)(ctx), C.bool(is_initial)))
 }
 // Convert the provided text into tokens. The tokens pointer must be large enough to hold the resulting tokens.
 // Returns the number of tokens on success
 func (ctx *Context) Whisper_tokenize(text string, tokens []Token) (int, error) {
--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/javascript/emscripten.cpp
+++ b/bindings/javascript/emscripten.cpp
@ -20,7 +20,7 @@ struct whisper_context * g_context;
 EMSCRIPTEN_BINDINGS(whisper) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        if (g_context == nullptr) {
-            g_context = whisper_init(path_model.c_str());
+            g_context = whisper_init_from_file(path_model.c_str());
            if (g_context != nullptr) {
                return true;
            } else {
--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.0.4",
+  "version": "1.2.0",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -24,6 +24,8 @@ if (EMSCRIPTEN)
    add_subdirectory(command.wasm)
    add_subdirectory(talk.wasm)
    add_subdirectory(bench.wasm)
 elseif(CMAKE_JS_VERSION)
    add_subdirectory(addon.node)
 else()
    add_subdirectory(main)
    add_subdirectory(stream)
--- a/examples/addon.node/.gitignore
+++ b/examples/addon.node/.gitignore
@ -0,0 +1,3 @@
 .idea
 node_modules
 build
--- a/examples/addon.node/CMakeLists.txt
+++ b/examples/addon.node/CMakeLists.txt
@ -0,0 +1,31 @@
 # Build script for the Node.js native addon (a shared library with a ".node" suffix).
 set(TARGET whisper-addon)
 # Base settings
 #==================================================================
 # env var supported by cmake-js
 add_definitions(-DNAPI_VERSION=4)
 # CMAKE_JS_INC is expected to be provided by cmake-js (see the CMAKE_JS_VERSION check in examples/CMakeLists.txt)
 include_directories(${CMAKE_JS_INC})
 #==================================================================
 add_library(${TARGET} SHARED ${CMAKE_JS_SRC} addon.cpp)
 # No "lib" prefix and a ".node" suffix, so the output is named whisper-addon.node
 set_target_properties(${TARGET} PROPERTIES PREFIX "" SUFFIX ".node")
 include(DefaultTargetOptions)
 # Include N-API wrappers
 #==================================================================
 # Ask node for the include directory of the installed node-addon-api package
 execute_process(COMMAND node -p "require('node-addon-api').include"
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE NODE_ADDON_API_DIR
        )
 # Strip the trailing newline and the surrounding quotes from node's output
 string(REPLACE "\n" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR})
 string(REPLACE "\"" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR})
 target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
 #==================================================================
 target_link_libraries(${TARGET} ${CMAKE_JS_LIB} whisper ${CMAKE_THREAD_LIBS_INIT})
 # MSVC only: create the node import library when cmake-js supplies the .def file and target path
 if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
    # Generate node.lib
    execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
 endif()
--- a/examples/addon.node/README.md
+++ b/examples/addon.node/README.md
@ -0,0 +1,37 @@
 # addon
 This is an addon demo that can **perform whisper model inference in `node` and `electron` environments**, based on [cmake-js](https://github.com/cmake-js/cmake-js).
 It can be used as a reference for using the whisper.cpp project in other node projects.
 ## Install
 ```shell
 npm install
 ```
 ## Compile
 Make sure you are in the project root directory, then compile with cmake-js.
 ```shell
 npx cmake-js compile -T whisper-addon -B Release
 ```
 For Electron addons and further cmake-js options, see [cmake-js](https://github.com/cmake-js/cmake-js); only a few configuration changes are needed.
 > For example, to specify a custom cmake path:
 > ```shell
 > npx cmake-js compile -c 'xxx/cmake' -T whisper-addon -B Release
 > ```
 ## Run
 ```shell
 cd examples/addon.node
 node index.js --language='language' --model='model-path' --fname_inp='file-path'
 ```
 Because this is a simple demo, only the parameters shown above are read from the node environment.
 Other parameters could be passed from the node environment in the same way.
--- a/examples/addon.node/test/whisper.spec.js
+++ b/examples/addon.node/test/whisper.spec.js
@ -0,0 +1,15 @@
 // Jest smoke test for the whisper addon.
 const path = require('path');
 // Load the compiled addon from the build/Release directory at the repository root
 const { whisper } = require(path.join(__dirname, '../../../build/Release/whisper-addon'));
 // Parameters handed to the addon: language, model file and input WAV file
 const whisperParamsMock = {
    language: 'en',
    model: path.join(__dirname, '../../../models/ggml-base.en.bin'),
    fname_inp: path.join(__dirname, '../../../samples/jfk.wav'),
 };
 describe("Run whisper.node", () => {
    // The addon returns an array; the test only requires it to be non-empty
    test("it should receive a non-empty value", () => {
        expect(whisper(whisperParamsMock).length).toBeGreaterThan(0);
    });
 });
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -0,0 +1,422 @@
 #include <cstdint>
 #include <string>
 #include <thread>
 #include <vector>
 #include <cmath>
 #include "napi.h"
 #define DR_WAV_IMPLEMENTATION
 #include "dr_wav.h"
 #include "whisper.h"
 // Options consumed by run(). Most fields are copied into whisper_full_params there.
 struct whisper_params {
    // threads per processor (wparams.n_threads); defaults to min(4, hardware concurrency)
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
    // forwarded to whisper_full_parallel()
    int32_t n_processors = 1;
    // forwarded as wparams.offset_ms
    int32_t offset_t_ms  = 0;
    // not read by run() in this file
    int32_t offset_n     = 0;
    // forwarded as wparams.duration_ms
    int32_t duration_ms  = 0;
    // when >= 0, overrides wparams.n_max_text_ctx; otherwise the library default is kept
    int32_t max_context  = -1;
    // forwarded as wparams.max_len (60 is used instead when output_wts is set and this is 0)
    int32_t max_len      = 0;
    // forwarded as wparams.greedy.best_of
    int32_t best_of      = 5;
    // forwarded as wparams.beam_search.beam_size; a value > 1 selects beam-search sampling
    int32_t beam_size    = -1;
    // forwarded as wparams.thold_pt, wparams.entropy_thold and wparams.logprob_thold
    float word_thold    = 0.01f;
    float entropy_thold = 2.4f;
    float logprob_thold = -1.0f;
    // forwarded as wparams.speed_up and wparams.translate
    bool speed_up       = false;
    bool translate      = false;
    // when set, run() keeps the two stereo channels so the segment callback can label the speaker
    bool diarize        = false;
    // of the output_* flags, run() only reads output_wts (it enables token timestamps)
    bool output_txt     = false;
    bool output_vtt     = false;
    bool output_srt     = false;
    bool output_wts     = false;
    bool output_csv     = false;
    // forwarded as wparams.print_special
    bool print_special  = false;
    // not read by run() in this file
    bool print_colors   = false;
    // forwarded as wparams.print_progress
    bool print_progress = false;
    // inverted into wparams.print_timestamps; also suppresses the time range in the segment callback
    bool no_timestamps  = false;
    // language name passed to whisper; "auto" skips the whisper_lang_id() validation in run()
    std::string language = "en";
    // optional initial prompt, tokenized by run() when non-empty
    std::string prompt;
    // model file path passed to whisper_init_from_file()
    std::string model    = "../../ggml-large.bin";
    // input WAV paths; "-" makes run() read the WAV data from stdin
    std::vector<std::string> fname_inp = {};
    std::vector<std::string> fname_outp = {};
 };
 // Context handed to whisper_print_segment_callback() through the user_data pointer.
 struct whisper_print_user_data {
    // options controlling what the callback prints (no_timestamps, diarize)
    const whisper_params * params;
    // per-channel audio; the callback only diarizes when it holds exactly two channels
    const std::vector<std::vector<float>> * pcmf32s;
 };
 // Render a timestamp expressed in units of 10 ms as "HH:MM:SS.mmm".
 // When `comma` is true the milliseconds are separated by ',' instead of '.'.
 //  500 -> 00:00:05.000
 // 6000 -> 00:01:00.000
 std::string to_timestamp(int64_t t, bool comma = false) {
    const int64_t ms_per_sec  = 1000;
    const int64_t ms_per_min  = 60 * ms_per_sec;
    const int64_t ms_per_hour = 60 * ms_per_min;
    const int64_t total_ms    = t * 10;
    const int hours   = (int) (total_ms / ms_per_hour);
    const int minutes = (int) ((total_ms % ms_per_hour) / ms_per_min);
    const int seconds = (int) ((total_ms % ms_per_min) / ms_per_sec);
    const int millis  = (int) (total_ms % ms_per_sec);
    const char * separator = comma ? "," : ".";
    char formatted[32];
    snprintf(formatted, sizeof(formatted), "%02d:%02d:%02d%s%03d", hours, minutes, seconds, separator, millis);
    return std::string(formatted);
 }
 // Convert a timestamp in units of 10 ms into a sample index,
 // clamped to the range [0, n_samples - 1].
 int timestamp_to_sample(int64_t t, int n_samples) {
    const int last_index = (int) n_samples - 1;
    const int raw_index  = (int) ((t*WHISPER_SAMPLE_RATE)/100);
    const int capped     = std::min(last_index, raw_index);
    return std::max(0, capped);
 }
 // New-segment callback: prints the last `n_new` segments of `ctx` to stdout.
 // `user_data` must point to a whisper_print_user_data. Depending on its params,
 // each segment is prefixed with its time range and/or a speaker label derived
 // from comparing the energy of the two stereo channels over the segment.
 void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
    const auto * data    = (whisper_print_user_data *) user_data;
    const auto & params  = *data->params;
    const auto & pcmf32s = *data->pcmf32s;
    const int n_segments = whisper_full_n_segments(ctx);
    // index of the first segment that has not been printed yet
    const int first_new  = n_segments - n_new;
    // segment times are needed for the time range and for diarization
    const bool need_times = !params.no_timestamps || params.diarize;
    if (first_new == 0) {
        printf("\n");
    }
    std::string speaker = "";
    int64_t t0 = 0;
    int64_t t1 = 0;
    for (int seg = first_new; seg < n_segments; seg++) {
        if (need_times) {
            t0 = whisper_full_get_segment_t0(ctx, seg);
            t1 = whisper_full_get_segment_t1(ctx, seg);
        }
        if (!params.no_timestamps) {
            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
        }
        if (params.diarize && pcmf32s.size() == 2) {
            const int64_t n_samples = pcmf32s[0].size();
            const int64_t begin = timestamp_to_sample(t0, n_samples);
            const int64_t end   = timestamp_to_sample(t1, n_samples);
            // accumulate the absolute amplitude of each channel over the segment
            double left  = 0.0f;
            double right = 0.0f;
            for (int64_t k = begin; k < end; k++) {
                left  += fabs(pcmf32s[0][k]);
                right += fabs(pcmf32s[1][k]);
            }
            // a channel has to be at least 10% louder to be picked as the speaker
            const char * label = "(speaker ?)";
            if (left > 1.1*right) {
                label = "(speaker 0)";
            } else if (right > 1.1*left) {
                label = "(speaker 1)";
            }
            speaker = label;
        }
        const char * text = whisper_full_get_segment_text(ctx, seg);
        printf("%s%s", speaker.c_str(), text);
        // with timestamps or speakers: each segment on new line
        if (need_times) {
            printf("\n");
        }
        fflush(stdout);
    }
 }
 // Transcribe every WAV file listed in params.fname_inp with the model at params.model.
 //
 // On success returns 0 and fills `result` with one row per segment held by the
 // context after the last file: { start time, end time, text }, with times
 // formatted by to_timestamp(t, true).
 //
 // On failure returns a non-zero code and leaves `result` untouched:
 //   1 unknown language           2 no input files
 //   3 model could not be loaded  4 WAV data on stdin could not be opened
 //   5 WAV file could not be opened
 //   6 unsupported channel layout 8 wrong sample rate
 //   9 not 16-bit                10 inference failed
 //
 // `params` may be modified: language/translate are reset for non-multilingual models.
 int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
    if (params.fname_inp.empty()) {
        fprintf(stderr, "error: no input files specified\n");
        return 2;
    }
    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
        // report the failure to the caller; calling exit() here would terminate
        // the hosting node process (and with a success status)
        return 1;
    }
    // whisper init
    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
    if (ctx == nullptr) {
        fprintf(stderr, "error: failed to initialize whisper context\n");
        return 3;
    }
    // every error path below goes through this so the context is never leaked
    const auto fail = [ctx](int code) {
        whisper_free(ctx);
        return code;
    };
    // initial prompt
    std::vector<whisper_token> prompt_tokens;
    if (!params.prompt.empty()) {
        prompt_tokens.resize(1024);
        prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));
        fprintf(stderr, "\n");
        fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
        fprintf(stderr, "initial tokens: [ ");
        for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
            fprintf(stderr, "%d ", prompt_tokens[i]);
        }
        fprintf(stderr, "]\n");
    }
    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
        std::vector<float> pcmf32; // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
        // WAV input
        {
            drwav wav;
            std::vector<uint8_t> wav_data; // used for pipe input from stdin
            if (fname_inp == "-") {
                {
                    uint8_t buf[1024];
                    while (true)
                    {
                        const size_t n = fread(buf, 1, sizeof(buf), stdin);
                        if (n == 0) {
                            break;
                        }
                        wav_data.insert(wav_data.end(), buf, buf + n);
                    }
                }
                if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
                    fprintf(stderr, "error: failed to open WAV file from stdin\n");
                    return fail(4);
                }
                fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
            }
            else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
                fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
                return fail(5);
            }
            // from here on the decoder is open and must be closed on every exit path
            if (wav.channels != 1 && wav.channels != 2) {
                fprintf(stderr, "error: WAV file '%s' must be mono or stereo\n", fname_inp.c_str());
                drwav_uninit(&wav);
                return fail(6);
            }
            if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
                fprintf(stderr, "error: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str());
                drwav_uninit(&wav);
                return fail(6);
            }
            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
                fprintf(stderr, "error: WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
                drwav_uninit(&wav);
                return fail(8);
            }
            if (wav.bitsPerSample != 16) {
                fprintf(stderr, "error: WAV file '%s' must be 16-bit\n", fname_inp.c_str());
                drwav_uninit(&wav);
                return fail(9);
            }
            const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
            std::vector<int16_t> pcm16;
            pcm16.resize(n*wav.channels);
            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
            drwav_uninit(&wav);
            // convert to mono, float
            pcmf32.resize(n);
            if (wav.channels == 1) {
                for (uint64_t i = 0; i < n; i++) {
                    pcmf32[i] = float(pcm16[i])/32768.0f;
                }
            } else {
                for (uint64_t i = 0; i < n; i++) {
                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
                }
            }
            // Only split channels for real stereo input: with a mono file pcm16 holds
            // n samples, so indexing 2*i + 1 would read past its end. When pcmf32s is
            // left empty the segment callback skips the speaker label.
            if (params.diarize && wav.channels == 2) {
                // convert to stereo, float
                pcmf32s.resize(2);
                pcmf32s[0].resize(n);
                pcmf32s[1].resize(n);
                for (uint64_t i = 0; i < n; i++) {
                    pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
                    pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
                }
            }
        }
        // print system information
        {
            fprintf(stderr, "\n");
            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
        }
        // print some info about the processing
        {
            fprintf(stderr, "\n");
            if (!whisper_is_multilingual(ctx)) {
                if (params.language != "en" || params.translate) {
                    params.language = "en";
                    params.translate = false;
                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
                }
            }
            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
                    params.n_threads, params.n_processors,
                    params.language.c_str(),
                    params.translate ? "translate" : "transcribe",
                    params.no_timestamps ? 0 : 1);
            fprintf(stderr, "\n");
        }
        // run the inference
        {
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
            wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
            wparams.print_realtime   = false;
            wparams.print_progress   = params.print_progress;
            wparams.print_timestamps = !params.no_timestamps;
            wparams.print_special    = params.print_special;
            wparams.translate        = params.translate;
            wparams.language         = params.language.c_str();
            wparams.n_threads        = params.n_threads;
            wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
            wparams.offset_ms        = params.offset_t_ms;
            wparams.duration_ms      = params.duration_ms;
            wparams.token_timestamps = params.output_wts || params.max_len > 0;
            wparams.thold_pt         = params.word_thold;
            wparams.entropy_thold    = params.entropy_thold;
            wparams.logprob_thold    = params.logprob_thold;
            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
            wparams.speed_up         = params.speed_up;
            wparams.greedy.best_of        = params.best_of;
            wparams.beam_search.beam_size = params.beam_size;
            wparams.prompt_tokens     = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
            wparams.prompt_n_tokens   = prompt_tokens.empty() ? 0       : prompt_tokens.size();
            whisper_print_user_data user_data = { &params, &pcmf32s };
            // this callback is called on each new segment
            if (!wparams.print_realtime) {
                wparams.new_segment_callback           = whisper_print_segment_callback;
                wparams.new_segment_callback_user_data = &user_data;
            }
            // example for abort mechanism
            // in this example, we do not abort the processing, but we could if the flag is set to true
            // the callback is called before every encoder run - if it returns false, the processing is aborted
            {
                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
                    bool is_aborted = *(bool*)user_data;
                    return !is_aborted;
                };
                wparams.encoder_begin_callback_user_data = &is_aborted;
            }
            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
                fprintf(stderr, "failed to process audio\n");
                return fail(10);
            }
        }
    }
    // collect the segments currently held by the context
    const int n_segments = whisper_full_n_segments(ctx);
    result.resize(n_segments);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
        result[i].emplace_back(to_timestamp(t0, true));
        result[i].emplace_back(to_timestamp(t1, true));
        result[i].emplace_back(text);
    }
    whisper_print_timings(ctx);
    whisper_free(ctx);
    return 0;
 }
 // JavaScript entry point: whisper({ language, model, fname_inp }).
 //
 // Expects a single object argument with the string properties `language`,
 // `model` and `fname_inp`, runs the transcription, and returns an array with
 // one [t0, t1, text] array of strings per segment.
 //
 // Throws a TypeError when the argument is missing or is not an object, and an
 // Error when run() reports a failure.
 Napi::Object whisper(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();
    if (info.Length() <= 0 || !info[0].IsObject()) {
        Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
        // ThrowAsJavaScriptException() only schedules the exception; return now so
        // that the invalid argument is never dereferenced below
        return Napi::Object();
    }
    whisper_params params;
    std::vector<std::vector<std::string>> result;
    Napi::Object whisper_params = info[0].As<Napi::Object>();
    std::string language = whisper_params.Get("language").As<Napi::String>();
    std::string model = whisper_params.Get("model").As<Napi::String>();
    std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
    params.language = language;
    params.model = model;
    params.fname_inp.emplace_back(input);
    // run model
    const int ret = run(params, result);
    if (ret != 0) {
        // surface the failure to JavaScript instead of silently returning an empty array
        Napi::Error::New(env, "whisper: processing failed with code " + std::to_string(ret)).ThrowAsJavaScriptException();
        return Napi::Object();
    }
    fprintf(stderr, "RESULT:\n");
    for (const auto & sentence : result) {
        fprintf(stderr, "t0: %s, t1: %s, content: %s \n",
                sentence[0].c_str(), sentence[1].c_str(), sentence[2].c_str());
    }
    // convert the rows into a JavaScript array of 3-element string arrays
    Napi::Object res = Napi::Array::New(env, result.size());
    for (uint32_t i = 0; i < result.size(); ++i) {
        Napi::Object tmp = Napi::Array::New(env, 3);
        for (uint32_t j = 0; j < 3; ++j) {
            tmp[j] = Napi::String::New(env, result[i][j]);
        }
        res[i] = tmp;
    }
    return res;
 }
 // Module initializer: publishes the native whisper() function on the
 // exports object under the key "whisper" and returns that object.
 Napi::Object Init(Napi::Env env, Napi::Object exports) {
    const Napi::String export_name = Napi::String::New(env, "whisper");
    const Napi::Function export_fn = Napi::Function::New(env, whisper);
    exports.Set(export_name, export_fn);
    return exports;
 }
 NODE_API_MODULE(whisper, Init);
--- a/examples/addon.node/index.js
+++ b/examples/addon.node/index.js
@ -0,0 +1,27 @@
 // Command-line demo for the whisper addon.
 // Usage: node index.js --language=<lang> --model=<model-path> --fname_inp=<wav-path>
 const path = require('path');
 const { whisper } = require(path.join(__dirname, '../../build/Release/whisper-addon'));
 // Defaults; each key can be overridden with a matching --key=value option
 const whisperParams = {
    language: 'en',
    model: path.join(__dirname, '../../models/ggml-base.en.bin'),
    fname_inp: '',
 };
 // Named `cliArgs` so the module wrapper's own `arguments` object is not shadowed
 const cliArgs = process.argv.slice(2);
 // Turn every "--key=value" option into a [key, value] pair. Only the first "="
 // separates key from value, so values that contain "=" are kept intact.
 // Options without a value are skipped so they cannot replace a default with undefined.
 const params = Object.fromEntries(
    cliArgs.reduce((pairs, item) => {
        if (!item.startsWith("--")) {
            return pairs;
        }
        const option = item.slice(2);
        const eq = option.indexOf("=");
        if (eq === -1) {
            return pairs;
        }
        return [...pairs, [option.slice(0, eq), option.slice(eq + 1)]];
    }, []),
 );
 // Accept only the options that the addon knows about
 for (const key in params) {
    if (Object.prototype.hasOwnProperty.call(whisperParams, key)) {
        whisperParams[key] = params[key];
    }
 }
 console.log('whisperParams =', whisperParams);
 console.log(whisper(whisperParams));
--- a/examples/addon.node/package.json
+++ b/examples/addon.node/package.json
@ -0,0 +1,16 @@
 {
  "name": "whisper-addon",
  "version": "0.0.0",
  "description": "",
  "main": "index.js",
  "author": "Qanhe Chen",
  "license": "MIT",
  "scripts": {
    "test": "jest"
  },
  "devDependencies": {
    "cmake-js": "^7.1.1",
    "jest": "^29.4.0",
    "node-addon-api": "^5.0.0"
  }
 }
--- a/examples/bench.wasm/emscripten.cpp
+++ b/examples/bench.wasm/emscripten.cpp
@ -28,6 +28,11 @@ void bench_main(size_t index) {
        return;
    }
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
    }
    if (int ret = whisper_encode(ctx, 0, n_threads) != 0) {
        fprintf(stderr, "error: failed to encode model: %d\n", ret);
        return;
@ -52,7 +57,7 @@ EMSCRIPTEN_BINDINGS(bench) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init(path_model.c_str());
+                g_contexts[i] = whisper_init_from_file(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    if (g_worker.joinable()) {
                        g_worker.join();
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@ -7,6 +7,7 @@
 // command-line parameters
 struct whisper_params {
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t what = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat
    std::string model = "models/ggml-base.en.bin";
 };
@ -23,6 +24,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        }
        else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
        else if (arg == "-m" || arg == "--model")   { params.model     = argv[++i]; }
        else if (arg == "-w" || arg == "--what")    { params.what     = atoi(argv[++i]); }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
@ -41,19 +43,17 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -h,       --help        [default] show this help message and exit\n");
    fprintf(stderr, "  -t N,     --threads N   [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(stderr, "  -m FNAME, --model FNAME [%-7s] model path\n",                                  params.model.c_str());
    fprintf(stderr, "  -w N,     --what N      [%-7d] what to benchmark:\n",                          params.what);
    fprintf(stderr, "                           %-7s  0 - whisper encoder\n",                         "");
    fprintf(stderr, "                           %-7s  1 - memcpy\n",                                  "");
    fprintf(stderr, "                           %-7s  2 - ggml_mul_mat\n",                            "");
    fprintf(stderr, "\n");
 }
-int main(int argc, char ** argv) {
+int whisper_bench_encoder(const whisper_params & params) {
    whisper_params params;
    if (whisper_params_parse(argc, argv, params) == false) {
        return 1;
    }
    // whisper init
-    struct whisper_context * ctx = whisper_init(params.model.c_str());
+    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
    {
        fprintf(stderr, "\n");
@ -92,3 +92,22 @@ int main(int argc, char ** argv) {
    return 0;
 }
 int main(int argc, char ** argv) {
    whisper_params params;
    if (whisper_params_parse(argc, argv, params) == false) {
        return 1;
    }
    int ret = -1;
    switch (params.what) {
        case 0: ret = whisper_bench_encoder(params);                break;
        case 1: ret = whisper_bench_memcpy(params.n_threads);       break;
        case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
        default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
    }
    return ret;
 }
--- a/examples/chess/CMakeLists.txt
+++ b/examples/chess/CMakeLists.txt
@ -0,0 +1,10 @@
 # The chess example is only built when WHISPER_SUPPORT_SDL2 is enabled,
 # because it includes and links against SDL2.
 if (WHISPER_SUPPORT_SDL2)
    # chess
    set(TARGET chess)
    add_executable(${TARGET} chess.cpp)
    include(DefaultTargetOptions)
    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
    target_link_libraries(${TARGET} PRIVATE common whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/chess/chess.cpp
+++ b/examples/chess/chess.cpp
@ -0,0 +1,634 @@
 // Input chess moves via voice
 //
 #include "common.h"
 #include "whisper.h"
 #include <SDL.h>
 #include <SDL_audio.h>
 #include <atomic>
 #include <cassert>
 #include <cstdio>
 #include <string>
 #include <thread>
 #include <vector>
 #include <fstream>
 #include <mutex>
 // Format a timestamp as "MM:SS.mmm". The input `t` is expressed in units of 10 ms:
 //  500 -> 00:05.000
 //  550 -> 00:05.500
 // 6000 -> 01:00.000
 std::string to_timestamp(int64_t t) {
    const int64_t total_sec = t/100;
    // the remainder is in 10 ms units - scale it to milliseconds for the 3-digit field
    const int64_t msec = (t - total_sec*100)*10;
    const int64_t min  = total_sec/60;
    const int64_t sec  = total_sec - min*60;
    char buf[32];
    snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
    return std::string(buf);
 }
 // command-line parameters; every member is initialised to its default value
 struct whisper_params {
    // number of threads to use during computation (never more than 4 by default)
    int32_t n_threads{std::min(4, (int32_t) std::thread::hardware_concurrency())};
    int32_t step_ms{3000};      // audio step size in milliseconds
    int32_t length_ms{10000};   // audio length in milliseconds
    int32_t keep_ms{200};       // audio to keep from the previous step, in milliseconds
    int32_t capture_id{-1};     // capture device ID (negative selects the default device)
    int32_t max_tokens{32};     // maximum number of tokens per audio chunk
    int32_t audio_ctx{0};       // audio context size (0 - all)
    float vad_thold{0.6f};      // voice activity detection threshold
    float freq_thold{100.0f};   // high-pass frequency cutoff
    bool translate{false};      // translate from source language to english
    bool print_special{false};  // print special tokens
    bool no_context{true};      // when false, context is kept between audio chunks
    bool no_timestamps{false};  // when true, segment timestamps are not printed
    std::string language{"en"};                   // spoken language
    std::string model{"models/ggml-base.en.bin"}; // model path
    std::string fname_inp;                        // input WAV file path
 };
 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
 // Parse the command line into `params`.
 //   argc, argv - arguments as received by main(); argv[0] is skipped
 //   params     - updated in place; options that are not given keep their current value
 // Always returns true: -h/--help and unknown arguments print the usage and exit(0),
 // an option that is missing its value prints an error plus the usage and exits with 1.
 bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        const std::string arg = argv[i];
        // fetch the value that must follow the current option; without this check an option
        // given as the very last argument would read argv[argc], which is a null pointer
        const auto next_value = [&]() -> const char * {
            if (i + 1 >= argc) {
                fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
                whisper_print_usage(argc, argv, params);
                exit(1);
            }
            return argv[++i];
        };
        if (arg == "-h" || arg == "--help") {
            whisper_print_usage(argc, argv, params);
            exit(0);
        }
        else if (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(next_value()); }
        else if (                 arg == "--step")          { params.step_ms       = std::stoi(next_value()); }
        else if (                 arg == "--length")        { params.length_ms     = std::stoi(next_value()); }
        else if (                 arg == "--keep")          { params.keep_ms       = std::stoi(next_value()); }
        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(next_value()); }
        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(next_value()); }
        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(next_value()); }
        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(next_value()); }
        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(next_value()); }
        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
        else if (arg == "-kc"  || arg == "--keep-context")  { params.no_context    = false; }
        else if (arg == "-l"   || arg == "--language")      { params.language      = next_value(); }
        else if (arg == "-m"   || arg == "--model")         { params.model         = next_value(); }
        else if (arg == "-f"   || arg == "--file")          { params.fname_inp     = next_value(); }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            // exit status kept at 0 so existing callers see the same behavior as before
            exit(0);
        }
    }
    return true;
 }
 // Print the help text to stderr: one line per option, with the value currently stored
 // in `params` shown in the bracketed column. argv[0] is used as the program name.
 void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
    // text shown for a boolean switch
    const auto flag_str = [](bool v) { return v ? "true" : "false"; };
    FILE * out = stderr;
    fprintf(out, "\n");
    fprintf(out, "usage: %s [options]\n", argv[0]);
    fprintf(out, "\n");
    fprintf(out, "options:\n");
    fprintf(out, "  -h,       --help          [default] show this help message and exit\n");
    fprintf(out, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(out, "            --step N        [%-7d] audio step size in milliseconds\n",             params.step_ms);
    fprintf(out, "            --length N      [%-7d] audio length in milliseconds\n",                params.length_ms);
    fprintf(out, "            --keep N        [%-7d] audio to keep from previous step in ms\n",      params.keep_ms);
    fprintf(out, "  -c ID,    --capture ID    [%-7d] capture device ID\n",                           params.capture_id);
    fprintf(out, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
    fprintf(out, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
    fprintf(out, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
    fprintf(out, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
    fprintf(out, "  -tr,      --translate     [%-7s] translate from source language to english\n",   flag_str(params.translate));
    fprintf(out, "  -ps,      --print-special [%-7s] print special tokens\n",                        flag_str(params.print_special));
    // the switch is stored inverted (no_context), so flip it for display
    fprintf(out, "  -kc,      --keep-context  [%-7s] keep context between audio chunks\n",           flag_str(!params.no_context));
    fprintf(out, "  -l LANG,  --language LANG [%-7s] spoken language\n",                             params.language.c_str());
    fprintf(out, "  -m FNAME, --model FNAME   [%-7s] model path\n",                                  params.model.c_str());
    fprintf(out, "  -f FNAME, --file FNAME    [%-7s] input WAV file path\n",                         params.fname_inp.c_str());
    fprintf(out, "\n");
 }
 //
 // SDL Audio capture
 //
 // Captures audio from an SDL input device into a fixed-size circular buffer.
 // callback() writes into the buffer and get()/clear() read or reset it; all three take
 // m_mutex while touching the buffer state.
 class audio_async {
 public:
    // len_ms: length of the circular buffer, in milliseconds
    audio_async(int len_ms);
    // closes the capture device if one was opened
    ~audio_async();
    // open the capture device (negative capture_id selects the default device) and
    // size the circular buffer; returns false on failure
    bool init(int capture_id, int sample_rate);
    // start capturing audio via the provided SDL callback
    // keep last len_ms milliseconds of audio in a circular buffer
    bool resume();
    // stop capturing; returns false if there is no device or capture is already paused
    bool pause();
    // discard the buffered audio; returns false if there is no device or capture is not running
    bool clear();
    // callback to be called by SDL
    void callback(uint8_t * stream, int len);
    // get audio data from the circular buffer
    // copies the most recent `ms` milliseconds (the whole buffer when ms <= 0) into `audio`
    void get(int ms, std::vector<float> & audio);
 private:
    SDL_AudioDeviceID m_dev_id_in = 0; // 0 means no capture device is open
    int m_len_ms = 0;                  // buffer length in milliseconds
    int m_sample_rate = 0;             // sample rate obtained from SDL in init()
    std::atomic_bool m_running;        // true between resume() and pause()
    std::mutex       m_mutex;          // guards m_audio, m_audio_pos and m_audio_len
    std::vector<float> m_audio;        // circular buffer storage
    std::vector<float> m_audio_new;    // scratch vector; nothing in this file reads it
    size_t             m_audio_pos = 0; // index where the next sample will be written
    size_t             m_audio_len = 0; // number of valid samples currently buffered
 };
 // Remember the requested buffer length (in ms); capture starts out not running.
 audio_async::audio_async(int len_ms) : m_len_ms(len_ms), m_running(false) {
 }
 // Close the capture device, if init() managed to open one.
 audio_async::~audio_async() {
    if (m_dev_id_in == 0) {
        return;
    }
    SDL_CloseAudioDevice(m_dev_id_in);
 }
 // Initialize SDL audio and open a capture device that delivers mono float32 samples.
 //   capture_id  - index of the capture device to open; a negative value opens the default device
 //   sample_rate - requested sample rate
 // Returns false if SDL cannot be initialized or no capture device can be opened.
 // On success the sample rate reported by SDL is stored and the circular buffer is sized
 // to hold m_len_ms milliseconds of audio at that rate.
 bool audio_async::init(int capture_id, int sample_rate) {
    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
        return false;
    }
    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
    // list the available capture devices so the user can pick a capture_id
    {
        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
        for (int i = 0; i < nDevices; i++) {
            // the name may be NULL on failure - never hand NULL to "%s"
            const char * name = SDL_GetAudioDeviceName(i, SDL_TRUE);
            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, name ? name : "(unknown)");
        }
    }
    SDL_AudioSpec capture_spec_requested;
    SDL_AudioSpec capture_spec_obtained;
    SDL_zero(capture_spec_requested);
    SDL_zero(capture_spec_obtained);
    capture_spec_requested.freq     = sample_rate;
    capture_spec_requested.format   = AUDIO_F32;
    capture_spec_requested.channels = 1;
    capture_spec_requested.samples  = 1024;
    // SDL hands back `userdata` untouched, which lets the C callback reach this object
    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
        audio_async * audio = (audio_async *) userdata;
        audio->callback(stream, len);
    };
    capture_spec_requested.userdata = this;
    if (capture_id >= 0) {
        // an out-of-range capture_id yields a NULL name; print a placeholder instead of NULL
        const char * name = SDL_GetAudioDeviceName(capture_id, SDL_TRUE);
        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, name ? name : "(unknown)");
        // the name is passed on unchanged (NULL included), exactly as before
        m_dev_id_in = SDL_OpenAudioDevice(name, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
    } else {
        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
    }
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
        m_dev_id_in = 0;
        return false;
    } else {
        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
                capture_spec_requested.format);
        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
                capture_spec_requested.channels);
        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
    }
    m_sample_rate = capture_spec_obtained.freq;
    // 64-bit intermediate: sample_rate*len_ms overflows a 32-bit int for long buffers
    m_audio.resize((size_t) (((int64_t) m_sample_rate*m_len_ms)/1000));
    return true;
 }
 // Start (or restart) capturing. Returns false, after logging the reason to stderr,
 // when no device is open or capture is already running.
 bool audio_async::resume() {
    if (m_dev_id_in == 0) {
        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
        return false;
    }
    const bool already_running = m_running;
    if (already_running) {
        fprintf(stderr, "%s: already running!\n", __func__);
        return false;
    }
    // pause_on == 0 un-pauses the device
    SDL_PauseAudioDevice(m_dev_id_in, 0);
    m_running = true;
    return true;
 }
 bool audio_async::pause() {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
        return false;
    }
    if (!m_running) {
        fprintf(stderr, "%s: already paused!\n", __func__);
        return false;
    }
    SDL_PauseAudioDevice(m_dev_id_in, 1);
    m_running = false;
    return true;
 }
 bool audio_async::clear() {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
        return false;
    }
    if (!m_running) {
        fprintf(stderr, "%s: not running!\n", __func__);
        return false;
    }
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        m_audio_pos = 0;
        m_audio_len = 0;
    }
    return true;
 }
 // callback to be called by SDL
 //   stream - raw capture data, interpreted as float samples
 //   len    - size of `stream` in bytes
 // Appends the samples to the circular buffer m_audio, wrapping around at the end and
 // overwriting the oldest data. Does nothing while capture is not running.
 void audio_async::callback(uint8_t * stream, int len) {
    if (!m_running) {
        return;
    }
    size_t n_samples = len / sizeof(float);
    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        const size_t capacity = m_audio.size();
        if (capacity == 0 || n_samples == 0) {
            // nothing can be stored; this also avoids the modulo by zero below
            return;
        }
        if (n_samples > capacity) {
            // the chunk is larger than the whole buffer: keep only its newest `capacity` samples
            stream   += (n_samples - capacity) * sizeof(float);
            n_samples = capacity;
        }
        if (m_audio_pos + n_samples > capacity) {
            // the chunk wraps: first part fills the tail, the rest goes to the front
            const size_t n0 = capacity - m_audio_pos;
            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
            // `stream` is a byte pointer, so skip n0 floats - not n0 bytes
            memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
            m_audio_len = capacity;
        } else {
            memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
            m_audio_len = std::min(m_audio_len + n_samples, capacity);
        }
        m_audio_pos = (m_audio_pos + n_samples) % capacity;
    }
 }
 // Copy the most recent audio out of the circular buffer.
 //   ms     - how many milliseconds to fetch; ms <= 0 requests the full buffer length (m_len_ms)
 //   result - cleared and then filled with the samples, oldest first; it holds fewer samples
 //            than requested when less audio has been buffered so far
 // Logs to stderr and leaves `result` untouched when no device is open or capture is not running.
 void audio_async::get(int ms, std::vector<float> & result) {
    if (!m_dev_id_in) {
        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
        return;
    }
    if (!m_running) {
        fprintf(stderr, "%s: not running!\n", __func__);
        return;
    }
    result.clear();
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        if (ms <= 0) {
            ms = m_len_ms;
        }
        // 64-bit intermediate: sample_rate*ms overflows a 32-bit int for long windows
        size_t n_samples = (size_t) (((int64_t) m_sample_rate * ms) / 1000);
        if (n_samples > m_audio_len) {
            n_samples = m_audio_len;
        }
        result.resize(n_samples);
        if (n_samples == 0) {
            // nothing buffered yet (or an empty buffer) - do not index into m_audio
            return;
        }
        // n_samples <= m_audio_len <= capacity, so capacity is non-zero from here on
        const size_t capacity = m_audio.size();
        // index of the oldest requested sample; adding capacity first keeps the unsigned
        // subtraction from wrapping when the window starts before the write position
        const size_t s0 = (m_audio_pos + capacity - n_samples) % capacity;
        if (s0 + n_samples > capacity) {
            // the window wraps around the end of the buffer: copy the tail, then the front
            const size_t n0 = capacity - s0;
            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
        } else {
            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
        }
    }
 }
 ///////////////////////////
 // Entry point of the example.
 // Parses the command line, opens the SDL capture device, loads the whisper model and then
 // loops until an SDL_QUIT event arrives: fetch audio (fixed-step sliding window, or
 // VAD-triggered when the step is <= 0), run whisper_full() on it and print the segments.
 // Returns 0 on normal shutdown, 1 if argument parsing or audio initialization fails,
 // 2 if the model cannot be loaded and 6 if inference fails.
 int main(int argc, char ** argv) {
    whisper_params params;
    if (whisper_params_parse(argc, argv, params) == false) {
        return 1;
    }
    params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
    params.length_ms = std::max(params.length_ms, params.step_ms);
    const int n_samples_step = (1e-3*params.step_ms  )*WHISPER_SAMPLE_RATE;
    const int n_samples_len  = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
    const int n_samples_keep = (1e-3*params.keep_ms  )*WHISPER_SAMPLE_RATE;
    const int n_samples_30s  = (1e-3*30000.0         )*WHISPER_SAMPLE_RATE;
    const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
    const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
    params.no_timestamps  = !use_vad;
    params.no_context    |= use_vad;
    params.max_tokens     = 0;
    // init audio
    audio_async audio(params.length_ms);
    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
        return 1;
    }
    audio.resume();
    // whisper init
    if (whisper_lang_id(params.language.c_str()) == -1) {
        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
        whisper_print_usage(argc, argv, params);
        exit(0);
    }
    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
    // a missing or invalid model file yields a null context - bail out instead of dereferencing it
    if (ctx == nullptr) {
        fprintf(stderr, "%s: error: failed to initialize whisper context from '%s'\n", __func__, params.model.c_str());
        return 2;
    }
    std::vector<float> pcmf32    (n_samples_30s, 0.0f);
    std::vector<float> pcmf32_old;
    std::vector<float> pcmf32_new(n_samples_30s, 0.0f);
    std::vector<whisper_token> prompt_tokens;
    // print some info about the processing
    {
        fprintf(stderr, "\n");
        if (!whisper_is_multilingual(ctx)) {
            if (params.language != "en" || params.translate) {
                params.language = "en";
                params.translate = false;
                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
            }
        }
        fprintf(stderr, "%s: processing %d samples (step = %.1f sec / len = %.1f sec / keep = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
                __func__,
                n_samples_step,
                float(n_samples_step)/WHISPER_SAMPLE_RATE,
                float(n_samples_len )/WHISPER_SAMPLE_RATE,
                float(n_samples_keep)/WHISPER_SAMPLE_RATE,
                params.n_threads,
                params.language.c_str(),
                params.translate ? "translate" : "transcribe",
                params.no_timestamps ? 0 : 1);
        if (!use_vad) {
            fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
        } else {
            fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
        }
        fprintf(stderr, "\n");
    }
    int n_iter = 0;
    bool is_running = true;
    printf("[Start speaking]");
    fflush(stdout);
          auto t_last  = std::chrono::high_resolution_clock::now();
    const auto t_start = t_last;
    // main audio loop
    while (is_running) {
        // handle Ctrl + C
        {
            SDL_Event event;
            while (SDL_PollEvent(&event)) {
                switch (event.type) {
                    case SDL_QUIT:
                        {
                            is_running = false;
                        } break;
                    default:
                        break;
                }
            }
            if (!is_running) {
                break;
            }
        }
        // process new audio
        if (!use_vad) {
            // sliding window mode: wait until a full step of new audio is available
            while (true) {
                audio.get(params.step_ms, pcmf32_new);
                if ((int) pcmf32_new.size() > 2*n_samples_step) {
                    fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
                    audio.clear();
                    continue;
                }
                if ((int) pcmf32_new.size() >= n_samples_step) {
                    audio.clear();
                    break;
                }
                SDL_Delay(1);
            }
            const int n_samples_new = pcmf32_new.size();
            // take up to params.length_ms audio from previous iteration
            const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
            //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
            pcmf32.resize(n_samples_new + n_samples_take);
            for (int i = 0; i < n_samples_take; i++) {
                pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
            }
            memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
            pcmf32_old = pcmf32;
        } else {
            // VAD mode: check for speech at most once every 2 seconds
            const auto t_now  = std::chrono::high_resolution_clock::now();
            const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
            if (t_diff < 2000) {
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
                continue;
            }
            audio.get(2000, pcmf32_new);
            if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
                audio.get(params.length_ms, pcmf32);
            } else {
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
                continue;
            }
            t_last = t_now;
        }
        // run the inference
        {
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
            wparams.print_progress   = false;
            wparams.print_special    = params.print_special;
            wparams.print_realtime   = false;
            wparams.print_timestamps = !params.no_timestamps;
            wparams.translate        = params.translate;
            wparams.no_context       = true;
            wparams.single_segment   = !use_vad;
            wparams.max_tokens       = params.max_tokens;
            wparams.language         = params.language.c_str();
            wparams.n_threads        = params.n_threads;
            wparams.audio_ctx        = params.audio_ctx;
            // disable temperature fallback
            wparams.temperature_inc  = -1.0f;
            wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
            wparams.prompt_n_tokens  = params.no_context ? 0       : prompt_tokens.size();
            if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                // release the model before leaving through the error path
                whisper_free(ctx);
                return 6;
            }
            // print result;
            {
                if (!use_vad) {
                    printf("\33[2K\r");
                    // print long empty line to clear the previous line
                    printf("%s", std::string(100, ' ').c_str());
                    printf("\33[2K\r");
                } else {
                    const int64_t t1 = (t_last - t_start).count()/1000000;
                    const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
                    printf("\n");
                    printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
                    printf("\n");
                }
                const int n_segments = whisper_full_n_segments(ctx);
                for (int i = 0; i < n_segments; ++i) {
                    const char * text = whisper_full_get_segment_text(ctx, i);
                    if (params.no_timestamps) {
                        printf("%s", text);
                        fflush(stdout);
                    } else {
                        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
                        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
                        printf ("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
                    }
                }
                if (use_vad){
                    printf("\n");
                    printf("### Transcription %d END\n", n_iter);
                }
            }
            ++n_iter;
            if (!use_vad && (n_iter % n_new_line) == 0) {
                printf("\n");
                // keep part of the audio for next iteration to try to mitigate word boundary issues
                pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
                // Add tokens of the last full length segment as the prompt
                if (!params.no_context) {
                    prompt_tokens.clear();
                    const int n_segments = whisper_full_n_segments(ctx);
                    for (int i = 0; i < n_segments; ++i) {
                        const int token_count = whisper_full_n_tokens(ctx, i);
                        for (int j = 0; j < token_count; ++j) {
                            prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
                        }
                    }
                }
            }
        }
    }
    audio.pause();
    whisper_print_timings(ctx);
    whisper_free(ctx);
    return 0;
 }
--- a/examples/command.wasm/emscripten.cpp
+++ b/examples/command.wasm/emscripten.cpp
@ -324,7 +324,7 @@ EMSCRIPTEN_BINDINGS(command) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init(path_model.c_str());
+                g_contexts[i] = whisper_init_from_file(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    g_running = true;
                    if (g_worker.joinable()) {
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -11,6 +11,7 @@
 #include <SDL.h>
 #include <SDL_audio.h>
 #include <sstream>
 #include <cassert>
 #include <cstdio>
 #include <fstream>
@ -25,7 +26,7 @@
 struct whisper_params {
    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t prompt_ms  = 5000;
-    int32_t command_ms = 4000;
+    int32_t command_ms = 8000;
    int32_t capture_id = -1;
    int32_t max_tokens = 32;
    int32_t audio_ctx  = 0;
@ -43,6 +44,7 @@ struct whisper_params {
    std::string model     = "models/ggml-base.en.bin";
    std::string fname_out;
    std::string commands;
    std::string prompt;
 };
 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -71,6 +73,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-m"   || arg == "--model")         { params.model         = argv[++i]; }
        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
        else if (arg == "-cmd" || arg == "--commands")      { params.commands      = argv[++i]; }
        else if (arg == "-p"   || arg == "--prompt")        { params.prompt        = argv[++i]; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
@ -103,6 +106,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -m FNAME,   --model FNAME    [%-7s] model path\n",                                  params.model.c_str());
    fprintf(stderr, "  -f FNAME,   --file FNAME     [%-7s] text output file name\n",                       params.fname_out.c_str());
    fprintf(stderr, "  -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n",             params.commands.c_str());
    fprintf(stderr, "  -p,         --prompt         [%-7s] the required activation prompt\n",              params.prompt.c_str());
    fprintf(stderr, "\n");
 }
@ -510,331 +514,431 @@ std::vector<std::string> read_allowed_commands(const std::string & fname) {
    return allowed_commands;
 }
 // Split `txt` on whitespace and return the words in the order they appear.
 // Runs of whitespace count as one separator; a blank input gives an empty vector.
 std::vector<std::string> get_words(const std::string &txt) {
    std::vector<std::string> result;
    std::istringstream stream(txt);
    for (std::string token; stream >> token; ) {
        result.push_back(token);
    }
    return result;
 }
 // returns true if no exit event was received
 bool process_sdl_events() {
    SDL_Event event;
    while (SDL_PollEvent(&event)) {
        switch (event.type) {
            case SDL_QUIT:
                {
                    return false;
                } break;
            default:
                break;
        }
    }
    return true;
 }
 // command-list mode
 // guide the transcription to match the most likely command from a provided list
 int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
-   fprintf(stderr, "\n");
+    fprintf(stderr, "\n");
-   fprintf(stderr, "%s: guided mode\n", __func__);
+    fprintf(stderr, "%s: guided mode\n", __func__);
-   std::vector<std::string> allowed_commands = read_allowed_commands(params.commands);
+    std::vector<std::string> allowed_commands = read_allowed_commands(params.commands);
-   if (allowed_commands.empty()) {
+    if (allowed_commands.empty()) {
-      fprintf(stderr, "%s: error: failed to read allowed commands from '%s'\n", __func__, params.commands.c_str());
+        fprintf(stderr, "%s: error: failed to read allowed commands from '%s'\n", __func__, params.commands.c_str());
-      return 2;
+        return 2;
-   }
+    }
-   int max_len = 0;
+    int max_len = 0;
-   std::vector<std::vector<whisper_token>> allowed_tokens;
+    std::vector<std::vector<whisper_token>> allowed_tokens;
-   for (const auto & cmd : allowed_commands) {
+    for (const auto & cmd : allowed_commands) {
-      whisper_token tokens[1024];
+        whisper_token tokens[1024];
-      allowed_tokens.emplace_back();
+        allowed_tokens.emplace_back();
-      for (int l = 0; l < (int) cmd.size(); ++l) {
+        for (int l = 0; l < (int) cmd.size(); ++l) {
-         // NOTE: very important to add the whitespace !
+            // NOTE: very important to add the whitespace !
-         //       the reason is that the first decoded token starts with a whitespace too!
+            //       the reason is that the first decoded token starts with a whitespace too!
-         std::string ss = std::string(" ") + cmd.substr(0, l + 1);
+            std::string ss = std::string(" ") + cmd.substr(0, l + 1);
-         const int n = whisper_tokenize(ctx, ss.c_str(), tokens, 1024);
+            const int n = whisper_tokenize(ctx, ss.c_str(), tokens, 1024);
-         if (n < 0) {
+            if (n < 0) {
-            fprintf(stderr, "%s: error: failed to tokenize command '%s'\n", __func__, cmd.c_str());
+                fprintf(stderr, "%s: error: failed to tokenize command '%s'\n", __func__, cmd.c_str());
-            return 3;
+                return 3;
         }
         if (n == 1) {
            allowed_tokens.back().push_back(tokens[0]);
         }
      }
      max_len = std::max(max_len, (int) cmd.size());
   }
   fprintf(stderr, "%s: allowed commands [ tokens ]:\n", __func__);
   fprintf(stderr, "\n");
   for (int i = 0; i < (int) allowed_commands.size(); ++i) {
      fprintf(stderr, "  - \033[1m%-*s\033[0m = [", max_len, allowed_commands[i].c_str());
      for (const auto & token : allowed_tokens[i]) {
         fprintf(stderr, " %5d", token);
      }
      fprintf(stderr, " ]\n");
   }
   std::string  k_prompt = "select one from the available words: ";
   for (int i = 0; i < (int) allowed_commands.size(); ++i) {
      if (i > 0) {
         k_prompt += ", ";
      }
      k_prompt += allowed_commands[i];
   }
   k_prompt += ". selected word: ";
   // tokenize prompt
   std::vector<whisper_token> k_tokens;
   {
      k_tokens.resize(1024);
      const int n = whisper_tokenize(ctx, k_prompt.c_str(), k_tokens.data(), 1024);
      if (n < 0) {
         fprintf(stderr, "%s: error: failed to tokenize prompt '%s'\n", __func__, k_prompt.c_str());
         return 4;
      }
      k_tokens.resize(n);
   }
   fprintf(stderr, "\n");
   fprintf(stderr, "%s: prompt: '%s'\n", __func__, k_prompt.c_str());
   fprintf(stderr, "%s: tokens: [", __func__);
   for (const auto & token : k_tokens) {
      fprintf(stderr, " %d", token);
   }
   fprintf(stderr, " ]\n");
   fprintf(stderr, "\n");
   fprintf(stderr, "%s: listening for a command ...\n", __func__);
   fprintf(stderr, "\n");
   bool is_running  = true;
   std::vector<float> pcmf32_cur;
   std::vector<float> pcmf32_prompt;
   // main loop
   while (is_running) {
      // handle Ctrl + C
      {
         SDL_Event event;
         while (SDL_PollEvent(&event)) {
            switch (event.type) {
               case SDL_QUIT:
               {
                  is_running = false;
               } break;
               default:
                  break;
            }
         }
-         if (!is_running) {
+            if (n == 1) {
-            return 0;
+                allowed_tokens.back().push_back(tokens[0]);
         }
      }
      // delay
      std::this_thread::sleep_for(std::chrono::milliseconds(100));
      audio.get(2000, pcmf32_cur);
      if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
         fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
         const auto t_start = std::chrono::high_resolution_clock::now();
         whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
         wparams.print_progress   = false;
         wparams.print_special    = params.print_special;
         wparams.print_realtime   = false;
         wparams.print_timestamps = !params.no_timestamps;
         wparams.translate        = params.translate;
         wparams.no_context       = true;
         wparams.single_segment   = true;
         wparams.max_tokens       = 1;
         wparams.language         = params.language.c_str();
         wparams.n_threads        = params.n_threads;
         wparams.audio_ctx        = params.audio_ctx;
         wparams.speed_up         = params.speed_up;
         wparams.prompt_tokens    = k_tokens.data();
         wparams.prompt_n_tokens  = k_tokens.size();
         // run the transformer and a single decoding pass
         if (whisper_full(ctx, wparams, pcmf32_cur.data(), pcmf32_cur.size()) != 0) {
            fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
            break;
         }
         const auto * probs = whisper_get_probs(ctx);
         std::vector<std::pair<float, int>> probs_id;
         double psum = 0.0;
         for (int i = 0; i < (int) allowed_commands.size(); ++i) {
            probs_id.emplace_back(probs[allowed_tokens[i][0]], i);
            for (int j = 1; j < (int) allowed_tokens[i].size(); ++j) {
               probs_id.back().first += probs[allowed_tokens[i][j]];
            }
-            probs_id.back().first /= allowed_tokens[i].size();
+        }
            psum += probs_id.back().first;
         }
-         // normalize
+        max_len = std::max(max_len, (int) cmd.size());
-         for (auto & p : probs_id) {
+    }
            p.first /= psum;
         }
-         // sort descending
+    fprintf(stderr, "%s: allowed commands [ tokens ]:\n", __func__);
-         {
+    fprintf(stderr, "\n");
-            using pair_type = decltype(probs_id)::value_type;
+    for (int i = 0; i < (int) allowed_commands.size(); ++i) {
-            std::sort(probs_id.begin(), probs_id.end(), [](const pair_type & a, const pair_type & b) {
+        fprintf(stderr, "  - \033[1m%-*s\033[0m = [", max_len, allowed_commands[i].c_str());
-               return a.first > b.first;
+        for (const auto & token : allowed_tokens[i]) {
-            });
+            fprintf(stderr, " %5d", token);
-         }
+        }
        fprintf(stderr, " ]\n");
    }
-         // print the commands and the respective probabilities
+    std::string  k_prompt = "select one from the available words: ";
-         {
+    for (int i = 0; i < (int) allowed_commands.size(); ++i) {
-            fprintf(stdout, "\n");
+        if (i > 0) {
-            for (const auto & cmd : probs_id) {
+            k_prompt += ", ";
-               fprintf(stdout, "%s: %s%-*s%s = %f | ", __func__, "\033[1m", max_len, allowed_commands[cmd.second].c_str(), "\033[0m", cmd.first);
+        }
-               for (int token : allowed_tokens[cmd.second]) {
+        k_prompt += allowed_commands[i];
-                  fprintf(stdout, "'%4s' %f ", whisper_token_to_str(ctx, token), probs[token]);
+    }
-               }
+    k_prompt += ". selected word: ";
-               fprintf(stdout, "\n");
+
    // tokenize prompt
    std::vector<whisper_token> k_tokens;
    {
        k_tokens.resize(1024);
        const int n = whisper_tokenize(ctx, k_prompt.c_str(), k_tokens.data(), 1024);
        if (n < 0) {
            fprintf(stderr, "%s: error: failed to tokenize prompt '%s'\n", __func__, k_prompt.c_str());
            return 4;
        }
        k_tokens.resize(n);
    }
    fprintf(stderr, "\n");
    fprintf(stderr, "%s: prompt: '%s'\n", __func__, k_prompt.c_str());
    fprintf(stderr, "%s: tokens: [", __func__);
    for (const auto & token : k_tokens) {
        fprintf(stderr, " %d", token);
    }
    fprintf(stderr, " ]\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "%s: listening for a command ...\n", __func__);
    fprintf(stderr, "\n");
    bool is_running  = true;
    std::vector<float> pcmf32_cur;
    std::vector<float> pcmf32_prompt;
    // main loop
    while (is_running) {
        // handle Ctrl + C
        is_running = process_sdl_events();
        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
        audio.get(2000, pcmf32_cur);
        if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
            fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
            const auto t_start = std::chrono::high_resolution_clock::now();
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
            wparams.print_progress   = false;
            wparams.print_special    = params.print_special;
            wparams.print_realtime   = false;
            wparams.print_timestamps = !params.no_timestamps;
            wparams.translate        = params.translate;
            wparams.no_context       = true;
            wparams.single_segment   = true;
            wparams.max_tokens       = 1;
            wparams.language         = params.language.c_str();
            wparams.n_threads        = params.n_threads;
            wparams.audio_ctx        = params.audio_ctx;
            wparams.speed_up         = params.speed_up;
            wparams.prompt_tokens    = k_tokens.data();
            wparams.prompt_n_tokens  = k_tokens.size();
            // run the transformer and a single decoding pass
            if (whisper_full(ctx, wparams, pcmf32_cur.data(), pcmf32_cur.size()) != 0) {
                fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
                break;
            }
         }
-         // best command
+            // estimate command probability
-         {
+            // NOTE: not optimal
-            const auto t_end = std::chrono::high_resolution_clock::now();
+            {
                const auto * logits = whisper_get_logits(ctx);
-            const float prob = probs_id[0].first;
+                std::vector<float> probs(whisper_n_vocab(ctx), 0.0f);
            const int index = probs_id[0].second;
                // compute probs from logits via softmax
                {
                    float max = -1e9;
                    for (int i = 0; i < (int) probs.size(); ++i) {
                        max = std::max(max, logits[i]);
                    }
                    float sum = 0.0f;
                    for (int i = 0; i < (int) probs.size(); ++i) {
                        probs[i] = expf(logits[i] - max);
                        sum += probs[i];
                    }
                    for (int i = 0; i < (int) probs.size(); ++i) {
                        probs[i] /= sum;
                    }
                }
                std::vector<std::pair<float, int>> probs_id;
                double psum = 0.0;
                for (int i = 0; i < (int) allowed_commands.size(); ++i) {
                    probs_id.emplace_back(probs[allowed_tokens[i][0]], i);
                    for (int j = 1; j < (int) allowed_tokens[i].size(); ++j) {
                        probs_id.back().first += probs[allowed_tokens[i][j]];
                    }
                    probs_id.back().first /= allowed_tokens[i].size();
                    psum += probs_id.back().first;
                }
                // normalize
                for (auto & p : probs_id) {
                    p.first /= psum;
                }
                // sort descending
                {
                    using pair_type = decltype(probs_id)::value_type;
                    std::sort(probs_id.begin(), probs_id.end(), [](const pair_type & a, const pair_type & b) {
                        return a.first > b.first;
                    });
                }
                // print the commands and the respective probabilities
                {
                    fprintf(stdout, "\n");
                    for (const auto & cmd : probs_id) {
                        fprintf(stdout, "%s: %s%-*s%s = %f | ", __func__, "\033[1m", max_len, allowed_commands[cmd.second].c_str(), "\033[0m", cmd.first);
                        for (int token : allowed_tokens[cmd.second]) {
                            fprintf(stdout, "'%4s' %f ", whisper_token_to_str(ctx, token), probs[token]);
                        }
                        fprintf(stdout, "\n");
                    }
                }
                // best command
                {
                    const auto t_end = std::chrono::high_resolution_clock::now();
                    const float prob = probs_id[0].first;
                    const int index = probs_id[0].second;
                    fprintf(stdout, "\n");
                    fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
                            "\033[1m", allowed_commands[index].c_str(), "\033[0m", prob,
                            (int) std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count());
                    fprintf(stdout, "\n");
                }
            }
            audio.clear();
        }
    }
    return 0;
 }
 // always-prompt mode
 // transcribe the voice into text after valid prompt
 //
 // Prints the configured prompt (params.prompt) once, then loops until
 // process_sdl_events() returns false. Each time vad_simple() fires on the
 // most recent audio, params.command_ms of audio is transcribed and the
 // transcript is split by word count: the first k_prompt_length words are
 // compared against the prompt, the remaining words form the command.
 // The command is printed only when similarity(prompt, k_prompt) > 0.7 and
 // the command string is non-empty. Returns 0 once the loop ends.
 //
 // NOTE(review): lines starting with '-' / '+' inside this function look like
 // diff markers from a patch view, interleaved with statements that reference
 // names not declared here (allowed_commands, index, t_end, t_start) - confirm
 // against the actual source file before relying on this listing.
 int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
    bool is_running = true;
    bool ask_prompt = true;
    float prob = 0.0f;
    std::vector<float> pcmf32_cur;
    const std::string k_prompt = params.prompt;
    // number of entries get_words() yields for the prompt; used below to decide
    // how many leading transcript words belong to the prompt
    const int k_prompt_length = get_words(k_prompt).size();
    fprintf(stderr, "\n");
    fprintf(stderr, "%s: always-prompt mode\n", __func__);
    // main loop
    while (is_running) {
        // handle Ctrl + C
        is_running = process_sdl_events();
        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
        // show the prompt once; ask_prompt is cleared afterwards
        if (ask_prompt) {
            fprintf(stdout, "\n");
-            fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
+            fprintf(stdout, "%s: The prompt is: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
                    "\033[1m", allowed_commands[index].c_str(), "\033[0m", prob,
                  (int) std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count());
            fprintf(stdout, "\n");
         }
-         audio.clear();
+            ask_prompt = false;
-      }
+        }
   }
-   return 0;
+        {
            // fetch recent audio (2000 - presumably milliseconds, like params.command_ms; TODO confirm)
            audio.get(2000, pcmf32_cur);
            // vad_simple() is presumably a voice-activity check; its semantics are not visible here
            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                int64_t t_ms = 0;
                // detect the commands
                audio.get(params.command_ms, pcmf32_cur);
                const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
                const auto words = get_words(txt);
                std::string prompt;
                std::string command;
                // the first k_prompt_length words go to `prompt`, the rest to `command`;
                // every word is followed by a single space
                for (int i = 0; i < (int) words.size(); ++i) {
                    if (i < k_prompt_length) {
                        prompt += words[i] + " ";
                    } else {
                        command += words[i] + " ";
                    }
                }
                const float sim = similarity(prompt, k_prompt);
                //debug
                //fprintf(stdout, "command size: %i\n", command_length);
                // report the command only when the heard prompt is close enough to the
                // configured one and something was said after it
                if ((sim > 0.7f) && (command.size() > 0)) {
                    fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
                }
                fprintf(stdout, "\n");
                // audio is cleared after every processed detection, recognized or not
                audio.clear();
            }
        }
    }
    return 0;
 }
 // general-purpose mode
 // freely transcribe the voice into text
 //
 // Two-phase loop: first the user must say the fixed activation phrase
 // k_prompt; once it is recognized its audio is saved in pcmf32_prompt and
 // prepended to every later recording, so each command is transcribed
 // together with the activation phrase and the phrase is then stripped from
 // the resulting text. Returns 0 once the loop ends.
 //
 // NOTE(review): this listing interleaves '-' (old) and '+' (new) diff lines,
 // and the old three-space-indented body appears again after the new one -
 // confirm against the actual source file before relying on it.
 int process_general_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
-   bool is_running  = true;
+    bool is_running  = true;
-   bool have_prompt = false;
+    bool have_prompt = false;
-   bool ask_prompt  = true;
+    bool ask_prompt  = true;
-   float prob0 = 0.0f;
+    float prob0 = 0.0f;
-   float prob  = 0.0f;
+    float prob  = 0.0f;
-   std::vector<float> pcmf32_cur;
+    std::vector<float> pcmf32_cur;
-   std::vector<float> pcmf32_prompt;
+    std::vector<float> pcmf32_prompt;
-   const std::string k_prompt = "Ok Whisper, start listening for commands.";
+    const std::string k_prompt = "Ok Whisper, start listening for commands.";
-   fprintf(stderr, "\n");
+    fprintf(stderr, "\n");
-   fprintf(stderr, "%s: general-purpose mode\n", __func__);
+    fprintf(stderr, "%s: general-purpose mode\n", __func__);
-   // main loop
+    // main loop
-   while (is_running) {
+    while (is_running) {
-      // handle Ctrl + C
+        // handle Ctrl + C
-      {
+        is_running = process_sdl_events();
-         SDL_Event event;
+
-         while (SDL_PollEvent(&event)) {
+        // delay
-            switch (event.type) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
-               case SDL_QUIT:
+
-               {
+        if (ask_prompt) {
-                  is_running = false;
+            fprintf(stdout, "\n");
-               } break;
+            fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
-               default:
+            fprintf(stdout, "\n");
-                  break;
+
            ask_prompt = false;
        }
        {
            // fetch recent audio (2000 - presumably milliseconds, like params.prompt_ms; TODO confirm)
            audio.get(2000, pcmf32_cur);
            // vad_simple() is presumably a voice-activity check; its semantics are not visible here
            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
                int64_t t_ms = 0;
                if (!have_prompt) {
                    // wait for activation phrase
                    audio.get(params.prompt_ms, pcmf32_cur);
                    const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob0, t_ms));
                    fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
                    const float sim = similarity(txt, k_prompt);
                    // reject when the transcript length is more than 20% off the activation
                    // phrase length, or when the similarity score is below 0.8
                    if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
                        fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
                        // show the instructions again on the next iteration
                        ask_prompt = true;
                    } else {
                        fprintf(stdout, "\n");
                        fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
                        fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
                        fprintf(stdout, "\n");
                        // save the audio for the prompt
                        pcmf32_prompt = pcmf32_cur;
                        have_prompt = true;
                    }
                } else {
                    // we have heard the activation phrase, now detect the commands
                    audio.get(params.command_ms, pcmf32_cur);
                    // prepend the prompt audio
                    pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
                    const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
                    // scaled difference to the activation-phrase value (prob0);
                    // the result is not read again in the visible code
                    prob = 100.0f*(prob - prob0);
                    //fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
                    // find the prompt in the text
                    // try every prefix length within +/-20% of the activation phrase length
                    // and keep the first one with the highest similarity to it
                    float best_sim = 0.0f;
                    size_t best_len = 0;
                    for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
                        const auto prompt = txt.substr(0, n);
                        const float sim = similarity(prompt, k_prompt);
                        //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
                        if (sim > best_sim) {
                            best_sim = sim;
                            best_len = n;
                        }
                    }
                    // everything after the best-matching prefix is treated as the command
                    // NOTE(review): std::string::substr(pos) throws std::out_of_range when
                    // pos > size(); best_len can exceed txt.size() if the transcript is
                    // shorter than 0.8*k_prompt.size() - confirm and clamp if needed
                    const std::string command = ::trim(txt.substr(best_len));
                    fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
                    fprintf(stdout, "\n");
                }
                audio.clear();
            }
-         }
+        }
    }
-         if (!is_running) {
+    return 0;
            return 0;
         }
      }
      // NOTE(review): from here on the listing appears to repeat the loop body in
      // the older three-space indentation - presumably the pre-change version
      // delay
      std::this_thread::sleep_for(std::chrono::milliseconds(100));
      if (ask_prompt) {
         fprintf(stdout, "\n");
         fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
         fprintf(stdout, "\n");
         ask_prompt = false;
      }
      {
         audio.get(2000, pcmf32_cur);
         if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
            fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
            int64_t t_ms = 0;
            if (!have_prompt) {
               // wait for activation phrase
               audio.get(params.prompt_ms, pcmf32_cur);
               const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob0, t_ms));
               fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
               const float sim = similarity(txt, k_prompt);
               if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
                  fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
                  ask_prompt = true;
               } else {
                  fprintf(stdout, "\n");
                  fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
                  fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
                  fprintf(stdout, "\n");
                  // save the audio for the prompt
                  pcmf32_prompt = pcmf32_cur;
                  have_prompt = true;
               }
            } else {
               // we have heard the activation phrase, now detect the commands
               audio.get(params.command_ms, pcmf32_cur);
               // prepend the prompt audio
               pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
               const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
               prob = 100.0f*(prob - prob0);
               //fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
               // find the prompt in the text
               float best_sim = 0.0f;
               size_t best_len = 0;
               for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
                  const auto prompt = txt.substr(0, n);
                  const float sim = similarity(prompt, k_prompt);
                  //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
                  if (sim > best_sim) {
                     best_sim = sim;
                     best_len = n;
                  }
               }
               const std::string command = ::trim(txt.substr(best_len));
               fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
               fprintf(stdout, "\n");
            }
            audio.clear();
         }
      }
   }
   return 0;
 }
 int main(int argc, char ** argv) {
@ -852,7 +956,7 @@ int main(int argc, char ** argv) {
    // whisper init
-    struct whisper_context * ctx = whisper_init(params.model.c_str());
+    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
    // print some info about the processing
    {
@ -891,9 +995,11 @@ int main(int argc, char ** argv) {
    int  ret_val = 0;
    if (!params.commands.empty()) {
-       ret_val = process_command_list(ctx, audio, params);
+        ret_val = process_command_list(ctx, audio, params);
    } else if (!params.prompt.empty()) {
        ret_val = always_prompt_transcription(ctx, audio, params);
    } else {
-       ret_val = process_general_transcription(ctx, audio, params);
+        ret_val = process_general_transcription(ctx, audio, params);
    }
    audio.pause();
--- a/examples/helpers.js
+++ b/examples/helpers.js
@ -8,7 +8,7 @@ function convertTypedArray(src, type) {
 var printTextarea = (function() {
    var element = document.getElementById('output');
-    if (element) element.alue = ''; // clear browser cache
+    if (element) element.value = ''; // clear browser cache
    return function(text) {
        if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
        console.log(text);
@ -88,11 +88,15 @@ async function fetchRemote(url, cbProgress, cbPrint) {
 // - check if the data is already in the IndexedDB
 // - if not, fetch it from the remote URL and store it in the IndexedDB
 function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
-    // query the storage quota and print it
+    if (!navigator.storage || !navigator.storage.estimate) {
-    navigator.storage.estimate().then(function (estimate) {
+        cbPrint('loadRemote: navigator.storage.estimate() is not supported');
-        cbPrint('loadRemote: storage quota: ' + estimate.quota + ' bytes');
+    } else {
-        cbPrint('loadRemote: storage usage: ' + estimate.usage + ' bytes');
+        // query the storage quota and print it
-    });
+        navigator.storage.estimate().then(function (estimate) {
            cbPrint('loadRemote: storage quota: ' + estimate.quota + ' bytes');
            cbPrint('loadRemote: storage usage: ' + estimate.usage + ' bytes');
        });
    }
    // check if the data is already in the IndexedDB
    var rq = indexedDB.open(dbName, dbVersion);
--- a/examples/livestream.sh
+++ b/examples/livestream.sh
@ -100,7 +100,7 @@ while [ $running -eq 1 ]; do
        err=$(cat /tmp/whisper-live.err | wc -l)
    done
-    ./main -t 8 -m ./models/ggml-base.en.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
+    ./main -t 8 -m ./models/ggml-${model}.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
    while [ $SECONDS -lt $((($i+1)*$step_s)) ]; do
        sleep 1
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -9,25 +9,35 @@ It can be used as a reference for using the `whisper.cpp` library in other proje
 usage: ./main [options] file0.wav file1.wav ...
 options:
-  -h,       --help          [default] show this help message and exit
+  -h,        --help              [default] show this help message and exit
-  -t N,     --threads N     [4      ] number of threads to use during computation
+  -t N,      --threads N         [4      ] number of threads to use during computation
-  -p N,     --processors N  [1      ] number of processors to use during computation
+  -p N,      --processors N      [1      ] number of processors to use during computation
-  -ot N,    --offset-t N    [0      ] time offset in milliseconds
+  -ot N,     --offset-t N        [0      ] time offset in milliseconds
-  -on N,    --offset-n N    [0      ] segment index offset
+  -on N,     --offset-n N        [0      ] segment index offset
-  -d  N,    --duration N    [0      ] duration of audio to process in milliseconds
+  -d  N,     --duration N        [0      ] duration of audio to process in milliseconds
-  -mc N,    --max-context N [-1     ] maximum number of text context tokens to store
+  -mc N,     --max-context N     [-1     ] maximum number of text context tokens to store
-  -ml N,    --max-len N     [0      ] maximum segment length in characters
+  -ml N,     --max-len N         [0      ] maximum segment length in characters
-  -wt N,    --word-thold N  [0.01   ] word timestamp probability threshold
+  -bo N,     --best-of N         [5      ] number of best candidates to keep
-  -su,      --speed-up      [false  ] speed up audio by x2 (reduced accuracy)
+  -bs N,     --beam-size N       [-1     ] beam size for beam search
-  -tr,      --translate     [false  ] translate from source language to english
+  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
-  -otxt,    --output-txt    [false  ] output result in a text file
+  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
-  -ovtt,    --output-vtt    [false  ] output result in a vtt file
+  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
-  -osrt,    --output-srt    [false  ] output result in a srt file
+  -su,       --speed-up          [false  ] speed up audio by x2 (reduced accuracy)
-  -owts,    --output-words  [false  ] output script for generating karaoke video
+  -tr,       --translate         [false  ] translate from source language to english
-  -ps,      --print-special [false  ] print special tokens
+  -di,       --diarize           [false  ] stereo audio diarization
-  -pc,      --print-colors  [false  ] print colors
+  -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
-  -nt,      --no-timestamps [true   ] do not print timestamps
+  -otxt,     --output-txt        [false  ] output result in a text file
-  -l LANG,  --language LANG [en     ] spoken language
+  -ovtt,     --output-vtt        [false  ] output result in a vtt file
-  -m FNAME, --model FNAME   [models/ggml-base.en.bin] model path
+  -osrt,     --output-srt        [false  ] output result in a srt file
-  -f FNAME, --file FNAME    [       ] input WAV file path
+  -owts,     --output-words      [false  ] output script for generating karaoke video
  -ocsv,     --output-csv        [false  ] output result in a CSV file
  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
  -ps,       --print-special     [false  ] print special tokens
  -pc,       --print-colors      [false  ] print colors
  -pp,       --print-progress    [false  ] print progress
  -nt,       --no-timestamps     [true   ] do not print timestamps
  -l LANG,   --language LANG     [en     ] spoken language ('auto' for auto-detect)
             --prompt PROMPT     [       ] initial prompt
  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
  -f FNAME,  --file FNAME        [       ] input WAV file path
 ```
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -53,18 +53,24 @@ void replace_all(std::string & s, const std::string & search, const std::string
 // command-line parameters
 struct whisper_params {
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_processors = 1;
+    int32_t n_processors =  1;
-    int32_t offset_t_ms  = 0;
+    int32_t offset_t_ms  =  0;
-    int32_t offset_n     = 0;
+    int32_t offset_n     =  0;
-    int32_t duration_ms  = 0;
+    int32_t duration_ms  =  0;
    int32_t max_context  = -1;
-    int32_t max_len      = 0;
+    int32_t max_len      =  0;
    int32_t best_of      =  5;
    int32_t beam_size    = -1;
-    float word_thold = 0.01f;
+    float word_thold    =  0.01f;
    float entropy_thold =  2.40f;
    float logprob_thold = -1.00f;
    bool speed_up       = false;
    bool translate      = false;
    bool diarize        = false;
    bool split_on_word  = false;
    bool no_fallback    = false;
    bool output_txt     = false;
    bool output_vtt     = false;
    bool output_srt     = false;
@ -80,6 +86,7 @@ struct whisper_params {
    std::string model    = "models/ggml-base.en.bin";
    std::vector<std::string> fname_inp = {};
    std::vector<std::string> fname_outp = {};
 };
 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -104,15 +111,22 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-d"    || arg == "--duration")       { params.duration_ms    = std::stoi(argv[++i]); }
        else if (arg == "-mc"   || arg == "--max-context")    { params.max_context    = std::stoi(argv[++i]); }
        else if (arg == "-ml"   || arg == "--max-len")        { params.max_len        = std::stoi(argv[++i]); }
        else if (arg == "-bo"   || arg == "--best-of")        { params.best_of        = std::stoi(argv[++i]); }
        else if (arg == "-bs"   || arg == "--beam-size")      { params.beam_size      = std::stoi(argv[++i]); }
        else if (arg == "-wt"   || arg == "--word-thold")     { params.word_thold     = std::stof(argv[++i]); }
        else if (arg == "-et"   || arg == "--entropy-thold")  { params.entropy_thold  = std::stof(argv[++i]); }
        else if (arg == "-lpt"  || arg == "--logprob-thold")  { params.logprob_thold  = std::stof(argv[++i]); }
        else if (arg == "-su"   || arg == "--speed-up")       { params.speed_up       = true; }
        else if (arg == "-tr"   || arg == "--translate")      { params.translate      = true; }
        else if (arg == "-di"   || arg == "--diarize")        { params.diarize        = true; }
        else if (arg == "-sow"  || arg == "--split-on-word")  { params.split_on_word  = true; }
        else if (arg == "-nf"   || arg == "--no-fallback")    { params.no_fallback    = true; }
        else if (arg == "-otxt" || arg == "--output-txt")     { params.output_txt     = true; }
        else if (arg == "-ovtt" || arg == "--output-vtt")     { params.output_vtt     = true; }
        else if (arg == "-osrt" || arg == "--output-srt")     { params.output_srt     = true; }
        else if (arg == "-owts" || arg == "--output-words")   { params.output_wts     = true; }
        else if (arg == "-ocsv" || arg == "--output-csv")     { params.output_csv     = true; }
        else if (arg == "-of"   || arg == "--output-file")    { params.fname_outp.emplace_back(argv[++i]); }
        else if (arg == "-ps"   || arg == "--print-special")  { params.print_special  = true; }
        else if (arg == "-pc"   || arg == "--print-colors")   { params.print_colors   = true; }
        else if (arg == "-pp"   || arg == "--print-progress") { params.print_progress = true; }
@ -136,31 +150,38 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,       --help           [default] show this help message and exit\n");
+    fprintf(stderr, "  -h,        --help              [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,     --threads N      [%-7d] number of threads to use during computation\n",    params.n_threads);
+    fprintf(stderr, "  -t N,      --threads N         [%-7d] number of threads to use during computation\n",    params.n_threads);
-    fprintf(stderr, "  -p N,     --processors N   [%-7d] number of processors to use during computation\n", params.n_processors);
+    fprintf(stderr, "  -p N,      --processors N      [%-7d] number of processors to use during computation\n", params.n_processors);
-    fprintf(stderr, "  -ot N,    --offset-t N     [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
+    fprintf(stderr, "  -ot N,     --offset-t N        [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
-    fprintf(stderr, "  -on N,    --offset-n N     [%-7d] segment index offset\n",                           params.offset_n);
+    fprintf(stderr, "  -on N,     --offset-n N        [%-7d] segment index offset\n",                           params.offset_n);
-    fprintf(stderr, "  -d  N,    --duration N     [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
+    fprintf(stderr, "  -d  N,     --duration N        [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
-    fprintf(stderr, "  -mc N,    --max-context N  [%-7d] maximum number of text context tokens to store\n", params.max_context);
+    fprintf(stderr, "  -mc N,     --max-context N     [%-7d] maximum number of text context tokens to store\n", params.max_context);
-    fprintf(stderr, "  -ml N,    --max-len N      [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -ml N,     --max-len N         [%-7d] maximum segment length in characters\n",           params.max_len);
-    fprintf(stderr, "  -wt N,    --word-thold N   [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
+    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
-    fprintf(stderr, "  -su,      --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
+    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
-    fprintf(stderr, "  -tr,      --translate      [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
+    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
-    fprintf(stderr, "  -di,      --diarize        [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
+    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
-    fprintf(stderr, "  -otxt,    --output-txt     [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
+    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
-    fprintf(stderr, "  -ovtt,    --output-vtt     [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
+    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
-    fprintf(stderr, "  -osrt,    --output-srt     [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
+    fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -owts,    --output-words   [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
+    fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
-    fprintf(stderr, "  -ocsv,    --output-csv     [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
+    fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
-    fprintf(stderr, "  -ps,      --print-special  [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
+    fprintf(stderr, "  -nf,       --no-fallback       [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
-    fprintf(stderr, "  -pc,      --print-colors   [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
+    fprintf(stderr, "  -otxt,     --output-txt        [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
-    fprintf(stderr, "  -pp,      --print-progress [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
+    fprintf(stderr, "  -ovtt,     --output-vtt        [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
-    fprintf(stderr, "  -nt,      --no-timestamps  [%-7s] do not print timestamps\n",                        params.no_timestamps ? "false" : "true");
+    fprintf(stderr, "  -osrt,     --output-srt        [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
-    fprintf(stderr, "  -l LANG,  --language LANG  [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
+    fprintf(stderr, "  -owts,     --output-words      [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
-    fprintf(stderr, "            --prompt PROMPT  [%-7s] initial prompt\n",                                 params.prompt.c_str());
+    fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
-    fprintf(stderr, "  -m FNAME, --model FNAME    [%-7s] model path\n",                                     params.model.c_str());
+    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
-    fprintf(stderr, "  -f FNAME, --file FNAME     [%-7s] input WAV file path\n",                            "");
+    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "false" : "true");
    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
    fprintf(stderr, "             --prompt PROMPT     [%-7s] initial prompt\n",                                 params.prompt.c_str());
    fprintf(stderr, "  -m FNAME,  --model FNAME       [%-7s] model path\n",                                     params.model.c_str());
    fprintf(stderr, "  -f FNAME,  --file FNAME        [%-7s] input WAV file path\n",                            "");
    fprintf(stderr, "\n");
 }
@ -235,7 +256,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
                const char * text = whisper_full_get_token_text(ctx, i, j);
                const float  p    = whisper_full_get_token_p   (ctx, i, j);
-                const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
+                const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
                printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
            }
@ -331,20 +352,16 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
 	if (text[0] == ' ')
 	  text = text + sizeof(char); //whisper_full_get_segment_text() returns a string with leading space, point to the next character.
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-	//need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
+
-        fout << 10 * t0 << ", " 
+        //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
-	     << 10 * t1 << ", \"" 
+        fout << 10 * t0 << ", " << 10 * t1 << ", \"" << text    << "\"\n";
 	     << text    << "\"\n";
    }
    return true;
 }
 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
@ -478,7 +495,7 @@ int main(int argc, char ** argv) {
    // whisper init
-    struct whisper_context * ctx = whisper_init(params.model.c_str());
+    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
    if (ctx == nullptr) {
        fprintf(stderr, "error: failed to initialize whisper context\n");
@ -503,6 +520,7 @@ int main(int argc, char ** argv) {
    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
 		const auto fname_outp = f < (int) params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
        std::vector<float> pcmf32; // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@ -620,6 +638,8 @@ int main(int argc, char ** argv) {
        {
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
            wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
            wparams.print_realtime   = false;
            wparams.print_progress   = params.print_progress;
            wparams.print_timestamps = !params.no_timestamps;
@ -634,11 +654,19 @@ int main(int argc, char ** argv) {
            wparams.token_timestamps = params.output_wts || params.max_len > 0;
            wparams.thold_pt         = params.word_thold;
            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
            wparams.split_on_word    = params.split_on_word;
            wparams.speed_up         = params.speed_up;
-            wparams.prompt_tokens    = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
+            wparams.prompt_tokens     = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
-            wparams.prompt_n_tokens  = prompt_tokens.empty() ? 0       : prompt_tokens.size();
+            wparams.prompt_n_tokens   = prompt_tokens.empty() ? 0       : prompt_tokens.size();
            wparams.greedy.best_of        = params.best_of;
            wparams.beam_search.beam_size = params.beam_size;
            wparams.temperature_inc  = params.no_fallback ? 0.0f : wparams.temperature_inc;
            wparams.entropy_thold    = params.entropy_thold;
            wparams.logprob_thold    = params.logprob_thold;
            whisper_print_user_data user_data = { &params, &pcmf32s };
@ -673,31 +701,31 @@ int main(int argc, char ** argv) {
            // output to text file
            if (params.output_txt) {
-                const auto fname_txt = fname_inp + ".txt";
+                const auto fname_txt = fname_outp + ".txt";
                output_txt(ctx, fname_txt.c_str());
            }
            // output to VTT file
            if (params.output_vtt) {
-                const auto fname_vtt = fname_inp + ".vtt";
+                const auto fname_vtt = fname_outp + ".vtt";
                output_vtt(ctx, fname_vtt.c_str());
            }
            // output to SRT file
            if (params.output_srt) {
-                const auto fname_srt = fname_inp + ".srt";
+                const auto fname_srt = fname_outp + ".srt";
                output_srt(ctx, fname_srt.c_str(), params);
            }
            // output to WTS file
            if (params.output_wts) {
-                const auto fname_wts = fname_inp + ".wts";
+                const auto fname_wts = fname_outp + ".wts";
                output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
            }
 	    // output to CSV file
            if (params.output_csv) {
-                const auto fname_csv = fname_inp + ".csv";
+                const auto fname_csv = fname_outp + ".csv";
                output_csv(ctx, fname_csv.c_str());
            }
--- a/examples/stream.wasm/emscripten.cpp
+++ b/examples/stream.wasm/emscripten.cpp
@ -49,6 +49,9 @@ void stream_main(size_t index) {
    wparams.max_tokens       = 32;
    wparams.audio_ctx        = 768; // partial encoder context for better performance
    // disable temperature fallback
    wparams.temperature_inc  = -1.0f;
    wparams.language         = "en";
    printf("stream: using %d threads\n", wparams.n_threads);
@ -129,7 +132,7 @@ EMSCRIPTEN_BINDINGS(stream) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init(path_model.c_str());
+                g_contexts[i] = whisper_init_from_file(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    g_running = true;
                    if (g_worker.joinable()) {
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -423,7 +423,8 @@ int main(int argc, char ** argv) {
        return 1;
    }
-    params.keep_ms = std::min(params.keep_ms, params.step_ms); // cannot be more than step_ms
+    params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
    params.length_ms = std::max(params.length_ms, params.step_ms);
    const int n_samples_step = (params.step_ms  *1e-3)*WHISPER_SAMPLE_RATE;
    const int n_samples_len  = (params.length_ms*1e-3)*WHISPER_SAMPLE_RATE;
@ -432,11 +433,11 @@ int main(int argc, char ** argv) {
    const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
-    const int n_new_line = !use_vad ? params.length_ms / params.step_ms - 1 : 1; // number of steps to print new line
+    const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
-    params.no_timestamps = !use_vad;
+    params.no_timestamps  = !use_vad;
-    params.no_context    = use_vad;
+    params.no_context    |= use_vad;
-    params.max_tokens    = 0;
+    params.max_tokens     = 0;
    // init audio
@ -456,10 +457,10 @@ int main(int argc, char ** argv) {
        exit(0);
    }
-    struct whisper_context * ctx = whisper_init(params.model.c_str());
+    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
    std::vector<float> pcmf32    (n_samples_30s, 0.0f);
-    std::vector<float> pcmf32_old(n_samples_30s, 0.0f);
+    std::vector<float> pcmf32_old;
    std::vector<float> pcmf32_new(n_samples_30s, 0.0f);
    std::vector<whisper_token> prompt_tokens;
@ -486,7 +487,7 @@ int main(int argc, char ** argv) {
                params.no_timestamps ? 0 : 1);
        if (!use_vad) {
-            fprintf(stderr, "%s: n_new_line = %d\n", __func__, n_new_line);
+            fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
        } else {
            fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
        }
@ -615,6 +616,9 @@ int main(int argc, char ** argv) {
            wparams.audio_ctx        = params.audio_ctx;
            wparams.speed_up         = params.speed_up;
            // disable temperature fallback
            wparams.temperature_inc  = -1.0f;
            wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
            wparams.prompt_n_tokens  = params.no_context ? 0       : prompt_tokens.size();
--- a/examples/talk.wasm/emscripten.cpp
+++ b/examples/talk.wasm/emscripten.cpp
@ -271,7 +271,7 @@ EMSCRIPTEN_BINDINGS(talk) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init(path_model.c_str());
+                g_contexts[i] = whisper_init_from_file(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    g_running = true;
                    if (g_worker.joinable()) {
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@ -498,7 +498,7 @@ int main(int argc, char ** argv) {
    // whisper init
-    struct whisper_context * ctx_wsp = whisper_init(params.model_wsp.c_str());
+    struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
    // gpt init
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
@ -64,16 +64,21 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
    private suspend fun copyAssets() = withContext(Dispatchers.IO) {
        modelsPath.mkdirs()
        samplesPath.mkdirs()
-        application.copyData("models", modelsPath, ::printMessage)
+        //application.copyData("models", modelsPath, ::printMessage)
        application.copyData("samples", samplesPath, ::printMessage)
        printMessage("All data copied to working directory.\n")
    }
    private suspend fun loadBaseModel() = withContext(Dispatchers.IO) {
        printMessage("Loading model...\n")
-        val firstModel = modelsPath.listFiles()!!.first()
+        val models = application.assets.list("models/")
-        whisperContext = WhisperContext.createContext(firstModel.absolutePath)
+        if (models != null) {
-        printMessage("Loaded model ${firstModel.name}.\n")
+            whisperContext = WhisperContext.createContextFromAsset(application.assets, "models/" + models[0])
            printMessage("Loaded model ${models[0]}.\n")
        }
        //val firstModel = modelsPath.listFiles()!!.first()
        //whisperContext = WhisperContext.createContextFromFile(firstModel.absolutePath)
    }
    fun transcribeSample() = viewModelScope.launch {
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
@ -1,9 +1,11 @@
 package com.whispercppdemo.whisper
 import android.content.res.AssetManager
 import android.os.Build
 import android.util.Log
 import kotlinx.coroutines.*
 import java.io.File
 import java.io.InputStream
 import java.util.concurrent.Executors
 private const val LOG_TAG = "LibWhisper"
@ -39,13 +41,31 @@ class WhisperContext private constructor(private var ptr: Long) {
    }
    companion object {
-        fun createContext(filePath: String): WhisperContext {
+        fun createContextFromFile(filePath: String): WhisperContext {
            // Ask the native library to build a context from the model file at filePath.
            val ptr = WhisperLib.initContext(filePath)
            // A zero handle is treated as "native initialisation failed".
            if (ptr == 0L) {
                throw java.lang.RuntimeException("Couldn't create context with path $filePath")
            }
            // Wrap the native handle so the rest of the app never touches the raw pointer.
            return WhisperContext(ptr)
        }
        /**
         * Builds a [WhisperContext] from model data supplied through [stream].
         *
         * @throws java.lang.RuntimeException when the native layer returns a zero handle.
         */
        fun createContextFromInputStream(stream: InputStream): WhisperContext {
            val handle = WhisperLib.initContextFromInputStream(stream)
            if (handle != 0L) {
                return WhisperContext(handle)
            }
            throw java.lang.RuntimeException("Couldn't create context from input stream")
        }
        /**
         * Builds a [WhisperContext] from the asset at [assetPath], resolved through [assetManager].
         *
         * @throws java.lang.RuntimeException when the native layer returns a zero handle.
         */
        fun createContextFromAsset(assetManager: AssetManager, assetPath: String): WhisperContext {
            val handle = WhisperLib.initContextFromAsset(assetManager, assetPath)
            if (handle != 0L) {
                return WhisperContext(handle)
            }
            throw java.lang.RuntimeException("Couldn't create context from asset $assetPath")
        }
    }
 }
@ -76,6 +96,8 @@ private class WhisperLib {
        }
        // JNI methods
        external fun initContextFromInputStream(inputStream: InputStream): Long
        external fun initContextFromAsset(assetManager: AssetManager, assetPath: String): Long
        external fun initContext(modelPath: String): Long
        external fun freeContext(contextPtr: Long)
        external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
--- a/examples/whisper.android/app/src/main/jni/whisper/Whisper.mk
+++ b/examples/whisper.android/app/src/main/jni/whisper/Whisper.mk
@ -1,5 +1,5 @@
 WHISPER_LIB_DIR := $(LOCAL_PATH)/../../../../../../../
-LOCAL_LDLIBS    := -llog
+LOCAL_LDLIBS    := -landroid -llog
 # Make the final output library smaller by only keeping the symbols referenced from the app.
 ifneq ($(APP_OPTIM),debug)
--- a/examples/whisper.android/app/src/main/jni/whisper/jni.c
+++ b/examples/whisper.android/app/src/main/jni/whisper/jni.c
@ -1,13 +1,17 @@
 #include <jni.h>
 #include <android/asset_manager.h>
 #include <android/asset_manager_jni.h>
 #include <android/log.h>
 #include <stdlib.h>
 #include <sys/sysinfo.h>
 #include <string.h>
 #include "whisper.h"
 #define UNUSED(x) (void)(x)
 #define TAG "JNI"
 #define LOGI(...) __android_log_print(ANDROID_LOG_INFO,     TAG, __VA_ARGS__)
 #define LOGW(...) __android_log_print(ANDROID_LOG_WARN,     TAG, __VA_ARGS__)
 static inline int min(int a, int b) {
    return (a < b) ? a : b;
@ -17,13 +21,132 @@ static inline int max(int a, int b) {
    return (a > b) ? a : b;
 }
 /*
  * State handed to the inputStream* loader callbacks through their void * ctx
  * argument; it bundles everything needed to call back into a Java InputStream.
  */
 struct input_stream_context {
    size_t offset;            // running total of bytes copied out by inputStreamRead
    JNIEnv * env;             // JNI environment used for every call into the stream
    jobject thiz;             // receiver of the native method; stored but not read by the callbacks in this file
    jobject input_stream;     // the Java stream object the callbacks invoke methods on
    jmethodID mid_available;  // method id looked up as "available" with signature "()I"
    jmethodID mid_read;       // method id looked up as "read" with signature "([BII)I"
 };
 size_t inputStreamRead(void * ctx, void * output, size_t read_size) {
    struct input_stream_context* is = (struct input_stream_context*)ctx;
    jint avail_size = (*is->env)->CallIntMethod(is->env, is->input_stream, is->mid_available);
    jint size_to_copy = read_size < avail_size ? (jint)read_size : avail_size;
    jbyteArray byte_array = (*is->env)->NewByteArray(is->env, size_to_copy);
    jint n_read = (*is->env)->CallIntMethod(is->env, is->input_stream, is->mid_read, byte_array, 0, size_to_copy);
    if (size_to_copy != read_size || size_to_copy != n_read) {
        LOGI("Insufficient Read: Req=%zu, ToCopy=%d, Available=%d", read_size, size_to_copy, n_read);
    }
    jbyte* byte_array_elements = (*is->env)->GetByteArrayElements(is->env, byte_array, NULL);
    memcpy(output, byte_array_elements, size_to_copy);
    (*is->env)->ReleaseByteArrayElements(is->env, byte_array, byte_array_elements, JNI_ABORT);
    (*is->env)->DeleteLocalRef(is->env, byte_array);
    is->offset += size_to_copy;
    return size_to_copy;
 }
 bool inputStreamEof(void * ctx) {
    struct input_stream_context* is = (struct input_stream_context*)ctx;
    jint result = (*is->env)->CallIntMethod(is->env, is->input_stream, is->mid_available);
    return result <= 0;
 }
 /*
  * Model-loader close callback for the InputStream-backed loader.
  * Intentionally does nothing: no resource is released on the native side here.
  */
 void inputStreamClose(void * ctx) {
    (void) ctx;
 }
 /*
  * JNI entry point: creates a whisper context by streaming the model out of a
  * Java InputStream.
  *
  * env          - JNI environment of the calling thread
  * thiz         - receiver of the native method (only stored in the callback context)
  * input_stream - object providing int available() and int read(byte[], int, int)
  *
  * Returns the context pointer as a jlong, or 0 when the stream is null, the
  * required methods cannot be resolved, or whisper_init() fails.
  */
 JNIEXPORT jlong JNICALL
 Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContextFromInputStream(
        JNIEnv *env, jobject thiz, jobject input_stream) {
    if (input_stream == NULL) {
        LOGW("initContextFromInputStream: input stream is null\n");
        return 0;
    }

    // Resolve the two stream methods once; the callbacks reuse the cached ids.
    jclass cls = (*env)->GetObjectClass(env, input_stream);
    jmethodID mid_available = (*env)->GetMethodID(env, cls, "available", "()I");
    // A failed lookup leaves an exception pending, so do not attempt the second one.
    jmethodID mid_read = mid_available == NULL
            ? NULL
            : (*env)->GetMethodID(env, cls, "read", "([BII)I");
    (*env)->DeleteLocalRef(env, cls);

    if (mid_available == NULL || mid_read == NULL) {
        LOGW("initContextFromInputStream: failed to resolve InputStream methods\n");
        return 0;
    }

    // NOTE(review): inp_ctx lives on this stack frame, so this relies on
    // whisper_init() being done with the loader before it returns - confirm.
    struct input_stream_context inp_ctx = {0};
    inp_ctx.offset = 0;
    inp_ctx.env = env;
    inp_ctx.thiz = thiz;
    inp_ctx.input_stream = input_stream;
    inp_ctx.mid_available = mid_available;
    inp_ctx.mid_read = mid_read;

    struct whisper_model_loader loader = {0};
    loader.context = &inp_ctx;
    loader.read = inputStreamRead;
    loader.eof = inputStreamEof;
    loader.close = inputStreamClose;

    struct whisper_context *context = whisper_init(&loader);
    return (jlong) context;
 }
 /*
  * Model-loader read callback backed by an Android AAsset.
  * Returns the number of bytes read into output; a negative AAsset_read()
  * result (error) is reported as 0 instead of wrapping to a huge size_t.
  */
 static size_t asset_read(void *ctx, void *output, size_t read_size) {
    const int n_read = AAsset_read((AAsset *) ctx, output, read_size);
    return n_read > 0 ? (size_t) n_read : 0;
 }
 static bool asset_is_eof(void *ctx) {
    return AAsset_getRemainingLength64((AAsset *) ctx) <= 0;
 }
 static void asset_close(void *ctx) {
    AAsset_close((AAsset *) ctx);
 }
 /*
  * Creates a whisper context from a model stored as an Android asset.
  *
  * env          - JNI environment of the calling thread
  * assetManager - Java AssetManager object
  * asset_path   - path of the model inside the assets
  *
  * Returns the new context, or NULL when the native asset manager cannot be
  * obtained, the asset cannot be opened, or whisper_init() fails.
  */
 static struct whisper_context *whisper_init_from_asset(
        JNIEnv *env,
        jobject assetManager,
        const char *asset_path
 ) {
    LOGI("Loading model from asset '%s'\n", asset_path);
    AAssetManager *asset_manager = AAssetManager_fromJava(env, assetManager);
    if (!asset_manager) {
        // Guard against handing a NULL manager to AAssetManager_open below.
        LOGW("Failed to get asset manager for '%s'\n", asset_path);
        return NULL;
    }
    AAsset *asset = AAssetManager_open(asset_manager, asset_path, AASSET_MODE_STREAMING);
    if (!asset) {
        LOGW("Failed to open '%s'\n", asset_path);
        return NULL;
    }
    // The asset is released through the loader's close callback (asset_close).
    // NOTE(review): whether whisper_init() invokes close on every path,
    // including failure, is not visible here - confirm.
    whisper_model_loader loader = {
            .context = asset,
            .read = &asset_read,
            .eof = &asset_is_eof,
            .close = &asset_close
    };
    return whisper_init(&loader);
 }
 /*
  * JNI entry point: creates a whisper context from a model stored as an asset.
  *
  * env            - JNI environment of the calling thread
  * thiz           - receiver of the native method (unused)
  * assetManager   - Java AssetManager object
  * asset_path_str - path of the model inside the assets
  *
  * Returns the context pointer as a jlong, or 0 on failure.
  */
 JNIEXPORT jlong JNICALL
 Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContextFromAsset(
        JNIEnv *env, jobject thiz, jobject assetManager, jstring asset_path_str) {
    UNUSED(thiz);
    if (asset_path_str == NULL) {
        return 0;
    }
    const char *asset_path_chars = (*env)->GetStringUTFChars(env, asset_path_str, NULL);
    if (asset_path_chars == NULL) {
        // GetStringUTFChars could not provide the characters; nothing to release
        return 0;
    }
    struct whisper_context *context = whisper_init_from_asset(env, assetManager, asset_path_chars);
    (*env)->ReleaseStringUTFChars(env, asset_path_str, asset_path_chars);
    return (jlong) context;
 }
 /*
  * JNI entry point: creates a whisper context from a model file path.
  * The Java string is converted to UTF chars for the duration of the init
  * call and released afterwards; the context pointer is returned as a jlong.
  */
 JNIEXPORT jlong JNICALL
 Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContext(
        JNIEnv *env, jobject thiz, jstring model_path_str) {
    UNUSED(thiz);
    struct whisper_context *context = NULL;
    const char *model_path_chars = (*env)->GetStringUTFChars(env, model_path_str, NULL);
-    context = whisper_init(model_path_chars);
+    context = whisper_init_from_file(model_path_chars);
    (*env)->ReleaseStringUTFChars(env, model_path_str, model_path_chars);
    return (jlong) context;
 }
--- a/examples/whisper.android/local.properties
+++ b/examples/whisper.android/local.properties
@ -1,10 +0,0 @@
 ## This file is automatically generated by Android Studio.
 # Do not modify this file -- YOUR CHANGES WILL BE ERASED!
 #
 # This file should *NOT* be checked into Version Control Systems,
 # as it contains information specific to your local configuration.
 #
 # Location of the SDK. This is only used by Gradle.
 # For customization when using a Version Control System, please read the
 # header note.
 sdk.dir=/Users/kevin/Library/Android/sdk
--- a/examples/whisper.objc/whisper.objc/ViewController.m
+++ b/examples/whisper.objc/whisper.objc/ViewController.m
@ -61,7 +61,7 @@ void AudioInputCallback(void * inUserData,
        NSLog(@"Loading model from %@", modelPath);
        // create ggml context
-        stateInp.ctx = whisper_init([modelPath UTF8String]);
+        stateInp.ctx = whisper_init_from_file([modelPath UTF8String]);
        // check if the model was loaded successfully
        if (stateInp.ctx == NULL) {
--- a/examples/whisper.swiftui/README.md
+++ b/examples/whisper.swiftui/README.md
@ -10,3 +10,5 @@ To use:
 5. Select the "release" build configuration under "Run", then deploy and run to your device.
 [^1]: I recommend the tiny, base or small models for running on an iOS device.
 ![image](https://user-images.githubusercontent.com/1991296/212539216-0aef65e4-f882-480a-8358-0f816838fd52.png)
--- a/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift
+++ b/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift
@ -55,7 +55,7 @@ actor WhisperContext {
    }
    static func createContext(path: String) throws -> WhisperContext {
-        let context = whisper_init(path)
+        let context = whisper_init_from_file(path)
        if let context {
            return WhisperContext(context: context)
        } else {
--- a/examples/whisper.swiftui/whisper.swiftui.demo/Resources/models/.gitignore
+++ b/examples/whisper.swiftui/whisper.swiftui.demo/Resources/models/.gitignore
--- a/examples/whisper.swiftui/whisper.swiftui.demo/Resources/samples/.gitignore
+++ b/examples/whisper.swiftui/whisper.swiftui.demo/Resources/samples/.gitignore
--- a/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
+++ b/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
@ -35,10 +35,10 @@
 		0AAC5DA029539CD0003032C3 /* WhisperCppDemo.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = WhisperCppDemo.entitlements; sourceTree = "<group>"; };
 		0AAC5DA229539CD0003032C3 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
 		0AAC5DC629539EAF003032C3 /* WhisperCppDemo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "WhisperCppDemo-Bridging-Header.h"; sourceTree = "<group>"; };
-		0AAC5DC729539EB0003032C3 /* whisper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = whisper.cpp; path = ../../../whisper.cpp; sourceTree = "<group>"; };
+		0AAC5DC729539EB0003032C3 /* whisper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whisper.cpp; sourceTree = "<group>"; };
-		0AAC5DC829539EB0003032C3 /* whisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = whisper.h; path = ../../../whisper.h; sourceTree = "<group>"; };
+		0AAC5DC829539EB0003032C3 /* whisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whisper.h; sourceTree = "<group>"; };
-		0AAC5DC929539EB0003032C3 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../../ggml.c; sourceTree = "<group>"; };
+		0AAC5DC929539EB0003032C3 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = ggml.c; sourceTree = "<group>"; };
-		0AAC5DCA29539EB0003032C3 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../../ggml.h; sourceTree = "<group>"; };
+		0AAC5DCA29539EB0003032C3 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ggml.h; sourceTree = "<group>"; };
 		0AAC5DCD2953A05C003032C3 /* WhisperState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WhisperState.swift; sourceTree = "<group>"; };
 		0AAC5DD02953A394003032C3 /* LibWhisper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibWhisper.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */
@ -129,7 +129,8 @@
 				0AAC5DC729539EB0003032C3 /* whisper.cpp */,
 				0AAC5DC829539EB0003032C3 /* whisper.h */,
 			);
-			path = whisper.cpp;
+			name = whisper.cpp;
 			path = ../..;
 			sourceTree = "<group>";
 		};
 		0AAC5DCF2953A36C003032C3 /* whisper.cpp.swift */ = {
--- a/examples/whisper.wasm/CMakeLists.txt
+++ b/examples/whisper.wasm/CMakeLists.txt
@ -32,8 +32,8 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1024MB \
+    -s INITIAL_MEMORY=1500MB \
-    -s TOTAL_MEMORY=1024MB \
+    -s TOTAL_MEMORY=1500MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
--- a/examples/whisper.wasm/emscripten.cpp
+++ b/examples/whisper.wasm/emscripten.cpp
@ -18,7 +18,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init(path_model.c_str());
+                g_contexts[i] = whisper_init_from_file(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    return i + 1;
                } else {
--- a/examples/whisper.wasm/index-tmpl.html
+++ b/examples/whisper.wasm/index-tmpl.html
@ -46,10 +46,12 @@
            <div id="model">
                Whisper model: <span id="model-whisper-status"></span>
-                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
+                <button id="fetch-whisper-tiny-en"  onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
-                <button id="fetch-whisper-tiny"    onclick="loadWhisper('tiny')">tiny (75 MB)</button>
+                <button id="fetch-whisper-tiny"     onclick="loadWhisper('tiny')">tiny (75 MB)</button>
-                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+                <button id="fetch-whisper-base-en"  onclick="loadWhisper('base.en')">base.en (142 MB)</button>
-                <button id="fetch-whisper-base"    onclick="loadWhisper('base')">base (142 MB)</button>
+                <button id="fetch-whisper-base"     onclick="loadWhisper('base')">base (142 MB)</button>
                <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
                <button id="fetch-whisper-small"    onclick="loadWhisper('small')">small (466 MB)</button>
                <span id="fetch-whisper-progress"></span>
                <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
@ -60,8 +62,8 @@
            <!-- radio button to select between file upload or microphone -->
            <div id="input">
                Input:
-                <input type="radio" id="file" name="input" value="file" checked="checked" onchange="changeInput('file')" /> File
+                <input type="radio" id="file" name="input" value="file" checked="checked" onchange="changeInput('file')" /> <label for="file">File</label>
-                <input type="radio" id="mic" name="input" value="mic" onchange="changeInput('mic')" /> Microphone
+                <input type="radio" id="mic" name="input" value="mic" onchange="changeInput('mic')" /> <label for="mic">Microphone</label>
            </div>
            <br>
@ -284,27 +286,33 @@
                }
                reader.readAsArrayBuffer(file);
-                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
+                document.getElementById('fetch-whisper-tiny-en' ).style.display = 'none';
-                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+                document.getElementById('fetch-whisper-base-en' ).style.display = 'none';
-                document.getElementById('fetch-whisper-tiny'   ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-en').style.display = 'none';
-                document.getElementById('fetch-whisper-base'   ).style.display = 'none';
+                document.getElementById('fetch-whisper-tiny'    ).style.display = 'none';
-                document.getElementById('whisper-file'         ).style.display = 'none';
+                document.getElementById('fetch-whisper-base'    ).style.display = 'none';
-                document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
+                document.getElementById('fetch-whisper-small'   ).style.display = 'none';
                document.getElementById('whisper-file'          ).style.display = 'none';
                document.getElementById('model-whisper-status'  ).innerHTML = 'loaded model: ' + file.name;
            }
            function loadWhisper(model) {
                let urls = {
-                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
+                    'tiny.en':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
-                    'tiny':    'https://whisper.ggerganov.com/ggml-model-whisper-tiny.bin',
+                    'tiny':     'https://whisper.ggerganov.com/ggml-model-whisper-tiny.bin',
-                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+                    'base.en':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
-                    'base':    'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
+                    'base':     'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
                    'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
                    'small':    'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
                };
                let sizes = {
-                    'tiny.en': 75,
+                    'tiny.en':  75,
-                    'tiny':    75,
+                    'tiny':     75,
-                    'base.en': 142,
+                    'base.en':  142,
-                    'base':    142,
+                    'base':     142,
                    'small.en': 466,
                    'small':    466,
                };
                let url     = urls[model];
@ -313,12 +321,14 @@
                model_whisper = model;
-                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
+                document.getElementById('fetch-whisper-tiny-en' ).style.display = 'none';
-                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+                document.getElementById('fetch-whisper-base-en' ).style.display = 'none';
-                document.getElementById('fetch-whisper-tiny'   ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-en').style.display = 'none';
-                document.getElementById('fetch-whisper-base'   ).style.display = 'none';
+                document.getElementById('fetch-whisper-tiny'    ).style.display = 'none';
-                document.getElementById('whisper-file'         ).style.display = 'none';
+                document.getElementById('fetch-whisper-base'    ).style.display = 'none';
-                document.getElementById('model-whisper-status' ).innerHTML = 'loading model: ' + model;
+                document.getElementById('fetch-whisper-small'   ).style.display = 'none';
                document.getElementById('whisper-file'          ).style.display = 'none';
                document.getElementById('model-whisper-status'  ).innerHTML = 'loading model: ' + model;
                cbProgress = function(p) {
                    let el = document.getElementById('fetch-whisper-progress');
@ -327,12 +337,14 @@
                cbCancel = function() {
                    var el;
-                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-tiny'   ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-base'   ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-tiny'    ); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('whisper-file'         ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base'    ); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('model-whisper-status' ); if (el) el.innerHTML = '';
+                    el = document.getElementById('fetch-whisper-small'   ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('whisper-file'          ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('model-whisper-status'  ); if (el) el.innerHTML = '';
                };
                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
--- a/examples/yt-wsp.sh
+++ b/examples/yt-wsp.sh
@ -1,20 +1,10 @@
 #!/usr/bin/env bash
-
+# shellcheck disable=2086
 # Small shell script to more easily automatically download and transcribe live stream VODs.
 # This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
 # Use `./examples/yt-wsp.sh help` to print help info.
 #
 # Sample usage:
 #
 #   git clone https://github.com/ggerganov/whisper.cpp
 #   cd whisper.cpp
 #   make
 #   ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890
 #
 # MIT License
 # Copyright (c) 2022 Daniils Petrovs
 # Copyright (c) 2023 Jennifer Capasso
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@ -34,114 +24,181 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 # Small shell script to more easily automatically download and transcribe live stream VODs.
 # This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
 # Use `./examples/yt-wsp.sh help` to print help info.
 #
 # Sample usage:
 #
 #   git clone https://github.com/ggerganov/whisper.cpp
 #   cd whisper.cpp
 #   make
 #   ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890
 #
 set -Eeuo pipefail
-# You can find how to download models in the OG repo: https://github.com/ggerganov/whisper.cpp/#usage
+# get script file location
-MODEL_PATH="${MODEL_PATH:-models/ggml-base.en.bin}" # Set to a multilingual model if you want to translate from foreign lang to en
+SCRIPT_PATH="$(realpath -e ${BASH_SOURCE[0]})";
-WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-whisper}" # Where to find the whisper.cpp executable
+SCRIPT_DIR="${SCRIPT_PATH%/*}"
-WHISPER_LANG="${WHISPER_LANG:-en}" # Set to desired lang to translate from
+
 ################################################################################
 # Documentation on downloading models can be found in the whisper.cpp repo:
 # https://github.com/ggerganov/whisper.cpp/#usage
 #
 # note: unless a multilingual model is specified, WHISPER_LANG will be ignored
 # and the video will be transcribed as if the audio were in the English language
 ################################################################################
 MODEL_PATH="${MODEL_PATH:-${SCRIPT_DIR}/../models/ggml-base.en.bin}"
 ################################################################################
 # Where to find the whisper.cpp executable.  default to the examples directory
 # which holds this script in source control
 ################################################################################
 WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-${SCRIPT_DIR}/../main}";
 # Set to desired language to be translated into English
 WHISPER_LANG="${WHISPER_LANG:-en}";
 # Default to 4 threads (this was most performant on my 2020 M1 MBP)
 WHISPER_THREAD_COUNT="${WHISPER_THREAD_COUNT:-4}";
 # Write the first argument (an empty string when none is given) to stderr,
 # letting `echo -e` interpret any backslash escapes it contains.
 msg() {
    local text="${1-}"
    echo -e "${text}" 1>&2
 }
 ################################################################################
 # create a temporary directory to work in
 # set the temp_dir and temp_filename variables
 ################################################################################
 temp_dir="$(mktemp -d ${SCRIPT_DIR}/tmp.XXXXXX)";
 temp_filename="${temp_dir}/yt-dlp-filename";
 ################################################################################
 # for now we only take one argument
 # TODO: a for loop
 ################################################################################
 # "${1-}" expands to an empty string when no argument was given, so `set -u`
 # does not abort here before the argument-count check can print the usage text.
 source_url="${1-}"
 title_name="";
 # Remove the working directory given as the first argument.
 # If that path is not an existing directory, report it through msg and exit
 # the script with status 1 instead of running rm.
 cleanup() {
-    msg "Cleaning up..."
+    local -r clean_me="${1}";
-    rm -rf "${temp_dir}" "vod-resampled.wav" "vod-resampled.wav.srt"
+
    if [ -d "${clean_me}" ]; then
      msg "Cleaning up...";
      rm -rf "${clean_me}";
    else
      msg "'${clean_me}' does not appear to be a directory!";
      exit 1;
    fi;
 }
 # Print the usage banner to stdout: invocation syntax, how the output file is
 # named, and the external tools the script needs.
 print_help() {
    echo "################################################################################"
    echo "Usage: ./examples/yt-wsp.sh <video_url>"
-    echo "See configurable env variables in the script"
+    echo "# See configurable env variables in the script; there are many!"
-    echo "This will produce an MP4 muxed file called res.mp4 in the working directory"
+    echo "# This script will produce an MP4 muxed file in the working directory; it will"
-    echo "Requirements: ffmpeg yt-dlp whisper"
+    echo "# be named for the title and id of the video."
-    echo "Whisper needs to be built into the main binary with make, then you can rename it to something like 'whisper' and add it to your PATH for convenience."
+    echo "# passing in https://youtu.be/VYJtb2YXae8 produces a file named";
-    echo "E.g. in the root of Whisper.cpp, run: 'make && cp ./main /usr/local/bin/whisper'"
+    echo "# 'Why_we_all_need_subtitles_now-VYJtb2YXae8-res.mp4'"
    echo "# Requirements: ffmpeg yt-dlp whisper.cpp"
    echo "################################################################################"
 }
 # Verify that ffmpeg, yt-dlp and the executable named by WHISPER_EXECUTABLE can
 # be resolved with `command -v`. On the first one that is missing, print a hint
 # to stdout and exit the script with status 1.
 # NOTE(review): the trailing check that falls back to "./main" looks like the
 # pre-change version of the whisper check shown without a diff marker; as
 # written it can only be reached after the preceding whisper check has already
 # succeeded - confirm which of the two is meant to remain.
 check_requirements() {
    if ! command -v ffmpeg &>/dev/null; then
-        echo "ffmpeg is required (https://ffmpeg.org)."
+        echo "ffmpeg is required: https://ffmpeg.org";
        exit 1
-    fi
+    fi;
    if ! command -v yt-dlp &>/dev/null; then
-        echo "yt-dlp is required (https://github.com/yt-dlp/yt-dlp)."
+        echo "yt-dlp is required: https://github.com/yt-dlp/yt-dlp";
-        exit 1
+        exit 1;
-    fi
+    fi;
    if ! command -v "${WHISPER_EXECUTABLE}" &>/dev/null; then
        echo "The C++ implementation of Whisper is required: https://github.com/ggerganov/whisper.cpp"
        echo "Sample usage:";
        echo "";
        echo "  git clone https://github.com/ggerganov/whisper.cpp";
        echo "  cd whisper.cpp";
        echo "  make";
        echo "  ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890";
        echo "";
        exit 1;
    fi;
    if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
        WHISPER_EXECUTABLE="./main"
        if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
            echo "Whisper is required (https://github.com/ggerganov/whisper.cpp):"
            echo "Sample usage:"
            echo ""
            echo "  git clone https://github.com/ggerganov/whisper.cpp"
            echo "  cd whisper.cpp"
            echo "  make"
            echo "  ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890"
            echo ""
            exit 1
        fi
    fi
 }
-if [[ $# -lt 1 ]]; then
+if [[ "${#}" -lt 1 ]]; then
-    print_help
+    print_help;
-    exit 1
+    exit 1;
 fi
-if [[ "$1" == "help" ]]; then
+# Accept "help", "-help" and "--help". The forms are compared explicitly because
+# an expansion such as "${1##-*}" deletes the whole argument whenever it starts
+# with a dash, which would leave nothing to compare against "help".
+if [[ "${1}" == "help" || "${1}" == "-help" || "${1}" == "--help" ]]; then
-    print_help
+    print_help;
-    exit 0
+    exit 0;
 fi
-temp_dir="tmp"
+check_requirements;
 source_url="$1"
-check_requirements
+msg "Downloading VOD...";
-msg "Downloading VOD..."
+################################################################################
-
+# Download the video, put the dynamic output filename into a variable.
-# Optionally add --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER] for members only VODs
+# Optionally add --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER]
 # for videos only available to logged-in users.
 ################################################################################
 yt-dlp \
    -f "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" \
    -o "${temp_dir}/%(title)s-%(id)s.vod.mp4" \
    --print-to-file "%(filename)s" "${temp_filename}" \
    --no-simulate \
    --no-write-auto-subs \
    --restrict-filenames \
    --embed-thumbnail \
    --embed-chapters \
    --xattrs \
-    "${source_url}" -o "${temp_dir}/vod.mp4"
+    "${source_url}";
-msg "Extracting audio and resampling..."
+title_name="$(xargs basename -s .vod.mp4 < ${temp_filename})";
-ffmpeg -i "${temp_dir}/vod.mp4" \
+msg "Extracting audio and resampling...";
 ffmpeg -i "${temp_dir}/${title_name}.vod.mp4"  \
    -hide_banner \
    -vn \
    -loglevel error \
    -ar 16000 \
    -ac 1 \
-    -c:a \
+    -c:a pcm_s16le \
-    pcm_s16le -y "vod-resampled.wav"
+    -y \
    "${temp_dir}/${title_name}.vod-resampled.wav";
-msg "Transcribing to subtitle file..."
+msg "Transcribing to subtitle file...";
-msg "Whisper specified at: ${WHISPER_EXECUTABLE}"
+msg "Whisper specified at: '${WHISPER_EXECUTABLE}'";
-$WHISPER_EXECUTABLE \
+"${WHISPER_EXECUTABLE}" \
    -m "${MODEL_PATH}" \
    -l "${WHISPER_LANG}" \
-    -f "vod-resampled.wav" \
+    -f "${temp_dir}/${title_name}.vod-resampled.wav" \
-    -t 8 \
+    -t "${WHISPER_THREAD_COUNT}" \
    -osrt \
-    --translate
+    --translate;
-msg "Embedding subtitle track..."
+msg "Embedding subtitle track...";
-ffmpeg -i "${temp_dir}/vod.mp4" \
+ffmpeg -i "${temp_dir}/${title_name}.vod.mp4" \
    -hide_banner \
    -loglevel error \
-    -i "vod-resampled.wav.srt" \
+    -i "${temp_dir}/${title_name}.vod-resampled.wav.srt" \
    -c copy \
    -c:s mov_text \
-    -y res.mp4
+    -y "${title_name}-res.mp4";
-cleanup
+cleanup "${temp_dir}";
-msg "Done! Your finished file is ready: res.mp4"
+msg "Done! Your finished file is ready: ${title_name}-res.mp4";
--- a/extra/bench-all.sh
+++ b/extra/bench-all.sh
@ -12,6 +12,18 @@ fi
 models=( "tiny" "base" "small" "medium" "large" )
 printf "\n"
 printf "Running memcpy benchmark with 1 thread\n"
 printf "\n"
 ./bench -w 1 -t 1 2>&1
 printf "\n"
 printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
 printf "\n"
 ./bench -w 2 -t $n_threads 2>&1
 printf "\n"
 printf "Running benchmark for all models\n"
 printf "This can take a while!\n"
@ -56,4 +68,3 @@ for model in "${models[@]}"; do
    printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
 done
--- a/ggml.c
+++ b/ggml.c
@ -84,7 +84,7 @@ typedef void* thread_ret_t;
 #define GGML_GELU_FP16
 #define GGML_SOFT_MAX_UNROLL 4
-#define GGML_VEC_DOT_UNROLL  4
+#define GGML_VEC_DOT_UNROLL  2
 #ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
@ -339,8 +339,12 @@ int64_t ggml_cycles_per_ms(void) {
 // Select CACHE_LINE_SIZE: use hardware_destructive_interference_size when the
 // __cpp_lib_hardware_interference_size feature macro is defined; otherwise
 // fall back to 128 when __POWER9_VECTOR__ is defined and to 64 in all other cases.
 #if defined(__cpp_lib_hardware_interference_size)
 #define CACHE_LINE_SIZE hardware_destructive_interference_size
 #else
 #if defined(__POWER9_VECTOR__)
 #define CACHE_LINE_SIZE 128
 #else
 #define CACHE_LINE_SIZE 64
 #endif
 #endif
 // CACHE_LINE_SIZE expressed as a count of floats rather than bytes
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
@ -609,9 +613,12 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
  vec_extract_fp32_from_shortl(vec_xl(0, p))
-#define GGML_F16_VEC_STORE(p, r, i)                                      \
+#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
-  if (i & 0x1)                                                           \
+#define GGML_F16_VEC_STORE(p, r, i)                             \
-    vec_xst(vec_pack_to_short_fp32(r[i], r[i - 1]), 0, p - GGML_F16_EPR)
+  if (i & 0x1)                                                  \
    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
                                   r[i - GGML_ENDIAN_BYTE(0)]), \
            0, p - GGML_F16_EPR)
 #elif defined(__wasm_simd128__)
@ -923,9 +930,9 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
 inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
-    const ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL] = { xv };
+    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
-    for (int i = 1; i < GGML_VEC_DOT_UNROLL; ++i) {
+    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
    }
@ -1109,8 +1116,8 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += x[i];
        *s += sum;
    }
    *s = sum;
 #else
    vDSP_sve(x, 1, s, n);
 #endif
@ -1251,7 +1258,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 //
 struct ggml_object {
-    size_t offset;
+    size_t offs;
    size_t size;
    struct ggml_object * next;
@ -1277,6 +1284,9 @@ struct ggml_context {
    struct ggml_object * objects_begin;
    struct ggml_object * objects_end;
    struct ggml_scratch scratch;
    struct ggml_scratch scratch_save;
 };
 struct ggml_context_container {
@ -1339,7 +1349,7 @@ inline static void ggml_critical_section_end(void) {
 // Print one ggml_object header through GGML_PRINT: its offset, its size and
 // the address of the next object in the list.
 void ggml_print_object(const struct ggml_object * obj) {
    GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
-            obj->offset, obj->size, (const void *) obj->next);
+            obj->offs, obj->size, (const void *) obj->next);
 }
 void ggml_print_objects(const struct ggml_context * ctx) {
@ -1535,12 +1545,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
    }
    *ctx = (struct ggml_context) {
-        .mem_size         = params.mem_size,
+        /*.mem_size         =*/ params.mem_size,
-        .mem_buffer       = params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
+        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
-        .mem_buffer_owned = params.mem_buffer ? false : true,
+        /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
-        .n_objects        = 0,
+        /*.n_objects        =*/ 0,
-        .objects_begin    = NULL,
+        /*.objects_begin    =*/ NULL,
-        .objects_end      = NULL,
+        /*.objects_end      =*/ NULL,
        /*.scratch          =*/ { 0, 0, NULL, },
        /*.scratch_save     =*/ { 0, 0, NULL, },
    };
    ggml_assert_aligned(ctx->mem_buffer);
@ -1563,7 +1575,7 @@ void ggml_free(struct ggml_context * ctx) {
            g_state.contexts[i].used = false;
            GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
-                    __func__, i, ctx->n_objects, ctx->objects_end->offset + ctx->objects_end->size);
+                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
            if (ctx->mem_buffer_owned) {
                free(ctx->mem_buffer);
@ -1582,7 +1594,15 @@ void ggml_free(struct ggml_context * ctx) {
 }
 // Return the end position (offset + size) of the last object in the context's
 // object list, i.e. how far into the context memory pool objects extend.
 // objects_end is NULL until the first object has been created, so report 0 in
 // that case instead of dereferencing it.
 size_t ggml_used_mem(const struct ggml_context * ctx) {
-    return ctx->objects_end->offset + ctx->objects_end->size;
+    return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
 }
 // Install `scratch` as the context's active scratch buffer.
 // Returns the offset stored in the scratch buffer that was active before the
 // call, or 0 when that buffer had no data pointer.
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
    size_t prev_offs = 0;
    if (ctx->scratch.data) {
        prev_offs = ctx->scratch.offs;
    }

    ctx->scratch = scratch;

    return prev_offs;
 }
 ////////////////////////////////////////////////////////////////////////////////
@ -1596,9 +1616,9 @@ struct ggml_tensor * ggml_new_tensor_impl(
    // always insert objects at the end of the context's memory pool
    struct ggml_object * obj_cur = ctx->objects_end;
-    const size_t cur_offset = obj_cur == NULL ? 0 : obj_cur->offset;
+    const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
-    const size_t cur_size   = obj_cur == NULL ? 0 : obj_cur->size;
+    const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
-    const size_t cur_end    = cur_offset + cur_size;
+    const size_t cur_end  = cur_offs + cur_size;
    size_t size_needed = 0;
@ -1609,25 +1629,52 @@ struct ggml_tensor * ggml_new_tensor_impl(
        }
        // align to GGML_MEM_ALIGN
        size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
    }
    size_needed += sizeof(struct ggml_tensor);
    if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
        GGML_PRINT("%s: not enough space in the context's memory pool\n", __func__);
        assert(false);
        return NULL;
    }
    char * const mem_buffer = ctx->mem_buffer;
    struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
-    *obj_new = (struct ggml_object) {
+    if (ctx->scratch.data == NULL || data != NULL) {
-        .offset = cur_end + GGML_OBJECT_SIZE,
+        size_needed += sizeof(struct ggml_tensor);
-        .size   = size_needed,
+
-        .next   = NULL,
+        if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
-    };
+            GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                    __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
            assert(false);
            return NULL;
        }
        *obj_new = (struct ggml_object) {
            .offs = cur_end + GGML_OBJECT_SIZE,
            .size = size_needed,
            .next = NULL,
        };
    } else {
        if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
            GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
            assert(false);
            return NULL;
        }
        if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
            GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                    __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
            assert(false);
            return NULL;
        }
        data = (char * const) ctx->scratch.data + ctx->scratch.offs;
        *obj_new = (struct ggml_object) {
            .offs = cur_end + GGML_OBJECT_SIZE,
            .size = sizeof(struct ggml_tensor),
            .next = NULL,
        };
        //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
        ctx->scratch.offs += size_needed;
    }
    if (obj_cur != NULL) {
        obj_cur->next = obj_new;
@ -1638,9 +1685,9 @@ struct ggml_tensor * ggml_new_tensor_impl(
    ctx->objects_end = obj_new;
-    //GGML_PRINT_DEBUG("%s: inserted new object at %zu\n", __func__, cur_end);
+    //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
-    struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offset);
+    struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
    ggml_assert_aligned(result);
@ -1683,7 +1730,7 @@ struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    n_dims,
-        const int* ne) {
+        const int * ne) {
    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
 }
@ -1725,16 +1772,26 @@ struct ggml_tensor * ggml_new_tensor_4d(
 }
 // Create a 1-element GGML_TYPE_I32 tensor and store `value` in it.
 // The scratch buffer is disabled for the duration of the allocation: the
 // current scratch state is saved, its data pointer is cleared, and the saved
 // state is restored once the tensor has been created.
 // NOTE(review): presumably this keeps the scalar's data out of the scratch
 // buffer (the allocator checks ctx->scratch.data == NULL) - confirm.
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
    ctx->scratch_save = ctx->scratch;
    ctx->scratch.data = NULL;
    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
    ctx->scratch = ctx->scratch_save;
    ggml_set_i32(result, value);
    return result;
 }
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
    ctx->scratch_save = ctx->scratch;
    ctx->scratch.data = NULL;
    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    ctx->scratch = ctx->scratch_save;
    ggml_set_f32(result, value);
    return result;
@ -2343,7 +2400,7 @@ struct ggml_tensor * ggml_repeat(
    result->op   = GGML_OP_REPEAT;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src0 = a;
-    result->src1 = NULL;
+    result->src1 = b;
    return result;
 }
@ -2959,9 +3016,7 @@ struct ggml_tensor * ggml_diag_mask_inf(
    // TODO: when implement backward, fix this:
    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
+    struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
    ((int32_t *) b->data)[0] = n_past;
    result->op   = GGML_OP_DIAG_MASK_INF;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -3724,8 +3779,6 @@ static void ggml_compute_forward_sum_f32(
    assert(ggml_is_scalar(dst));
    assert(src0->nb[0] == sizeof(float));
    *(float *) (dst->data) = 0.0f;
    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
@ -3811,8 +3864,6 @@ static void ggml_compute_forward_mean_f32(
    for (int i03 = 0; i03 < ne03; i03++) {
        for (int i02 = 0; i02 < ne02; i02++) {
            for (int i01 = 0; i01 < ne01; i01++) {
                *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) = 0.0f;
                ggml_vec_sum_f32(ne00,
                        (float *) ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
@ -4297,7 +4348,9 @@ static bool ggml_compute_forward_mul_mat_use_blas(
    const int ne1 = dst->ne[1];
    // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) {
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && (
             (ne0 >= 32 && ne1  >= 32   && ne10 >= 32)
            )) {
        //printf("BLAS: %d %d %d\n", ne0, ne1, ne10);
        return true;
    }
@ -4377,7 +4430,9 @@ static void ggml_compute_forward_mul_mat_f32(
    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
        GGML_ASSERT(nb10 == sizeof(float));
-        if (params->ith != 0) return;
+        if (params->ith != 0) {
            return;
        }
        if (params->type == GGML_TASK_INIT) {
            return;
@ -4620,7 +4675,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
        GGML_ASSERT(nb10 == sizeof(float));
-        if (params->ith != 0) return;
+        if (params->ith != 0) {
            return;
        }
        if (params->type == GGML_TASK_INIT) {
            return;
@ -4791,7 +4848,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
            }
        }
    } else {
-        // parallelize by src1 columns using ggml_vec_mad_f32
+        // parallelize by src1 columns using ggml_vec_mad_f16
        // each thread has its own work data
        // during FINALIZE we accumulate all work data into dst
@ -6158,40 +6215,37 @@ static void ggml_compute_forward_flash_attn_f16(
            S[i] = -INFINITY;
        }
-        // looks like unrolling here does not help
+        if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
-#if 1
+            for (int ic = 0; ic < nek1; ++ic) {
-        for (int ic = 0; ic < nek1; ++ic) {
+                // k indices
-            // k indices
+                const int ik3 = iq3;
-            const int ik3 = iq3;
+                const int ik2 = iq2;
-            const int ik2 = iq2;
+                const int ik1 = ic;
            const int ik1 = ic;
-            // S indices
+                // S indices
-            const int i1 = ik1;
+                const int i1 = ik1;
-            ggml_vec_dot_f16(neq0,
+                ggml_vec_dot_f16(neq0,
-                    S + i1,
+                        S + i1,
-                    (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
+                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                    (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
            }
        } else {
            for (int ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
                // k indices
                const int ik3 = iq3;
                const int ik2 = iq2;
                const int ik1 = ic;
                // S indices
                const int i1 = ik1;
                ggml_vec_dot_f16_unroll(neq0, nbk1,
                        S + i1,
                        ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
            }
        }
 #else
        GGML_ASSERT(nek1 % GGML_VEC_DOT_UNROLL == 0);
        for (int ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
            // k indices
            const int ik3 = iq3;
            const int ik2 = iq2;
            const int ik1 = ic;
            // S indices
            const int i1 = ik1;
            ggml_vec_dot_f16_unroll(neq0, nbk1,
                    S + i1,
                                    ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
                    (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
        }
 #endif
        // scale
        ggml_vec_scale_f32(nek1, S, scale);
@ -6261,18 +6315,30 @@ static void ggml_compute_forward_flash_attn_f16(
            S16[i] = GGML_FP32_TO_FP16(S[i]);
        }
-        GGML_ASSERT(nev1 % GGML_VEC_DOT_UNROLL == 0);
+        if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
            for (int ic = 0; ic < nev1; ++ic) {
                // dst indices
                const int i1 = iq1;
                const int i2 = iq2;
                const int i3 = iq3;
-        for (int ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
+                ggml_vec_dot_f16(nek1,
-            // dst indices
+                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2  + i3*nb3)),
-            const int i1 = iq1;
+                        (ggml_fp16_t *) ((char *) v->data   + (         ic*nbv1 + i2*nbv2 + i3*nbv3)),
-            const int i2 = iq2;
+                        S16);
-            const int i3 = iq3;
+            }
        } else {
            for (int ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
                // dst indices
                const int i1 = iq1;
                const int i2 = iq2;
                const int i3 = iq3;
-            ggml_vec_dot_f16_unroll(nek1, nbv1,
+                ggml_vec_dot_f16_unroll(nek1, nbv1,
-                    (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2  + i3*nb3)),
+                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2  + i3*nb3)),
-                              ((char *) v->data   + (         ic*nbv1 + i2*nbv2 + i3*nbv3)),
+                        ((char *) v->data   + (         ic*nbv1 + i2*nbv2 + i3*nbv3)),
-                    S16);
+                        S16);
            }
        }
    }
 }
@ -7049,7 +7115,7 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
 #ifdef __APPLE__
 //#include <os/lock.h>
-
+//
 //typedef os_unfair_lock ggml_lock_t;
 //
 //#define ggml_lock_init(x)    UNUSED(x)
@ -7156,6 +7222,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            if (state->params.ith < state->params.nth) {
                ggml_compute_forward(&state->params, state->node);
            }
            state->node = NULL;
        } else {
            break;
@ -7200,6 +7267,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                .node   = NULL,
                .shared = &state_shared,
            };
            int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
            assert(rc == 0);
            UNUSED(rc);
@ -7268,8 +7336,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                                node->src1->type == GGML_TYPE_F32) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                                if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
-                                    node->n_tasks = 1;
+                                    node->n_tasks = 1; // TODO: this actually is doing nothing
                                                       //       the threads are still spinning
                                    cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
                                    //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
                                    //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
                                    //printf("cur = %zu\n", cur);
                                } else {
                                    cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
                                }
--- a/ggml.h
+++ b/ggml.h
@ -301,6 +301,13 @@ struct ggml_cgraph {
    int64_t perf_time_us;
 };
 // scratch buffer descriptor; passed by value to ggml_set_scratch()
 struct ggml_scratch {
    size_t offs; // presumably the current byte offset into data - TODO confirm in ggml.c
    size_t size; // presumably the size of the data region in bytes - TODO confirm in ggml.c
    void * data; // start of the scratch memory region (ownership is not visible from this header)
 };
 struct ggml_init_params {
    // memory pool
    size_t mem_size;   // bytes
@ -327,6 +334,8 @@ void ggml_free(struct ggml_context * ctx);
 size_t ggml_used_mem(const struct ggml_context * ctx);
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
--- a/whisper.cpp
+++ b/whisper.cpp
--- a/whisper.h
+++ b/whisper.h
@ -1,6 +1,7 @@
 #ifndef WHISPER_H
 #define WHISPER_H
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@ -40,7 +41,7 @@ extern "C" {
    //
    //     ...
    //
-    //     struct whisper_context * ctx = whisper_init("/path/to/ggml-base.en.bin");
+    //     struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
    //
    //     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
    //         fprintf(stderr, "failed to process audio\n");
@ -73,6 +74,7 @@ extern "C" {
        whisper_token tid; // forced timestamp token id
        float p;           // probability of the token
        float plog;        // log probability of the token
        float pt;          // probability of the timestamp token
        float ptsum;       // sum of probabilities of all timestamp tokens
@ -84,9 +86,20 @@ extern "C" {
        float vlen;        // voice length of the token
    } whisper_token_data;
-    // Allocates all memory needed for the model and loads the model from the given file.
+    typedef struct whisper_model_loader {
-    // Returns NULL on failure.
+        void * context;
-    WHISPER_API struct whisper_context * whisper_init(const char * path_model);
+
        size_t (*read)(void * ctx, void * output, size_t read_size);
        bool    (*eof)(void * ctx);
        void  (*close)(void * ctx);
    } whisper_model_loader;
    // Various functions for loading a ggml whisper model.
    // Allocate (almost) all memory needed for the model.
    // Return NULL on failure
    WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
    WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
    WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
    // Frees all memory allocated by the model.
    WHISPER_API void whisper_free(struct whisper_context * ctx);
@ -100,6 +113,16 @@ extern "C" {
                               int   n_samples,
                               int   n_threads);
    // Convert RAW PCM audio to a log mel spectrogram, applying a Phase Vocoder to speed up the audio x2.
    // The resulting spectrogram is stored inside the provided whisper context.
    // Returns 0 on success
    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
        struct whisper_context* ctx,
        const float* samples,
        int   n_samples,
        int   n_threads);
    // This can be used to set a custom log mel spectrogram inside the provided whisper context.
    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
    // n_mel must be 80
@ -124,6 +147,7 @@ extern "C" {
    // tokens + n_tokens is the provided context for the decoder.
    // n_past is the number of tokens to use from previous decoder calls.
    // Returns 0 on success
    // TODO: add support for multiple decoders
    WHISPER_API int whisper_decode(
            struct whisper_context * ctx,
               const whisper_token * tokens,
@ -131,14 +155,6 @@ extern "C" {
                               int   n_past,
                               int   n_threads);
    // Token sampling methods.
    // These are provided for convenience and can be used after each call to whisper_decode().
    // You can also implement your own sampling method using the whisper_get_probs() function.
    // whisper_sample_best() returns the token with the highest probability
    // whisper_sample_timestamp() returns the most probable timestamp token
    WHISPER_API whisper_token_data whisper_sample_best(struct whisper_context * ctx);
    WHISPER_API whisper_token_data whisper_sample_timestamp(struct whisper_context * ctx, bool is_initial);
    // Convert the provided text into tokens.
    // The tokens pointer must be large enough to hold the resulting tokens.
    // Returns the number of tokens on success, no more than n_max_tokens
@ -180,8 +196,11 @@ extern "C" {
    WHISPER_API int whisper_n_audio_ctx    (struct whisper_context * ctx);
    WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
-    // The probabilities for the next token
+    // Token logits obtained from the last call to whisper_decode()
-    WHISPER_API float * whisper_get_probs(struct whisper_context * ctx);
+    // The logits for the last token are stored in the last row
    // Rows: n_tokens
    // Cols: n_vocab
    WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
    // Token Id -> String. Uses the vocabulary in the provided context
    WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
@ -210,8 +229,8 @@ extern "C" {
    // Available sampling strategies
    enum whisper_sampling_strategy {
-        WHISPER_SAMPLING_GREEDY,      // Always select the most probable token
+        WHISPER_SAMPLING_GREEDY,      // similar to OpenAI's GreedyDecoder
-        WHISPER_SAMPLING_BEAM_SEARCH, // TODO: not implemented yet!
+        WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
    };
    // Text segment callback
@ -231,30 +250,32 @@ extern "C" {
        enum whisper_sampling_strategy strategy;
        int n_threads;
-        int n_max_text_ctx;
+        int n_max_text_ctx;     // max tokens to use from past text as prompt for the decoder
        int offset_ms;          // start offset in ms
        int duration_ms;        // audio duration to process in ms
        bool translate;
-        bool no_context;
+        bool no_context;        // do not use past transcription (if any) as initial prompt for the decoder
        bool single_segment;    // force single segment output (useful for streaming)
-        bool print_special;
+        bool print_special;     // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
-        bool print_progress;
+        bool print_progress;    // print progress information
-        bool print_realtime;
+        bool print_realtime;    // print results from within whisper.cpp (avoid it, use callback instead)
-        bool print_timestamps;
+        bool print_timestamps;  // print timestamps for each text segment when printing realtime
        // [EXPERIMENTAL] token-level timestamps
        bool  token_timestamps; // enable token-level timestamps
        float thold_pt;         // timestamp token probability threshold (~0.01)
        float thold_ptsum;      // timestamp token sum probability threshold (~0.01)
        int   max_len;          // max segment length in characters
        bool  split_on_word;    // split on word rather than on token (when used with max_len)
        int   max_tokens;       // max tokens per segment (0 = no limit)
        // [EXPERIMENTAL] speed-up techniques
        // note: these can significantly reduce the quality of the output
        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
        int  audio_ctx;         // overwrite the audio context size (0 = use default)
-        // tokens to provide the whisper model as initial prompt
+        // tokens to provide to the whisper decoder as initial prompt
        // these are prepended to any existing text context from a previous call
        const whisper_token * prompt_tokens;
        int prompt_n_tokens;
@ -262,19 +283,36 @@ extern "C" {
        // for auto-detection, set to nullptr, "" or "auto"
        const char * language;
        // common decoding parameters:
        bool suppress_blank;    // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
        bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
        float temperature;      // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
        float max_initial_ts;   // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
        float length_penalty;   // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
        // fallback parameters
        // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
        float temperature_inc;
        float entropy_thold;    // similar to OpenAI's "compression_ratio_threshold"
        float logprob_thold;
        float no_speech_thold;  // TODO: not implemented
        struct {
-            int n_past;
+            int best_of;    // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
        } greedy;
        struct {
-            int n_past;
+            int beam_size;  // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
-            int beam_width;
+
-            int n_best;
+            float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
        } beam_search;
        // called for every newly generated text segment
        whisper_new_segment_callback new_segment_callback;
        void * new_segment_callback_user_data;
        // called each time before the encoder starts
        whisper_encoder_begin_callback encoder_begin_callback;
        void * encoder_begin_callback_user_data;
    };
@ -303,6 +341,9 @@ extern "C" {
    // A segment can be a few words, a sentence, or even a paragraph.
    WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
    // Language id associated with the current context
    WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
    // Get the start and end time of the specified segment.
    WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
    WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
@ -324,6 +365,13 @@ extern "C" {
    // Get the probability of the specified token in the specified segment.
    WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
    ////////////////////////////////////////////////////////////////////////////
    // Temporary helpers needed for exposing ggml interface
    WHISPER_API int whisper_bench_memcpy(int n_threads);
    WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
 #ifdef __cplusplus
 }
 #endif
Author	SHA1	Message	Date
Georgi Gerganov	59c997ca2d	wip ignore	2023-02-15 19:11:12 +02:00
genevera (she/her)	459753342d	yt-wsp.sh : add unique filename generation (#495 ) Co-authored-by: genevera <genevera@noreply.users.github.com>	2023-02-14 20:12:51 +02:00
Georgi Gerganov	9764782bd9	readme : add another .NET repo (#303 )	2023-02-14 20:04:03 +02:00
Georgi Gerganov	3b010f9bed	readme : add .NET repo (#303 )	2023-02-11 17:35:33 +02:00
Avik Sengupta	113fcec513	cmake : install whisper.h header (#485 ) Including the header file in the install bundle helps projects that ship binaries.	2023-02-11 09:13:32 +02:00
shibukazu	cfc06bf8df	whisper : suppress non-speech-related token outputs (#473 ) * add non-speech-token suppression * add suppress non-speech_tokens param	2023-02-08 09:05:34 +02:00
sandrohanea	2bfe0ebc0f	whisper : fixed Beam Search Strategy and exposed whisper_pcm_to_mel_phase_vocoder (#474 ) Co-authored-by: Sandro Hanea <sandrohanea@microsoft.com>	2023-02-08 09:01:47 +02:00
boolemancer	4dd7119deb	whisper : only trim if split_on_word is true (#476 )	2023-02-08 08:43:23 +02:00
Qianhe Chen	ab1916fc59	ci : add node addon test and optimize compilation configuration (#468 ) * addon: implement node addon call whisper through cpp * addon: modify the license to MIT * addon: remove iostream * addon: rename dir * addon: fix typo * addon: configure cmake to build when cmake-js is used * ci: add addon.node test ci * addon: remove build WHISPER_BUILD_TESTS * addon: update build command * addon: add test * addon: add test file * addon: adapt to compile on Windows * addon: fix typo * addon: reuse jfk.wav Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * addon: reuse jfk.wav --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-02-05 15:02:08 +02:00
kamranjon	a1c1583cc7	whisper : add whisper_full_lang_id() for getting the context lang (#461 )	2023-02-05 14:46:26 +02:00
Matija Pevec	d012b5c7e4	whisper : add "split_on_word" flag when using "max_len" option (#455 ) * Update whisper.cpp * fix: trim function * feat: added flag to split on word * fix: arguments for main	2023-02-05 14:44:23 +02:00
Georgi Gerganov	b2083c5d02	release : v1.2.0	2023-02-04 09:49:49 +02:00
Georgi Gerganov	f3ee4a9673	whisper : reduce memory usage during inference (#431 ) * ggml : add "scratch" buffer support * ggml : support for scratch ring-buffer * ggml : bug fix in ggml_repeat() * ggml : error on scratch buffer overflow * whisper : use scratch buffers during inference (base model only) * whisper : update memory usage for all models * whisper : fix encoder memory usage * whisper : use whisper_context functions instead of macros * whisper : fix FF + remove it from README * ggml : reuse ggml_new_i32 * ggml : refactor the scratch buffer storage * whisper : reorder scratch buffers in the decoder * main : add option to disable temp fallback * Update README.md	2023-02-04 09:45:52 +02:00
Qianhe Chen	c306a7fd89	addon.node : using whisper as a Node.js addon (#443 ) * addon: implement node addon call whisper through cpp * addon: modify the license to MIT * addon: remove iostream * addon: rename dir * addon: fix typo * addon: configure cmake to build when cmake-js is used	2023-02-04 09:10:25 +02:00
polarmoon	b2fc4c7010	go : support "auto" as an option when set language (#462 ) Co-authored-by: Ming <ming@localhost>	2023-02-04 09:09:27 +02:00
Georgi Gerganov	291980369c	whisper : suppress task tokens (#442 )	2023-02-04 09:03:14 +02:00
Taisei Mima	86ef64a855	wasm : fix typo in helper.js (#459 )	2023-02-04 08:49:15 +02:00
Alex Bacart	3b1960520a	main : CSV format export trimmed spaces fix (#444 ) * Update main.cpp Removed string trimming * Update main.cpp * Update main.cpp * Revert "Update main.cpp" This reverts commit `d8924fdcfe`. * Revert "Update main.cpp" This reverts commit `252e508d85`.	2023-02-04 08:48:35 +02:00
Lukas Rist	2bee2650c6	go : add wrapper for system info (#456 )	2023-01-28 18:44:56 +02:00
Robin	beb9512be3	go : add WhisperLangAutoDetect method to go binding (#451 )	2023-01-27 01:14:20 +02:00
Eric Tendian	47737b2e82	livestream.sh : run main with model arg instead of default (#453 ) Actually utilizes the $model var when calling ./main.	2023-01-27 01:13:31 +02:00
Georgi Gerganov	b992f3709e	whisper : do not provide past prompt when n_max_text_ctx == 0	2023-01-25 20:01:00 +02:00
Georgi Gerganov	60337f5306	wasm : check if navigator.storage.estimate() is available Safari does not support it	2023-01-25 20:00:59 +02:00
Lukas Rist	02c7516c57	go : added wrappers to reset and print timings (#436 )	2023-01-25 18:57:30 +02:00
Georgi Gerganov	411ea9b833	ci : run workflows on pull requests + bindings depend on .h (#446 )	2023-01-25 18:50:50 +02:00
Ondrej Kokes	11f61cecd6	whisper.wasm : add labels for easier radio selection (#435 )	2023-01-23 20:49:00 +02:00
Georgi Gerganov	b5ddb16ec7	whisper : condition timestamps to be monotonically increasing (#425 )	2023-01-23 20:48:26 +02:00
fitzsim	ae16c21e9c	whisper : PPC64 big-endian support (#398 ) * ggml : set cache line size to 128 on POWER9 * whisper : add PPC64 big endian support	2023-01-23 20:48:10 +02:00
Georgi Gerganov	2c3f50a021	release : v1.1.1	2023-01-23 20:23:44 +02:00
Georgi Gerganov	9a65269a20	.gitignore : add arm_neon.h	2023-01-23 20:19:04 +02:00
Georgi Gerganov	78f166174f	whisper : fix condition for providing past prompt (critical) This bug has been present since v1.1.0. Effectively, the past transcribed text wasn't being used for following transcriptions, which likely significantly reduces the transcription quality. Likely related to #419	2023-01-22 10:47:01 +02:00
Georgi Gerganov	21c569ba4a	whisper : extend information in whisper_print_timings()	2023-01-19 18:50:33 +02:00
Georgi Gerganov	1a91c19af9	whisper : perform entropy check only when we have at least 32 tokens (#412 )	2023-01-18 22:52:18 +02:00
Georgi Gerganov	f583e2d2f5	main : we had accidentally disabled the temperature fallback .. (#291 )	2023-01-18 22:51:41 +02:00
Georgi Gerganov	206fc93396	whisper.wasm : add small and small.en models	2023-01-18 21:58:55 +02:00
Georgi Gerganov	a6cf6f4c4a	bench : minor fixes	2023-01-18 21:40:10 +02:00
Chia-Hsiang Cheng	472a473fd1	main : add an option to accept optional output filenames (#424 ) * Add an option to accept optional output filenames * Format the file Co-authored-by: Chia-Hsiang Cheng <gary.chiahsiang.cheng@gmail.com>	2023-01-18 21:26:31 +02:00
Georgi Gerganov	9ba66c2fad	stream : fix handling of --step == --length (#416 )	2023-01-18 21:22:52 +02:00
Georgi Gerganov	1ccb8a46a5	bench : fix Windows linkage by moving ggml benches in whisper lib ..	2023-01-18 21:19:50 +02:00
Georgi Gerganov	1290fc6457	bench : add memcpy and ggml_mul_mat benchmarks	2023-01-18 20:31:46 +02:00
Digipom	49b529ba74	whisper.android : add support for loading directly from asset in C (#415 )	2023-01-16 21:57:35 +02:00
Georgi Gerganov	8088a977af	whisper : fix possible uninitialized variables (#291 )	2023-01-16 21:44:40 +02:00
Georgi Gerganov	c9aeb33676	stream : fix --keep_context argument to be used correctly (#354 )	2023-01-16 19:37:40 +02:00
Damian Czaja	4a3f0d3fe9	go : remove sample_best and sample_timestamp bindings (#409 )	2023-01-16 19:18:10 +02:00
Georgi Gerganov	874bde887e	Update README.md	2023-01-16 18:47:31 +02:00
Georgi Gerganov	8738427dd6	cmake : bump version to 1.1.0	2023-01-15 14:33:13 +02:00
Georgi Gerganov	c3991bbb24	Update README.md	2023-01-15 14:08:12 +02:00
Georgi Gerganov	00ea21668b	whisper : account speed_up flag for short audio (close #405 )	2023-01-15 12:42:15 +02:00
Georgi Gerganov	0b85e8c401	Update README.md	2023-01-15 11:36:20 +02:00
Georgi Gerganov	fafd78945d	bench.wasm : print system info	2023-01-15 11:34:03 +02:00
Georgi Gerganov	8de452c18b	Improve decoding (#291 ) * whisper : prepare infra for new decoding strategies * whisper : apply logit filters and compute logprobs * whisper : add whisper_get_logits() * whisper : separate self and cross attention memory Initial step needed for supporting parallel decoders * whisper : move probs_id buffer to whisper_context * whisper : refactor kv cache into separate struct * whisper : move self-attention kv cache to whisper_decoder * whisper : wip decoding parameters + strategies * whisper : wip decoding parameters + strategies (part 2) * whisper : wip decoding parameters + strategies (part 3) * whisper : wip decoding parameters + strategies (part 4) * whisper : fix prompt_past update to not include prompt_init * whisper : temperature + best_of support * whisper : support for compression_ration_threshold We actually use entropy, but it is similar * command : fix example to use logits instead of obsolete probs * whisper : handle empty sequence ranking * whisper : add WHISPER_DEBUG + diagnostic prints + new main args * whisper : minor fixes * whisper : add beam-search support * whisper : bug fix when there no previous context * whisper : add comments * stream : disable temperature fallback For real-time processing, we always want a single decoder running at T=0 * whisper.swiftui : update example - fix paths + add empty folders	2023-01-15 11:29:57 +02:00
Georgi Gerganov	a6dbd9188b	stream : fix a bug that inserted a lot of empty audio at the start The quality was terrible due to this	2023-01-14 19:20:47 +02:00
Georgi Gerganov	4ef3398e8f	ggml : remove obsolete zeroing + comment fixes (#390 )	2023-01-08 20:21:03 +02:00
Ian Bicking	5e9f33596f	readme : clarify main and stream usage (#391 ) Give an example of ./main that uses a sample file that's already there, and make the stream example clarify you need `make stream`	2023-01-08 20:18:41 +02:00
Abitofevrything	8d7b29cedd	ggml : correct behaviour of ggml_vec_sum_f32 (#390 )	2023-01-08 20:06:09 +02:00
boolemancer	08dc705a69	whisper : fix sample_to_timestamp calculation with 64 bit precision to avoid overflow (#388 ) * Do calculation with 64 bit precision to avoid overflow * Update whisper.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-01-08 15:08:45 +02:00
Syahmi Azhar	1512545149	whisper : add loader class to allow loading from buffer and others (#353 ) * whisper : add loader to allow loading from other than file * whisper : rename whisper_init to whisper_init_from_file * whisper : add whisper_init_from_buffer * android : Delete local.properties * android : load models directly from assets * whisper : adding <stddef.h> needed for size_t + code style Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-01-08 13:03:33 +02:00
Georgi Gerganov	52a3e0c92a	ggml : improve vec_dot_f16 unrolling in flash_attn_f16	2023-01-08 11:41:18 +02:00
Georgi Gerganov	d1ea1220ff	command : clean-up / refactoring / formatting (#383 )	2023-01-07 21:43:24 +02:00
David	9c4a1522f6	command : always-prompt mode (#383 )	2023-01-07 21:41:11 +02:00
David Thorpe	f078a6f20e	go : adding features to the go-whisper example, go ci, etc (#384 ) * Updated bindings so they can be used in third party packages. * Updated makefiles to set FMA flag on optionally, for xeon E5 on Darwin * Added test script * Changes for examples * Reverted * Made the NewContext method private	2023-01-07 21:21:43 +02:00