Compare commits


94 Commits

Author SHA1 Message Date
e0bd97f41f ggml : use macros to inline FP16 <-> FP32 conversions 2022-12-06 22:05:33 +02:00
f8ec718b76 ggml : add F16C CPU flag check 2022-12-06 21:56:56 +02:00
35b40a93b9 add fp16/fp32 convert intrinsics 2022-12-06 21:44:24 +02:00
9fe7306f4b models : add the new "large" model release by OpenAI
The old "large" model is now renamed "large-v1".
If you have been using it, make sure to rename it and download the new
"large" model for best results.
2022-12-06 18:48:57 +02:00
13e8eb2346 bench : add commit hash to bench-all.sh results 2022-12-06 18:47:48 +02:00
78d13257be Try to improve the token sampling strategy (#193)
* whisper : try to improve the token sampling strategy

- Add the "max_initial_timestamp" token logic from OpenAI
- Disallow sampling timestamps that are in the past

* whisper : fix the max initial timestamp logic + fallback decoding
2022-12-02 21:51:50 +02:00
9b7df68753 tests : adding transcription tests 2022-12-02 21:40:02 +02:00
061fc81bd6 ggml : remove inline specifier from fp16 <-> fp32 converters 2022-12-01 22:15:12 +02:00
57e0e6b700 livestream : handle ffmpeg errors gracefully and stabilize transcript 2022-12-01 20:49:09 +02:00
4f7363077f livestream : minor changes 2022-12-01 19:47:58 +02:00
093c840dee livestream : fix losing words across audio chunk (#195)
* improve livestream script

* Update examples/livestream.sh

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

Co-authored-by: Paul Edwards <paul.edwards@semiformal.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2022-12-01 19:18:22 +02:00
e7f09a0a61 Fix Darwin flags - was incorrectly always using the Linux else clause 2022-12-01 19:17:04 +02:00
4698dcdb52 whisper : add mechanism for aborting the whisper_full() computation 2022-11-27 20:42:45 +02:00
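
A possible way to use this mechanism, assuming the abort hook is the `encoder_begin_callback` / `encoder_begin_callback_user_data` pair in `whisper_full_params` and that returning `false` from the callback cancels the run (these names are an assumption based on this commit, not a verified reference):

```cpp
#include "whisper.h"

#include <atomic>

// Sketch: abort a long-running whisper_full() call from another thread by
// flipping an atomic flag that is checked in the encoder-begin callback.
// The callback field names and the "return false to abort" convention are
// assumptions inferred from this commit.
static std::atomic<bool> g_abort{false};

int transcribe_abortable(struct whisper_context * ctx, const float * pcm, int n_samples) {
    whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);

    params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
        // returning false asks whisper_full() to stop before the next encoder pass
        return !static_cast<std::atomic<bool> *>(user_data)->load();
    };
    params.encoder_begin_callback_user_data = &g_abort;

    return whisper_full(ctx, params, pcm, n_samples);
}
```
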
6fd5358dd0 Update README.md 2022-11-27 11:30:32 +02:00
164df0d447 whisper.objc : fix context + broken readme links 2022-11-27 10:52:27 +02:00
e266cb0723 whisper.objc : add real-time processing (#97)
Similar to the "stream" app
2022-11-26 18:32:46 +02:00
c207eed431 whisper.objc : fix build warnings 2022-11-26 16:27:04 +02:00
67e819baf4 minor : remove "examples/" prefix from the README 2022-11-26 13:07:54 +02:00
a425365b82 yt-wsp.sh : script to easily transcribe VODs
Thanks to @DaniruKun
ref: https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818

Usage:

  cd whisper.cpp
  make

  ./examples/yt-wsp.sh <video-url>
2022-11-26 12:54:42 +02:00
e0e864d9ca Update README.md 2022-11-26 11:56:55 +02:00
68ecadbbc9 command.wasm : add voice assistant example for the Web (#171)
Same as the command-line tool "command", but runs in the browser

Also, added helper script "extra/deploy-wasm.sh" and fixed some timing
constants for the WASM examples.
2022-11-26 11:40:06 +02:00
c536ff4005 minor : add comment for using "generate_karaoke.sh" 2022-11-26 10:22:42 +02:00
cb70b07db5 livestream.sh : simple tool to transcribe audio livestreams (#185) 2022-11-26 10:05:37 +02:00
3c390ffe38 stream.wasm : add web-based real-time transcription (#112) 2022-11-25 23:57:46 +02:00
be16dfa038 whisper.wasm : do not block page while processing (close #86) 2022-11-25 23:07:42 +02:00
0f619b52ce main : add stereo-channel-based diarization (#64)
Not tested - I don't have stereo dialog audio
2022-11-25 22:08:58 +02:00
1246dd023e command : add demonstration video 2022-11-25 20:23:58 +02:00
0be27bbd92 command : fix build + fix README + add bold printing 2022-11-25 19:53:50 +02:00
bc88eb13c6 examples : add "command" tool (#171) 2022-11-25 19:36:57 +02:00
b8ce25dec1 refactoring : more readable code 2022-11-25 19:28:04 +02:00
fd113687aa correct model name display on running samples 2022-11-25 07:17:02 +02:00
e4805d9601 wasm : refactor wasm example + reuse fetch mechanism 2022-11-24 23:13:26 +02:00
ff36415a86 talk.wasm : update video link + some minor fixes 2022-11-24 20:15:24 +02:00
025ff465b6 Update README.md
Use a less cringy video to demo talk.wasm lol
2022-11-24 20:09:45 +02:00
2c0501b38a Update README.md 2022-11-24 20:06:51 +02:00
abce28ea99 talk.wasm : move to https://whisper.ggerganov.com/talk
This way, we can share the same models across different WASM examples
and not have to download them for each page
2022-11-24 18:24:06 +02:00
a2ecd54455 models : add instructions for using HF fine-tuned models 2022-11-24 17:54:41 +02:00
128aaadb93 whisper : improve printfs 2022-11-24 17:54:16 +02:00
454b91de16 main : fix dangling pointer when using stdin for input (#65) 2022-11-24 17:53:51 +02:00
d7024cf9dc main, stream : remove --verbose flag (#178) 2022-11-24 17:52:04 +02:00
37422ed733 talk.wasm : add audio pre-processing + bump memory 2022-11-24 00:34:00 +02:00
be3b720f96 talk.wasm : refactoring + update README.md 2022-11-24 00:08:57 +02:00
00f46dbc1d models : add usage comments to the HF convert script (#157) 2022-11-23 23:22:40 +02:00
5698bddbc9 models : fix HF fine-tuned model conversion script (#157)
It works now
2022-11-23 23:14:11 +02:00
388e9f79ad ggml : fix the fix 2022-11-23 22:40:06 +02:00
35cd29ce1f ggml : fix cross-compile Linux -> Window with mingw (#168) 2022-11-23 22:28:41 +02:00
a156a358ca Revert "update README.md"
This reverts commit 6a84147113.
2022-11-23 22:16:50 +02:00
6a84147113 update README.md 2022-11-23 22:16:33 +02:00
804f36aa2c ggml: change inline ggml_fp16_to_fp32, ggml_fp16_t ggml_fp32_to_fp16 2022-11-23 22:16:33 +02:00
4b2f51b479 add gprof option 2022-11-23 22:16:33 +02:00
800ae5b808 fix AVX,AVX2,FMA,F16C detection on Linux and add flags for OpenBLAS 2022-11-23 22:16:33 +02:00
83456076f0 add AVX support 2022-11-23 22:16:33 +02:00
3df6c14fca Build with OpenBLAS and SDL2 on windows 2022-11-23 22:09:54 +02:00
d64d6ca3fd models : minor changes to the HF convert script (#157) 2022-11-23 22:07:20 +02:00
93482d0373 models : add "convert-h5-to-ggml.py" script (#157)
Converts transformers models to ggml.
Although the conversion is successful, it does not work for some reason.
Not sure why
2022-11-23 17:19:22 +02:00
49706a658a minor : updates few prints + fix buttons in whisper.wasm 2022-11-23 17:19:21 +02:00
363a2dadec Update README.md 2022-11-23 09:53:55 +02:00
623a486056 Update README.md 2022-11-23 09:52:36 +02:00
2f596f5b33 Find libopenblas.dll.a on windows
"lib" is needed for windows.

With this change, you can build whisper.cpp with OpenBLAS's prebuilt DLL.
1. extract a zip from https://github.com/xianyi/OpenBLAS/releases
2. copy the headers in (openblas)/include to the root directory of whisper.cpp
3. invoke cmake with -DCMAKE_LIBRARY_PATH=(openblas)\lib -DWHISPER_SUPPORT_OPENBLAS=ON
4. copy (openblas)/bin/libopenblas.dll to the same directory of whisper.dll after msbuild

https://github.com/ggerganov/whisper.cpp/issues/89#issuecomment-1324391258
2022-11-23 08:26:45 +02:00
e5dcdabbb8 unicode : fix character replacement (thanks to @tamo) 2022-11-23 08:24:29 +02:00
dad109c3f1 close #109 : add fetching of the model over HTTP (whisper.wasm) 2022-11-22 22:48:56 +02:00
326573de9a talk.wasm : final touches 2022-11-22 22:22:17 +02:00
9aea96f774 talk.wasm : polishing + adding many AI personalities 2022-11-22 20:10:20 +02:00
385236d1d3 stream : "-kc" now enables context keeping from previous segment (#90)
By default, the context keeping is disabled
2022-11-22 18:21:15 +02:00
63ae03b8e0 Prompt previous tokens for streaming (#163)
* feat: prompt previous tokens for streaming

I used a vector pointer instead of vector itself because it gave weird errors, and why not

* convert vector to use with C api

* feat: remove old refs, check for prompt size

* feat: use better way of getting the pointer
2022-11-22 18:10:35 +02:00
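
The context keeping described in the two entries above works by feeding the tokens of the previous segment back in as a prompt. A rough sketch, assuming the `whisper_full_params` fields are named `no_context`, `prompt_tokens` and `prompt_n_tokens` as these commits suggest:

```cpp
#include "whisper.h"

#include <vector>

// Sketch: keep context across streaming chunks by passing the previously
// decoded tokens back as a prompt. The field names are assumptions based on
// the commits above, not a verified API reference.
void transcribe_chunk(struct whisper_context * ctx,
                      const std::vector<float> & pcmf32,
                      std::vector<whisper_token> & prev_tokens) {
    whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);

    params.no_context      = prev_tokens.empty(); // "-kc" style context keeping
    params.prompt_tokens   = prev_tokens.data();
    params.prompt_n_tokens = (int) prev_tokens.size();

    if (whisper_full(ctx, params, pcmf32.data(), (int) pcmf32.size()) == 0) {
        // remember this chunk's tokens so they can seed the next one
        prev_tokens.clear();
        for (int i = 0; i < whisper_full_n_segments(ctx); ++i) {
            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
                prev_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
            }
        }
    }
}
```
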
78116f8eda talk.wasm : update README.md 2022-11-21 22:42:29 +02:00
a4dfbeecf9 talk.wasm : GPT-2 meets Whisper in WebAssembly (#155)
* talk : initial real-time transcription in the browser

* talk : polishing the UI

* talk : ready for beta testing

* talk.wasm : rename example
2022-11-21 22:20:42 +02:00
2e311a2917 Update README.md 2022-11-21 18:52:20 +02:00
2065572a11 ggml : fix Windows build 2022-11-20 22:47:03 +02:00
5c2176e314 ci : add Windows build 2022-11-20 22:47:03 +02:00
f2df9bd768 stream : add "max_tokens" cli arg
Controls the max tokens per segment for the stream example
2022-11-20 21:22:41 +02:00
fb8d77f760 stream : add "audio_ctx" parameter
Used to overwrite the audio context size of the Encoder.
For example, setting "audio_ctx = 512" will make it run about 3 times
faster, processing about 10s of audio, instead of 30s.

The transcription quality drops, but this can be used for real-time
streaming purposes where performance is important.
2022-11-20 21:22:41 +02:00
62b5ff875c stream : add "max_tokens" parameter
Used to limit the number of tokens in a segment.
Useful to battle with word repetition when using partial encoder context
2022-11-20 21:22:41 +02:00
d351771a4b stream : add "single_segment" option
Force the entire audio chunk to be transcribed into a single segment
2022-11-20 21:22:41 +02:00
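
The three stream-oriented parameters added above (`single_segment`, `max_tokens`, `audio_ctx`) all appear as `whisper_full_params` fields later in this changeset (e.g. in the command.wasm example). A minimal low-latency configuration sketch, with illustrative values taken from the commit messages:

```cpp
#include "whisper.h"

// Sketch of a low-latency streaming configuration using the parameters added
// in the commits above; the specific values are illustrative, not prescriptive.
whisper_full_params make_stream_params() {
    whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);

    wparams.single_segment = true; // transcribe each audio chunk as one segment
    wparams.max_tokens     = 32;   // cap tokens per segment to fight repetition
    wparams.audio_ctx      = 512;  // partial encoder context: ~3x faster, lower accuracy

    return wparams;
}
```
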
c058aaf22e stream : partial encoder experiments 2022-11-20 21:22:41 +02:00
2ba66360c9 fix: free ggml_context (close #149) (#150)
* fix: free ggml_context

* ggml : free the model's contexts in whisper_free()

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2022-11-17 22:12:51 +02:00
e70e5c8b53 models : simplify the conversion script
"transformers" dependency is not actually needed
2022-11-16 19:22:32 +02:00
55a0e1a64e Update download-ggml-model.sh
follow curl redirect to new hosting site
2022-11-16 18:59:44 +02:00
864a78a8d0 models : change default hosting to Hugging Face
My Linode is running out of monthly bandwidth due to the big interest in
the project
2022-11-15 19:47:06 +02:00
83c742f1a7 whisper : add option to speed up the audio tempo by x2
Using a Phase Vocoder for speeding up the audio tempo by scaling down
the frequencies in the frequency domain.

This reduces the computation in the Encoder by a factor of 2.
The transcription accuracy is degraded, but for slow to normal speech -
it seems to be still very good.

I think this can find application for real-time transcription - i.e. the
"stream" example.
2022-11-13 16:25:43 +02:00
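
The speed-up is exposed through the `-su` / `--speed-up` flag shown later in this changeset; assuming the matching `whisper_full_params` field is named `speed_up`, enabling it looks roughly like this:

```cpp
#include "whisper.h"

// Sketch: enable the experimental x2 tempo speed-up (Phase Vocoder) before
// calling whisper_full(). The field name speed_up is an assumption inferred
// from the -su/--speed-up flag added in this changeset.
whisper_full_params make_fast_params() {
    whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
    params.speed_up = true; // halves the Encoder workload at some cost in accuracy
    return params;
}
```
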
41b48ab7f1 make : add libwhisper.so target (#144) 2022-11-13 09:09:48 +02:00
a728be9cdb Add WHISPER_NO_AVX and WHISPER_NO_AVX2 to CMakeLists (#136)
* Check for AVX and AVX2 on Darwin

* Add AVX options to CMakeLists
2022-11-11 18:10:01 +02:00
46a68fb9b5 minor : remove one more redundant line 2022-11-11 18:02:58 +02:00
ccd56a9c5b minor : fix double float32 conversion in python script 2022-11-11 17:58:51 +02:00
3500ce8727 ref #40 : start working on the documentation 2022-11-09 21:41:40 +02:00
7519eabf65 Adds support for stdin wav input 2022-11-09 20:37:23 +02:00
b21213c23e js : update whisper.js to latest 2022-11-09 19:33:10 +02:00
9e700e1821 Check for AVX and AVX2 on Darwin 2022-11-09 18:49:55 +02:00
0bfe728b84 Fix the Windows pthread_create shim
The current implementation doesn't actually set the out parameter,
and it returns 0 on failure instead of on success.
2022-11-08 15:02:32 +02:00
4e5674a5d5 sync : submodule whisper.spm 2022-11-07 21:48:13 +02:00
4c66b6a828 cmake : add submodule whisper.spm 2022-11-07 20:50:24 +02:00
c30bffc8a5 ref #22 : add "duration" option
Can be used to partially process a recording
2022-11-07 20:14:52 +02:00
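
This maps to the `-d` / `--duration` flag in the README diff below; assuming the corresponding `whisper_full_params` fields are `offset_ms` (already used elsewhere in this changeset) and `duration_ms`, partial processing looks roughly like this:

```cpp
#include "whisper.h"

// Sketch: process only a window of the recording, e.g. 30 s starting at 60 s.
// duration_ms is assumed to be the field added by this commit; offset_ms
// appears elsewhere in this changeset.
whisper_full_params make_window_params() {
    whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
    params.offset_ms   = 60 * 1000; // start 60 s into the recording
    params.duration_ms = 30 * 1000; // process only 30 s of audio
    return params;
}
```
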
8fdfb0ba92 Update README.md 2022-11-06 21:04:21 +02:00
c71363f14c examples : add simple script for generating Karaoke video 2022-11-06 09:22:50 +02:00
63 changed files with 7199 additions and 1017 deletions

.github/workflows/build.yml

@ -113,3 +113,73 @@ jobs:
run: |
make
ctest -L gh --output-on-failure
windows:
runs-on: windows-latest
strategy:
matrix:
build: [RelWithDebInfo]
arch: [Win32, x64]
blas: [ON]
sdl2: [ON]
include:
- arch: Win32
obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
s2arc: x86
- arch: x64
obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
s2arc: x64
- sdl2: ON
s2ver: 2.26.0
steps:
- name: Clone
uses: actions/checkout@v1
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v1
- name: Fetch OpenBLAS
if: matrix.blas == 'ON'
run: |
C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
7z x blas.zip -oblas -y
copy blas/include/cblas.h .
copy blas/include/openblas_config.h .
echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
- name: Fetch SDL2 and set SDL2_DIR
if: matrix.sdl2 == 'ON'
run: |
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
7z x sdl2.zip
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DWHISPER_SUPPORT_OPENBLAS=${{ matrix.blas }}
-DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
-DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
- name: Build
run: |
cd ./build
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
- name: Copy libopenblas.dll
if: matrix.blas == 'ON'
run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
- name: Copy SDL2.dll
if: matrix.sdl2 == 'ON'
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
- name: Upload binaries
if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
uses: actions/upload-artifact@v1
with:
name: whisper-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}

.gitignore

@ -13,10 +13,12 @@ build-sanitize-thread/
main
stream
command
bench
sync.sh
compile_commands.json
examples/arm_neon.h
examples/whisper.objc/whisper.objc.xcodeproj/xcshareddata
examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata

.gitmodules (new file)

@ -0,0 +1,3 @@
[submodule "bindings/ios"]
path = bindings/ios
url = https://github.com/ggerganov/whisper.spm

CMakeLists.txt

@ -9,6 +9,11 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(WHISPER_STANDALONE ON)
include(cmake/GitVars.cmake)
include(cmake/BuildTypes.cmake)
# configure project version
if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")
configure_file(${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl ${CMAKE_SOURCE_DIR}/bindings/ios/Makefile @ONLY)
endif()
else()
set(WHISPER_STANDALONE OFF)
endif()
@ -43,6 +48,8 @@ option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF)
if (APPLE)
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
else()
option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
endif()
@ -92,7 +99,9 @@ if (APPLE AND NOT WHISPER_NO_ACCELERATE)
endif()
if (WHISPER_SUPPORT_OPENBLAS)
find_library(OPENBLAS_LIB openblas)
find_library(OPENBLAS_LIB
NAMES openblas libopenblas
)
if (OPENBLAS_LIB)
message(STATUS "OpenBLAS found")
@ -138,15 +147,21 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
else()
message(STATUS "x86 detected")
if (MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /arch:AVX2")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
else()
if (EMSCRIPTEN)
# we require support for WASM SIMD 128-bit
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -msimd128")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
if(NOT WHISPER_NO_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
endif()
if(NOT WHISPER_NO_AVX2)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c")
endif()
endif()
endif()

Makefile

@ -50,11 +50,36 @@ endif
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),x86_64)
# AVX 512
CFLAGS += -mavx512f -mfma -mf16c
# AVX 256
#CFLAGS += -mavx -mavx2 -mfma -mf16c
ifeq ($(UNAME_S),Darwin)
CFLAGS += -mfma -mf16c
AVX1_M := $(shell sysctl machdep.cpu.features)
ifneq (,$(findstring AVX1.0,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
ifneq (,$(findstring AVX2,$(AVX2_M)))
CFLAGS += -mavx2
endif
else ifeq ($(UNAME_S),Linux)
AVX1_M := $(shell grep "avx " /proc/cpuinfo)
ifneq (,$(findstring avx,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
ifneq (,$(findstring avx2,$(AVX2_M)))
CFLAGS += -mavx2
endif
FMA_M := $(shell grep "fma " /proc/cpuinfo)
ifneq (,$(findstring fma,$(FMA_M)))
CFLAGS += -mfma
endif
F16C_M := $(shell grep "f16c " /proc/cpuinfo)
ifneq (,$(findstring f16c,$(F16C_M)))
CFLAGS += -mf16c
endif
else
CFLAGS += -mfma -mf16c -mavx -mavx2
endif
endif
ifeq ($(UNAME_M),amd64)
CFLAGS += -mavx -mavx2 -mfma -mf16c
@ -66,6 +91,14 @@ ifndef WHISPER_NO_ACCELERATE
LDFLAGS += -framework Accelerate
endif
endif
ifdef WHISPER_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -lopenblas
endif
ifdef WHISPER_GPROF
CFLAGS += -pg
CXXFLAGS += -pg
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
@ -81,13 +114,11 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif
#
# Build library + main
#
default: main
main: examples/main/main.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp whisper.o ggml.o -o main $(LDFLAGS)
./main -h
#
# Build library
#
ggml.o: ggml.c ggml.h
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
@ -98,8 +129,11 @@ whisper.o: whisper.cpp whisper.h
libwhisper.a: ggml.o whisper.o
$(AR) rcs libwhisper.a ggml.o whisper.o
libwhisper.so: ggml.o whisper.o
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
clean:
rm -f *.o main stream bench libwhisper.a
rm -f *.o main stream command bench libwhisper.a libwhisper.so
#
# Examples
@ -107,9 +141,16 @@ clean:
CC_SDL=`sdl2-config --cflags --libs`
main: examples/main/main.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o whisper.o -o main $(LDFLAGS)
./main -h
stream: examples/stream/stream.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
command: examples/command/command.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
bench: examples/bench/bench.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
@ -148,9 +189,10 @@ samples:
.PHONY: small
.PHONY: medium.en
.PHONY: medium
.PHONY: large-v1
.PHONY: large
tiny.en tiny base.en base small.en small medium.en medium large: main
tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
bash ./models/download-ggml-model.sh $@
@echo ""
@echo "==============================================="
@ -159,9 +201,17 @@ tiny.en tiny base.en base small.en small medium.en medium large: main
@echo ""
@for f in samples/*.wav; do \
echo "----------------------------------------------" ; \
echo "[+] Running base.en on $$f ... (run 'ffplay $$f' to listen)" ; \
echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
echo "----------------------------------------------" ; \
echo "" ; \
./main -m models/ggml-$@.bin -f $$f ; \
echo "" ; \
done
#
# Tests
#
.PHONY: tests
tests:
bash ./tests/run-tests.sh

README.md

@ -20,8 +20,8 @@ Supported platforms:
- [x] [iOS](examples/whisper.objc)
- [x] Linux
- [x] [WebAssembly](examples/whisper.wasm)
- [x] [Windows (MSVC and MinGW)](https://github.com/ggerganov/whisper.cpp/issues/5)
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/issues/7)
- [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168))
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
- [x] [Android](https://github.com/ggerganov/whisper.cpp/issues/30)
The entire implementation of the model is contained in 2 source files:
@ -30,10 +30,16 @@ The entire implementation of the model is contained in 2 source files:
- Transformer inference: [whisper.h](whisper.h) / [whisper.cpp](whisper.cpp)
Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.
As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device:
As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device: [whisper.objc](examples/whisper.objc)
https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4
You can also easily make your own offline voice assistant application: [command](examples/command)
https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
## Implementation details
- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
@ -94,27 +100,27 @@ c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o gg
usage: ./main [options] file0.wav file1.wav ...
options:
-h, --help show this help message and exit
-s SEED, --seed SEED RNG seed (default: -1)
-t N, --threads N number of threads to use during computation (default: 4)
-p N, --processors N number of processors to use during computation (default: 1)
-ot N, --offset-t N time offset in milliseconds (default: 0)
-on N, --offset-n N segment index offset (default: 0)
-mc N, --max-context N maximum number of text context tokens to store (default: max)
-ml N, --max-len N maximum segment length in characters (default: 0)
-wt N, --word-thold N word timestamp probability threshold (default: 0.010000)
-v, --verbose verbose output
--translate translate from source language to english
-otxt, --output-txt output result in a text file
-ovtt, --output-vtt output result in a vtt file
-osrt, --output-srt output result in a srt file
-owts, --output-words output script for generating karaoke video
-ps, --print_special print special tokens
-pc, --print_colors print colors
-nt, --no_timestamps do not print timestamps
-l LANG, --language LANG spoken language (default: en)
-m FNAME, --model FNAME model path (default: models/ggml-base.en.bin)
-f FNAME, --file FNAME input WAV file path
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
-tr, --translate [false ] translate from source language to english
-otxt, --output-txt [false ] output result in a text file
-ovtt, --output-vtt [false ] output result in a vtt file
-osrt, --output-srt [false ] output result in a srt file
-owts, --output-words [false ] output script for generating karaoke video
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-nt, --no-timestamps [true ] do not print timestamps
-l LANG, --language LANG [en ] spoken language
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-f FNAME, --file FNAME [ ] input WAV file path
bash ./models/download-ggml-model.sh base.en
Downloading ggml model base.en ...
@ -146,13 +152,13 @@ whisper_model_load: n_text_layer = 6
whisper_model_load: n_mels = 80
whisper_model_load: f16 = 1
whisper_model_load: type = 2
whisper_model_load: mem_required = 670.00 MB
whisper_model_load: adding 1607 extra tokens
whisper_model_load: ggml ctx size = 140.60 MB
whisper_model_load: memory size = 22.83 MB
whisper_model_load: model size = 140.54 MB
whisper_model_load: mem_required = 506.00 MB
whisper_model_load: ggml ctx size = 140.60 MB
whisper_model_load: memory size = 22.83 MB
whisper_model_load: model size = 140.54 MB
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
@ -200,6 +206,7 @@ make small.en
make small
make medium.en
make medium
make large-v1
make large
```
@ -211,7 +218,7 @@ make large
| base | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| small | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
| large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
## Another example
@ -428,18 +435,43 @@ The original models are converted to a custom binary format. This allows to pack
- vocabulary
- weights
You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script or from here:
You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
or manually from here:
https://ggml.ggerganov.com
- https://huggingface.co/datasets/ggerganov/whisper.cpp
- https://ggml.ggerganov.com
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README in [models](models).
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
in [models](models).
## Bindings
- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs)
- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm)
- [ ] Python:
- [ ] Java:
## Examples
There are various examples of using the library for different projects in the [examples](examples) folder. Check them out!
There are various examples of using the library for different projects in the [examples](examples) folder.
Some of the examples are even ported to run in the browser using WebAssembly. Check them out!
| Example | Web | Description |
| --- | --- | --- |
| [main](examples/main) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
| [bench](examples/bench) | | Benchmark the performance of Whisper on your machine |
| [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
| [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
| | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot in your browser |
| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)
If you have any kind of feedback about this project feel free to use the Discussions section and open a new topic.
You can use the [Show and tell](https://github.com/ggerganov/whisper.cpp/discussions/categories/show-and-tell) category
to share your own projects that use `whisper.cpp`. If you have a question, make sure to check the
[Frequently asked questions (#126)](https://github.com/ggerganov/whisper.cpp/discussions/126) discussion.

bindings/ios Submodule

Submodule bindings/ios added at 4bda8e9d80


@ -9,12 +9,13 @@ target_link_libraries(${TARGET} PRIVATE
)
unset(EXTRA_FLAGS)
if (WHISPER_WASM_SINGLE_FILE)
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
message(STATUS "Embedding WASM inside whisper.js")
add_custom_command(
TARGET libwhisper POST_BUILD
TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bin/libwhisper.js
${CMAKE_CURRENT_SOURCE_DIR}/whisper.js


@ -6,10 +6,16 @@
#include <vector>
#include <thread>
std::thread g_worker;
std::vector<struct whisper_context *> g_contexts(4, nullptr);
EMSCRIPTEN_BINDINGS(whisper) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
if (g_worker.joinable()) {
g_worker.join();
}
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init(path_model.c_str());
@ -25,6 +31,10 @@ EMSCRIPTEN_BINDINGS(whisper) {
}));
emscripten::function("free", emscripten::optional_override([](size_t index) {
if (g_worker.joinable()) {
g_worker.join();
}
--index;
if (index < g_contexts.size()) {
@ -34,6 +44,10 @@ EMSCRIPTEN_BINDINGS(whisper) {
}));
emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
if (g_worker.joinable()) {
g_worker.join();
}
--index;
if (index >= g_contexts.size()) {
@ -46,14 +60,14 @@ EMSCRIPTEN_BINDINGS(whisper) {
struct whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
params.print_realtime = true;
params.print_progress = false;
params.print_timestamps = true;
params.print_special_tokens = false;
params.translate = translate;
params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
params.n_threads = std::min(8, (int) std::thread::hardware_concurrency());
params.offset_ms = 0;
params.print_realtime = true;
params.print_progress = false;
params.print_timestamps = true;
params.print_special = false;
params.translate = translate;
params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
params.n_threads = std::min(8, (int) std::thread::hardware_concurrency());
params.offset_ms = 0;
std::vector<float> pcmf32;
const int n = audio["length"].as<int>();
@ -80,10 +94,15 @@ EMSCRIPTEN_BINDINGS(whisper) {
printf("\n");
}
int ret = whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size());
// run the worker
{
g_worker = std::thread([index, params, pcmf32 = std::move(pcmf32)]() {
whisper_reset_timings(g_contexts[index]);
whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size());
whisper_print_timings(g_contexts[index]);
});
}
whisper_print_timings(g_contexts[index]);
return ret;
return 0;
}));
}

File diff suppressed because one or more lines are too long

examples/CMakeLists.txt

@ -20,8 +20,12 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
add_subdirectory(whisper.wasm)
add_subdirectory(stream.wasm)
add_subdirectory(command.wasm)
add_subdirectory(talk.wasm)
else()
add_subdirectory(main)
add_subdirectory(stream)
add_subdirectory(command)
add_subdirectory(bench)
endif()

examples/bench/bench.cpp

@ -6,9 +6,9 @@
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
std::string model = "models/ggml-base.en.bin";
std::string model = "models/ggml-base.en.bin";
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -17,14 +17,13 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(argv[++i]);
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
} else if (arg == "-h" || arg == "--help") {
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
} else {
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
@ -39,9 +38,9 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, "\n");
}

examples/command.wasm/CMakeLists.txt

@ -0,0 +1,47 @@
#
# libcommand
#
set(TARGET libcommand)
add_executable(${TARGET}
emscripten.cpp
)
target_link_libraries(${TARGET} PRIVATE
whisper
)
unset(EXTRA_FLAGS)
if (WHISPER_WASM_SINGLE_FILE)
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
message(STATUS "Embedding WASM inside command.js")
add_custom_command(
TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bin/libcommand.js
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/command.wasm/command.js
)
endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s INITIAL_MEMORY=1024MB \
-s TOTAL_MEMORY=1024MB \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \
")
#
# command.wasm
#
set(TARGET command.wasm)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)

examples/command.wasm/README.md

@ -0,0 +1,23 @@
# command.wasm
This is a basic Voice Assistant example that accepts voice commands from the microphone.
It runs fully in the browser via WebAssembly.
Online demo: https://whisper.ggerganov.com/command/
Terminal version: [examples/command](/examples/command)
## Build instructions
```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j
# copy the produced page to your HTTP path
cp bin/command.wasm/* /path/to/html/
cp bin/libcommand.worker.js /path/to/html/
```

examples/command.wasm/emscripten.cpp

@ -0,0 +1,408 @@
#include "ggml.h"
#include "whisper.h"
#include <emscripten.h>
#include <emscripten/bind.h>
#include <atomic>
#include <cmath>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
#include <regex>
constexpr int N_THREAD = 8;
std::vector<struct whisper_context *> g_contexts(4, nullptr);
std::mutex g_mutex;
std::thread g_worker;
std::atomic<bool> g_running(false);
std::string g_status = "";
std::string g_status_forced = "";
std::string g_transcribed = "";
std::vector<float> g_pcmf32;
static std::string trim(const std::string & s) {
std::regex e("^\\s+|\\s+$");
return std::regex_replace(s, e, "");
}
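// simple first-order (RC) high-pass filter: attenuates DC offset and
// low-frequency noise before the energy-based VAD below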
static void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
const float rc = 1.0f / (2.0f * M_PI * cutoff);
const float dt = 1.0f / sample_rate;
const float alpha = dt / (rc + dt);
float y = data[0];
for (size_t i = 1; i < data.size(); i++) {
y = alpha * (y + data[i] - data[i - 1]);
data[i] = y;
}
}
// compute similarity between two strings using Levenshtein distance
static float similarity(const std::string & s0, const std::string & s1) {
const size_t len0 = s0.size() + 1;
const size_t len1 = s1.size() + 1;
std::vector<int> col(len1, 0);
std::vector<int> prevCol(len1, 0);
for (size_t i = 0; i < len1; i++) {
prevCol[i] = i;
}
for (size_t i = 0; i < len0; i++) {
col[0] = i;
for (size_t j = 1; j < len1; j++) {
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
}
col.swap(prevCol);
}
const float dist = prevCol[len1 - 1];
return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
void command_set_status(const std::string & status) {
std::lock_guard<std::mutex> lock(g_mutex);
g_status = status;
}
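// very basic energy-based voice activity detection: compares the average
// energy of the last last_ms of audio against the average of the whole buffer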
bool command_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
const int n_samples = pcmf32.size();
const int n_samples_last = (sample_rate * last_ms) / 1000;
if (n_samples_last >= n_samples) {
// not enough samples - assume no speech
return false;
}
if (freq_thold > 0.0f) {
high_pass_filter(pcmf32, freq_thold, sample_rate);
}
float energy_all = 0.0f;
float energy_last = 0.0f;
for (size_t i = 0; i < n_samples; i++) {
energy_all += fabsf(pcmf32[i]);
if (i >= n_samples - n_samples_last) {
energy_last += fabsf(pcmf32[i]);
}
}
energy_all /= n_samples;
energy_last /= n_samples_last;
if (verbose) {
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
}
if (energy_last > vad_thold*energy_all) {
return false;
}
return true;
}
std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
prob = 0.0f;
t_ms = 0;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
int prob_n = 0;
std::string result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
result += text;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
const auto token = whisper_full_get_token_data(ctx, i, j);
prob += token.p;
++prob_n;
}
}
if (prob_n > 0) {
prob /= prob_n;
}
const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
return result;
}
void command_get_audio(int ms, int sample_rate, std::vector<float> & audio) {
const int64_t n_samples = (ms * sample_rate) / 1000;
int64_t n_take = 0;
if (g_pcmf32.size() < n_samples) {
n_take = g_pcmf32.size();
} else {
n_take = n_samples;
}
audio.resize(n_take);
std::copy(g_pcmf32.end() - n_take, g_pcmf32.end(), audio.begin());
}
void command_main(size_t index) {
command_set_status("loading data ...");
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
wparams.offset_ms = 0;
wparams.translate = false;
wparams.no_context = true;
wparams.single_segment = true;
wparams.print_realtime = false;
wparams.print_progress = false;
wparams.print_timestamps = true;
wparams.print_special = false;
wparams.max_tokens = 32;
wparams.audio_ctx = 768; // partial encoder context for better performance
wparams.language = "en";
printf("command: using %d threads\n", wparams.n_threads);
bool is_running = true;
bool have_prompt = false;
bool ask_prompt = true;
bool print_energy = false;
float prob0 = 0.0f;
float prob = 0.0f;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
const std::string k_prompt = "Ok Whisper, start listening for commands.";
// whisper context
auto & ctx = g_contexts[index];
const int32_t vad_ms = 2000;
const int32_t prompt_ms = 5000;
const int32_t command_ms = 4000;
const float vad_thold = 0.1f;
const float freq_thold = -1.0f;
while (g_running) {
// delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
if (ask_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
fprintf(stdout, "\n");
{
char txt[1024];
snprintf(txt, sizeof(txt), "Say the following phrase: '%s'", k_prompt.c_str());
command_set_status(txt);
}
ask_prompt = false;
}
int64_t t_ms = 0;
{
command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
command_set_status("Speech detected! Processing ...");
if (!have_prompt) {
command_get_audio(prompt_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob0, t_ms));
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
const float sim = similarity(txt, k_prompt);
if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
ask_prompt = true;
} else {
fprintf(stdout, "\n");
fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
fprintf(stdout, "\n");
{
char txt[1024];
snprintf(txt, sizeof(txt), "Success! Waiting for voice commands ...");
command_set_status(txt);
}
// save the audio for the prompt
pcmf32_prompt = pcmf32_cur;
have_prompt = true;
}
} else {
command_get_audio(command_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
// prepend the prompt audio
pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob, t_ms));
prob = 100.0f*(prob - prob0);
fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
// find the prompt in the text
float best_sim = 0.0f;
size_t best_len = 0;
for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
const auto prompt = txt.substr(0, n);
const float sim = similarity(prompt, k_prompt);
//fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
if (sim > best_sim) {
best_sim = sim;
best_len = n;
}
}
const std::string command = ::trim(txt.substr(best_len));
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
fprintf(stdout, "\n");
{
char txt[1024];
snprintf(txt, sizeof(txt), "Command '%s', (t = %d ms)", command.c_str(), (int) t_ms);
command_set_status(txt);
}
{
std::lock_guard<std::mutex> lock(g_mutex);
g_transcribed = command;
}
}
g_pcmf32.clear();
}
}
}
if (index < g_contexts.size()) {
whisper_free(g_contexts[index]);
g_contexts[index] = nullptr;
}
}
EMSCRIPTEN_BINDINGS(command) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init(path_model.c_str());
if (g_contexts[i] != nullptr) {
g_running = true;
if (g_worker.joinable()) {
g_worker.join();
}
g_worker = std::thread([i]() {
command_main(i);
});
return i + 1;
} else {
return (size_t) 0;
}
}
}
return (size_t) 0;
}));
emscripten::function("free", emscripten::optional_override([](size_t index) {
if (g_running) {
g_running = false;
}
}));
emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
--index;
if (index >= g_contexts.size()) {
return -1;
}
if (g_contexts[index] == nullptr) {
return -2;
}
{
std::lock_guard<std::mutex> lock(g_mutex);
const int n = audio["length"].as<int>();
emscripten::val heap = emscripten::val::module_property("HEAPU8");
emscripten::val memory = heap["buffer"];
g_pcmf32.resize(n);
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
memoryView.call<void>("set", audio);
}
return 0;
}));
emscripten::function("get_transcribed", emscripten::optional_override([]() {
std::string transcribed;
{
std::lock_guard<std::mutex> lock(g_mutex);
transcribed = std::move(g_transcribed);
}
return transcribed;
}));
emscripten::function("get_status", emscripten::optional_override([]() {
std::string status;
{
std::lock_guard<std::mutex> lock(g_mutex);
status = g_status_forced.empty() ? g_status : g_status_forced;
}
return status;
}));
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
{
std::lock_guard<std::mutex> lock(g_mutex);
g_status_forced = status;
}
}));
}

examples/command.wasm/index-tmpl.html

@ -0,0 +1,386 @@
<!doctype html>
<html lang="en-us">
<head>
<title>command : Voice assistant example using Whisper + WebAssembly</title>
<style>
#output {
width: 100%;
height: 100%;
margin: 0 auto;
margin-top: 10px;
border-left: 0px;
border-right: 0px;
padding-left: 0px;
padding-right: 0px;
display: block;
background-color: black;
color: white;
font-size: 10px;
font-family: 'Lucida Console', Monaco, monospace;
outline: none;
white-space: pre;
overflow-wrap: normal;
overflow-x: scroll;
}
</style>
</head>
<body>
<div id="main-container">
<b>command : Voice assistant example using Whisper + WebAssembly</b>
<br><br>
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm">GitHub</a>.
<br><br>
<hr>
Select the model you would like to use, click the "Start" button and follow the instructions.
<br><br>
<div id="model-whisper">
Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<span id="fetch-whisper-progress"></span>
<!--
<input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
-->
</div>
<br>
<div id="input">
<button id="start" onclick="onStart()" disabled>Start</button>
<button id="stop" onclick="onStop()" disabled>Stop</button>
<button id="clear" onclick="clearCache()">Clear Cache</button>
</div>
<br>
<div id="state">
Status: <b><span id="state-status">not started</span></b>
<pre id="state-transcribed">[The recognized voice commands will be displayed here]</pre>
</div>
<hr>
Debug output:
<textarea id="output" rows="20"></textarea>
<br>
<b>Troubleshooting</b>
<br><br>
The page does some heavy computations, so make sure:
<ul>
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
</ul>
<div class="cell-version">
<span>
|
Build time: <span class="nav-link">@GIT_DATE@</span> |
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm">Source Code</a> |
</span>
</div>
</div>
<script type="text/javascript" src="helpers.js"></script>
<script type='text/javascript'>
// web audio context
var context = null;
// audio data
var audio = null;
var audio0 = null;
// the command instance
var instance = null;
// model name
var model_whisper = null;
var Module = {
print: printTextarea,
printErr: printTextarea,
setStatus: function(text) {
printTextarea('js: ' + text);
},
monitorRunDependencies: function(left) {
},
preRun: function() {
printTextarea('js: Preparing ...');
},
postRun: function() {
printTextarea('js: Initialized successfully!');
}
};
//
// fetch models
//
let dbVersion = 1
let dbName = 'whisper.ggerganov.com';
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
function storeFS(fname, buf) {
// write to WASM file using FS_createDataFile
// if the file exists, delete it
try {
Module.FS_unlink(fname);
} catch (e) {
// ignore
}
Module.FS_createDataFile("/", fname, buf, true, true);
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
if (model_whisper != null) {
document.getElementById('start').disabled = false;
document.getElementById('stop' ).disabled = true;
}
}
function loadWhisper(model) {
let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
};
let sizes = {
'tiny.en': 75,
'base.en': 142,
};
let url = urls[model];
let dst = 'whisper.bin';
let size_mb = sizes[model];
model_whisper = model;
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) {
let el = document.getElementById('fetch-whisper-progress');
el.innerHTML = Math.round(100*p) + '%';
};
cbCancel = function() {
var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
};
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
//
// microphone
//
const kSampleRate = 16000;
const kRestartRecording_s = 120;
const kIntervalAudio_ms = 250; // pass the recorded audio to the C++ instance at this rate
var mediaRecorder = null;
var doRecording = false;
var startTime = 0;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
function stopRecording() {
Module.set_status("paused");
doRecording = false;
audio0 = null;
audio = null;
context = null;
}
function startRecording() {
if (!context) {
context = new AudioContext({
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
noiseSuppression: true,
});
}
Module.set_status("");
document.getElementById('start').disabled = true;
document.getElementById('stop').disabled = false;
doRecording = true;
startTime = Date.now();
var chunks = [];
var stream = null;
navigator.mediaDevices.getUserMedia({audio: true, video: false})
.then(function(s) {
stream = s;
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.ondataavailable = function(e) {
chunks.push(e.data);
var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
var reader = new FileReader();
reader.onload = function(event) {
var buf = new Uint8Array(reader.result);
if (!context) {
return;
}
context.decodeAudioData(buf.buffer, function(audioBuffer) {
var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
var source = offlineContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(offlineContext.destination);
source.start(0);
offlineContext.startRendering().then(function(renderedBuffer) {
audio = renderedBuffer.getChannelData(0);
//printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
if (audio0 != null) {
audioAll.set(audio0, 0);
}
audioAll.set(audio, audio0 == null ? 0 : audio0.length);
if (instance) {
Module.set_audio(instance, audioAll);
}
});
}, function(e) {
audio = null;
});
}
reader.readAsArrayBuffer(blob);
};
mediaRecorder.onstop = function(e) {
if (doRecording) {
setTimeout(function() {
startRecording();
});
}
};
mediaRecorder.start(kIntervalAudio_ms);
})
.catch(function(err) {
printTextarea('js: error getting audio stream: ' + err);
});
var interval = setInterval(function() {
if (!doRecording) {
clearInterval(interval);
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
document.getElementById('start').disabled = false;
document.getElementById('stop').disabled = true;
mediaRecorder = null;
}
// if audio length is more than kRestartRecording_s seconds, restart recording
if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
if (doRecording) {
//printTextarea('js: restarting recording');
clearInterval(interval);
audio0 = audio;
audio = null;
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
}
}
}, 100);
}
//
// main
//
var nLines = 0;
var intervalUpdate = null;
var transcribedAll = '';
function onStart() {
if (!instance) {
instance = Module.init('whisper.bin');
if (instance) {
printTextarea("js: whisper initialized, instance: " + instance);
}
}
if (!instance) {
printTextarea("js: failed to initialize whisper");
return;
}
startRecording();
intervalUpdate = setInterval(function() {
var transcribed = Module.get_transcribed();
if (transcribed != null && transcribed.length > 1) {
transcribedAll += transcribed + '<br>';
nLines++;
// if more than 10 lines, remove the first line
if (nLines > 10) {
var i = transcribedAll.indexOf('<br>');
if (i > 0) {
transcribedAll = transcribedAll.substring(i + 4);
nLines--;
}
}
}
document.getElementById('state-status').innerHTML = Module.get_status();
document.getElementById('state-transcribed').innerHTML = transcribedAll;
}, 100);
}
function onStop() {
stopRecording();
}
</script>
<script type="text/javascript" src="command.js"></script>
</body>
</html>

examples/command/CMakeLists.txt

@ -0,0 +1,7 @@
if (WHISPER_SUPPORT_SDL2)
# command
set(TARGET command)
add_executable(${TARGET} command.cpp)
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
endif ()

examples/command/README.md

@ -0,0 +1,30 @@
# command
This is a basic Voice Assistant example that accepts voice commands from the microphone.
More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/issues/171).
```bash
# Run with default arguments and small model
./command -m ./models/ggml-small.en.bin -t 8
# On Raspberry Pi, use tiny or base models + "-ac 768" for better performance
./command -m ./models/ggml-tiny.en.bin -ac 768 -t 4 -c 0
```
https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
Web version: [examples/command.wasm](/examples/command.wasm)
## Building
The `command` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
```bash
# Install SDL2 on Linux
sudo apt-get install libsdl2-dev
# Install SDL2 on Mac OS
brew install sdl2
make command
```

examples/command/command.cpp

@ -0,0 +1,655 @@
// Voice assistant example
//
// Speak short text commands to the microphone.
// This program will detect your voice command and convert them to text.
//
// ref: https://github.com/ggerganov/whisper.cpp/issues/171
//
#include "whisper.h"
#include <SDL.h>
#include <SDL_audio.h>
#include <cassert>
#include <cstdio>
#include <fstream>
#include <mutex>
#include <regex>
#include <string>
#include <thread>
#include <vector>
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t prompt_ms = 5000;
int32_t command_ms = 4000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false;
bool no_context = true;
bool print_special = false;
bool print_energy = false;
bool no_timestamps = true;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
std::string fname_out = "";
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-pms" || arg == "--prompt-ms") { params.prompt_ms = std::stoi(argv[++i]); }
else if (arg == "-cms" || arg == "--command-ms") { params.command_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -pms N, --prompt-ms N [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
fprintf(stderr, " -cms N, --command-ms N [%-7d] command duration in milliseconds\n", params.command_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, "\n");
}
//
// SDL Audio capture
//
class audio_async {
public:
audio_async(int len_ms);
~audio_async();
bool init(int capture_id, int sample_rate);
// start capturing audio via the provided SDL callback
// keep last len_ms seconds of audio in a circular buffer
bool resume();
bool pause();
bool clear();
// callback to be called by SDL
void callback(uint8_t * stream, int len);
// get audio data from the circular buffer
void get(int ms, std::vector<float> & audio);
private:
SDL_AudioDeviceID m_dev_id_in = 0;
int m_len_ms = 0;
int m_sample_rate = 0;
bool m_running = false;
std::mutex m_mutex;
std::vector<float> m_audio;
std::vector<float> m_audio_new;
size_t m_audio_pos = 0;
size_t m_audio_len = 0;
};
audio_async::audio_async(int len_ms) {
m_len_ms = len_ms;
}
audio_async::~audio_async() {
if (m_dev_id_in) {
SDL_CloseAudioDevice(m_dev_id_in);
}
}
bool audio_async::init(int capture_id, int sample_rate) {
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
if (SDL_Init(SDL_INIT_AUDIO) < 0) {
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
return false;
}
SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
{
int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
for (int i = 0; i < nDevices; i++) {
fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
}
}
SDL_AudioSpec capture_spec_requested;
SDL_AudioSpec capture_spec_obtained;
SDL_zero(capture_spec_requested);
SDL_zero(capture_spec_obtained);
capture_spec_requested.freq = sample_rate;
capture_spec_requested.format = AUDIO_F32;
capture_spec_requested.channels = 1;
capture_spec_requested.samples = 1024;
capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
audio_async * audio = (audio_async *) userdata;
audio->callback(stream, len);
};
capture_spec_requested.userdata = this;
if (capture_id >= 0) {
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
} else {
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
}
if (!m_dev_id_in) {
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
m_dev_id_in = 0;
return false;
} else {
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format,
capture_spec_requested.format);
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels,
capture_spec_requested.channels);
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
}
m_sample_rate = capture_spec_obtained.freq;
m_audio.resize((m_sample_rate*m_len_ms)/1000);
return true;
}
bool audio_async::resume() {
if (!m_dev_id_in) {
fprintf(stderr, "%s: no audio device to resume!\n", __func__);
return false;
}
if (m_running) {
fprintf(stderr, "%s: already running!\n", __func__);
return false;
}
SDL_PauseAudioDevice(m_dev_id_in, 0);
m_running = true;
return true;
}
bool audio_async::pause() {
if (!m_dev_id_in) {
fprintf(stderr, "%s: no audio device to pause!\n", __func__);
return false;
}
if (!m_running) {
fprintf(stderr, "%s: already paused!\n", __func__);
return false;
}
SDL_PauseAudioDevice(m_dev_id_in, 1);
m_running = false;
return true;
}
bool audio_async::clear() {
if (!m_dev_id_in) {
fprintf(stderr, "%s: no audio device to clear!\n", __func__);
return false;
}
if (!m_running) {
fprintf(stderr, "%s: not running!\n", __func__);
return false;
}
{
std::lock_guard<std::mutex> lock(m_mutex);
m_audio_pos = 0;
m_audio_len = 0;
}
return true;
}
// callback to be called by SDL
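// The incoming samples are written into m_audio, which is used as a ring buffer:
// when the write position reaches the end of the buffer, the remaining samples wrap
// around to the beginning, and m_audio_len saturates at the buffer capacity so that
// only the most recent m_len_ms milliseconds of audio are retained.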
void audio_async::callback(uint8_t * stream, int len) {
if (!m_running) {
return;
}
const size_t n_samples = len / sizeof(float);
m_audio_new.resize(n_samples);
memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
//fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
{
std::lock_guard<std::mutex> lock(m_mutex);
if (m_audio_pos + n_samples > m_audio.size()) {
const size_t n0 = m_audio.size() - m_audio_pos;
memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
m_audio_len = m_audio.size();
} else {
memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
}
}
}
void audio_async::get(int ms, std::vector<float> & result) {
if (!m_dev_id_in) {
fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
return;
}
if (!m_running) {
fprintf(stderr, "%s: not running!\n", __func__);
return;
}
result.clear();
{
std::lock_guard<std::mutex> lock(m_mutex);
if (ms <= 0) {
ms = m_len_ms;
}
size_t n_samples = (m_sample_rate * ms) / 1000;
if (n_samples > m_audio_len) {
n_samples = m_audio_len;
}
result.resize(n_samples);
int s0 = m_audio_pos - n_samples;
if (s0 < 0) {
s0 += m_audio.size();
}
if (s0 + n_samples > m_audio.size()) {
const size_t n0 = m_audio.size() - s0;
memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
} else {
memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
}
}
}
///////////////////////////
std::string trim(const std::string & s) {
std::regex e("^\\s+|\\s+$");
return std::regex_replace(s, e, "");
}
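// Simple first-order IIR high-pass filter (in-place).
// With alpha = dt / (RC + dt) and RC = 1 / (2*pi*cutoff), each output sample is
// y[i] = alpha * (y[i-1] + x[i] - x[i-1]), which attenuates content below `cutoff` Hz.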
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
const float rc = 1.0f / (2.0f * M_PI * cutoff);
const float dt = 1.0f / sample_rate;
const float alpha = dt / (rc + dt);
float y = data[0];
for (size_t i = 1; i < data.size(); i++) {
y = alpha * (y + data[i] - data[i - 1]);
data[i] = y;
}
}
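// Very basic energy-based voice activity detection:
// the average absolute amplitude of the last `last_ms` milliseconds is compared against
// the average over the whole buffer. If the tail is quieter than vad_thold * the overall
// energy, speech is assumed to have just ended and the function returns true.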
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
const int n_samples = pcmf32.size();
const int n_samples_last = (sample_rate * last_ms) / 1000;
if (n_samples_last >= n_samples) {
// not enough samples - assume no speech
return false;
}
if (freq_thold > 0.0f) {
high_pass_filter(pcmf32, freq_thold, sample_rate);
}
float energy_all = 0.0f;
float energy_last = 0.0f;
for (size_t i = 0; i < n_samples; i++) {
energy_all += fabsf(pcmf32[i]);
if (i >= n_samples - n_samples_last) {
energy_last += fabsf(pcmf32[i]);
}
}
energy_all /= n_samples;
energy_last /= n_samples_last;
if (verbose) {
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
}
if (energy_last > vad_thold*energy_all) {
return false;
}
return true;
}
std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
prob = 0.0f;
t_ms = 0;
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
int prob_n = 0;
std::string result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
result += text;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
const auto token = whisper_full_get_token_data(ctx, i, j);
prob += token.p;
++prob_n;
}
}
if (prob_n > 0) {
prob /= prob_n;
}
const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
return result;
}
// compute similarity between two strings using Levenshtein distance
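// Uses the classic two-row dynamic programming formulation and returns a score in [0, 1],
// where 1.0 means the strings are identical and lower values mean more edits are needed
// relative to the length of the longer string.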
float similarity(const std::string & s0, const std::string & s1) {
const size_t len0 = s0.size() + 1;
const size_t len1 = s1.size() + 1;
std::vector<int> col(len1, 0);
std::vector<int> prevCol(len1, 0);
for (size_t i = 0; i < len1; i++) {
prevCol[i] = i;
}
for (size_t i = 1; i < len0; i++) { // row 0 is the prevCol initialization above; starting at 1 avoids reading s0[-1]
col[0] = i;
for (size_t j = 1; j < len1; j++) {
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
}
col.swap(prevCol);
}
const float dist = prevCol[len1 - 1];
return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
// whisper init
struct whisper_context * ctx = whisper_init(params.model.c_str());
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__,
params.n_threads,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
fprintf(stderr, "\n");
}
// init audio
audio_async audio(30*1000);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
bool is_running = true;
bool have_prompt = false;
bool ask_prompt = true;
float prob0 = 0.0f;
float prob = 0.0f;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
const std::string k_prompt = "Ok Whisper, start listening for commands.";
// main loop
while (is_running) {
// handle Ctrl + C
{
SDL_Event event;
while (SDL_PollEvent(&event)) {
switch (event.type) {
case SDL_QUIT:
{
is_running = false;
} break;
default:
break;
}
}
if (!is_running) {
break;
}
}
// delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
if (ask_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
fprintf(stdout, "\n");
ask_prompt = false;
}
int64_t t_ms = 0;
{
audio.get(2000, pcmf32_cur);
if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
if (!have_prompt) {
audio.get(params.prompt_ms, pcmf32_cur);
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob0, t_ms));
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
const float sim = similarity(txt, k_prompt);
if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
ask_prompt = true;
} else {
fprintf(stdout, "\n");
fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
fprintf(stdout, "\n");
// save the audio for the prompt
pcmf32_prompt = pcmf32_cur;
have_prompt = true;
}
} else {
audio.get(params.command_ms, pcmf32_cur);
// prepend the prompt audio
pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
prob = 100.0f*(prob - prob0);
//fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
// find the prompt in the text
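// The transcription of the command still starts with the (prepended) activation phrase.
// Try prefix lengths between 0.8x and 1.2x the length of k_prompt, keep the split with the
// highest similarity to the prompt, and treat everything after it as the actual command.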
float best_sim = 0.0f;
size_t best_len = 0;
for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
const auto prompt = txt.substr(0, n);
const float sim = similarity(prompt, k_prompt);
//fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
if (sim > best_sim) {
best_sim = sim;
best_len = n;
}
}
const std::string command = ::trim(txt.substr(best_len));
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
fprintf(stdout, "\n");
}
audio.clear();
}
}
}
audio.pause();
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}

60
examples/generate-karaoke.sh Executable file
View File

@ -0,0 +1,60 @@
#!/bin/bash
# Simple tool to record audio from the microphone and generate a karaoke video
# Usage:
#
# cd whisper.cpp
# make
#
# ./examples/generate-karaoke.sh [model] [step_ms]
#
# Press Ctrl+C to stop recording
#
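# Pipeline: sox records raw audio from the default microphone, ffmpeg resamples it to
# 16 kHz mono PCM, ./main transcribes it and (-owts) writes a rec16.wav.wts helper script,
# and sourcing that script renders the karaoke video with ffmpeg.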
executable="./main"
model="base.en"
model_path="models/ggml-$model.bin"
# require sox and ffmpeg to be installed
if ! command -v sox &> /dev/null
then
echo "sox could not be found"
exit 1
fi
if ! command -v ffmpeg &> /dev/null
then
echo "ffmpeg could not be found"
exit 2
fi
if [ ! -f "$executable" ]; then
echo "'$executable' does not exist. Please build it first."
exit 3
fi
if [ ! -f "$model_path" ]; then
echo "'$model_path' does not exist. Please download it first."
exit 4
fi
# record some raw audio
sox -d rec.wav
# resample to 16kHz
ffmpeg -y -i ./rec.wav -ar 16000 -ac 1 -c:a pcm_s16le ./rec16.wav > /dev/null 2>&1
# run Whisper
echo "Processing ..."
"$executable" -m "$model_path" rec16.wav -owts > /dev/null 2>&1
# generate Karaoke video
echo "Generating video ..."
source rec16.wav.wts > /dev/null 2>&1
# play the video
echo "Playing ./rec16.wav.mp4 ..."
ffplay -loglevel 0 -autoexit ./rec16.wav.mp4
echo "Done"
exit 0

182
examples/helpers.js Normal file
View File

@ -0,0 +1,182 @@
// Common Javascript functions used by the examples
function convertTypedArray(src, type) {
var buffer = new ArrayBuffer(src.byteLength);
new src.constructor(buffer).set(src); // copy the source bytes into the new buffer
return new type(buffer);
}
var printTextarea = (function() {
var element = document.getElementById('output');
if (element) element.value = ''; // clear any previous output
return function(text) {
if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
console.log(text);
if (element) {
element.value += text + "\n";
element.scrollTop = element.scrollHeight; // focus on bottom
}
};
})();
async function clearCache() {
if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) {
indexedDB.deleteDatabase(dbName);
}
}
// fetch a remote file from remote URL using the Fetch API
async function fetchRemote(url, cbProgress, cbPrint) {
cbPrint('fetchRemote: downloading with fetch()...');
const response = await fetch(
url,
{
method: 'GET',
headers: {
'Content-Type': 'application/octet-stream',
},
}
);
if (!response.ok) {
cbPrint('fetchRemote: failed to fetch ' + url);
return;
}
const contentLength = response.headers.get('content-length');
const total = parseInt(contentLength, 10);
const reader = response.body.getReader();
var chunks = [];
var receivedLength = 0;
var progressLast = -1;
while (true) {
const { done, value } = await reader.read();
if (done) {
break;
}
chunks.push(value);
receivedLength += value.length;
if (contentLength) {
cbProgress(receivedLength/total);
var progressCur = Math.round((receivedLength / total) * 10);
if (progressCur != progressLast) {
cbPrint('fetchRemote: fetching ' + 10*progressCur + '% ...');
progressLast = progressCur;
}
}
}
var position = 0;
var chunksAll = new Uint8Array(receivedLength);
for (var chunk of chunks) {
chunksAll.set(chunk, position);
position += chunk.length;
}
return chunksAll;
}
// load remote data
// - check if the data is already in the IndexedDB
// - if not, fetch it from the remote URL and store it in the IndexedDB
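// cbProgress receives a fraction in [0, 1] while downloading, cbReady(dst, data) is called
// with the raw bytes once they are available (from cache or network), cbCancel is called if
// the user declines the download or an error occurs, and cbPrint is used for log messages.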
function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
// query the storage quota and print it
navigator.storage.estimate().then(function (estimate) {
cbPrint('loadRemote: storage quota: ' + estimate.quota + ' bytes');
cbPrint('loadRemote: storage usage: ' + estimate.usage + ' bytes');
});
// check if the data is already in the IndexedDB
var rq = indexedDB.open(dbName, dbVersion);
rq.onupgradeneeded = function (event) {
var db = event.target.result;
if (db.version == 1) {
var os = db.createObjectStore('models', { autoIncrement: false });
cbPrint('loadRemote: created IndexedDB ' + db.name + ' version ' + db.version);
} else {
// clear the database
var os = event.currentTarget.transaction.objectStore('models');
os.clear();
cbPrint('loadRemote: cleared IndexedDB ' + db.name + ' version ' + db.version);
}
};
rq.onsuccess = function (event) {
var db = event.target.result;
var tx = db.transaction(['models'], 'readonly');
var os = tx.objectStore('models');
var rq = os.get(url);
rq.onsuccess = function (event) {
if (rq.result) {
cbPrint('loadRemote: "' + url + '" is already in the IndexedDB');
cbReady(dst, rq.result);
} else {
// data is not in the IndexedDB
cbPrint('loadRemote: "' + url + '" is not in the IndexedDB');
// alert and ask the user to confirm
if (!confirm(
'You are about to download ' + size_mb + ' MB of data.\n' +
'The model data will be cached in the browser for future use.\n\n' +
'Press OK to continue.')) {
cbCancel();
return;
}
fetchRemote(url, cbProgress, cbPrint).then(function (data) {
if (data) {
// store the data in the IndexedDB
var rq = indexedDB.open(dbName, dbVersion);
rq.onsuccess = function (event) {
var db = event.target.result;
var tx = db.transaction(['models'], 'readwrite');
var os = tx.objectStore('models');
var rq = os.put(data, url);
rq.onsuccess = function (event) {
cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
cbReady(dst, data);
};
rq.onerror = function (event) {
cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB');
cbCancel();
};
};
}
});
}
};
rq.onerror = function (event) {
cbPrint('loadRemote: failed to get data from the IndexedDB');
cbCancel();
};
};
rq.onerror = function (event) {
cbPrint('loadRemote: failed to open IndexedDB');
cbCancel();
};
rq.onblocked = function (event) {
cbPrint('loadRemote: failed to open IndexedDB: blocked');
cbCancel();
};
rq.onabort = function (event) {
cbPrint('loadRemote: failed to open IndexedDB: abort');
};
}

98
examples/livestream.sh Executable file
View File

@ -0,0 +1,98 @@
#!/bin/bash
set -eo pipefail
# Transcribe audio livestream by feeding ffmpeg output to whisper.cpp at regular intervals
# Idea by @semiformal-net
# ref: https://github.com/ggerganov/whisper.cpp/issues/185
#
# TODO:
# - Currently, there is a gap between sequential chunks, so some of the words are dropped. Need to figure out a
# way to produce a continuous stream of audio chunks.
#
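# How it works: ffmpeg keeps appending the live stream to /tmp/whisper-live0.<fmt> in the
# background; every step_s seconds the script cuts the next chunk out of that file with
# -ss/-t, transcodes it to 16 kHz mono WAV, and feeds it to ./main for transcription.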
url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8"
fmt=aac # the audio format extension of the stream (TODO: auto detect)
step_s=30
model="base.en"
if [ -z "$1" ]; then
echo "Usage: $0 stream_url [step_s] [model]"
echo ""
echo " Example:"
echo " $0 $url $step_s $model"
echo ""
echo "No url specified, using default: $url"
else
url="$1"
fi
if [ -n "$2" ]; then
step_s="$2"
fi
if [ -n "$3" ]; then
model="$3"
fi
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
# list available models
function list_models {
printf "\n"
printf " Available models:"
for model in "${models[@]}"; do
printf " $model"
done
printf "\n\n"
}
if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
printf "Invalid model: $model\n"
list_models
exit 1
fi
running=1
trap "running=0" SIGINT SIGTERM
printf "[+] Transcribing stream with model '$model', step_s $step_s (press Ctrl+C to stop):\n\n"
# continuous stream in native fmt (this file will grow forever!)
ffmpeg -loglevel quiet -y -re -probesize 32 -i $url -c copy /tmp/whisper-live0.${fmt} &
if [ $? -ne 0 ]; then
printf "Error: ffmpeg failed to capture audio stream\n"
exit 1
fi
printf "Buffering audio. Please wait...\n\n"
sleep $(($step_s))
# do not stop script on error
set +e
i=0
SECONDS=0
while [ $running -eq 1 ]; do
# extract the next chunk from the growing file above and transcode it to wav. -ss sets the start time, nudged back by 0.5s as a heuristic to catch words split across chunk boundaries
err=1
while [ $err -ne 0 ]; do
if [ $i -gt 0 ]; then
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.${fmt} -y -ar 16000 -ac 1 -c:a pcm_s16le -ss $(($i*$step_s-1)).5 -t $step_s /tmp/whisper-live.wav 2> /tmp/whisper-live.err
else
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.${fmt} -y -ar 16000 -ac 1 -c:a pcm_s16le -ss $(($i*$step_s)) -t $step_s /tmp/whisper-live.wav 2> /tmp/whisper-live.err
fi
err=$(cat /tmp/whisper-live.err | wc -l)
done
./main -t 8 -m ./models/ggml-${model}.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
while [ $SECONDS -lt $((($i+1)*$step_s)) ]; do
sleep 1
done
((i=i+1))
done
killall -v ffmpeg
killall -v main

View File

@ -6,29 +6,28 @@ It can be used as a reference for using the `whisper.cpp` library in other proje
```
./main -h
usage: ./bin/main [options] file0.wav file1.wav ...
-h, --help show this help message and exit
-s SEED, --seed SEED RNG seed (default: -1)
-t N, --threads N number of threads to use during computation (default: 4)
-p N, --processors N number of processors to use during computation (default: 1)
-ot N, --offset-t N time offset in milliseconds (default: 0)
-on N, --offset-n N segment index offset (default: 0)
-mc N, --max-context N maximum number of text context tokens to store (default: max)
-ml N, --max-len N maximum segment length in characters (default: 0)
-wt N, --word-thold N word timestamp probability threshold (default: 0.010000)
-v, --verbose verbose output
--translate translate from source language to english
-otxt, --output-txt output result in a text file
-ovtt, --output-vtt output result in a vtt file
-osrt, --output-srt output result in a srt file
-owts, --output-words output script for generating karaoke video
-ps, --print_special print special tokens
-pc, --print_colors print colors
-nt, --no_timestamps do not print timestamps
-l LANG, --language LANG spoken language (default: en)
-m FNAME, --model FNAME model path (default: models/ggml-base.en.bin)
-f FNAME, --file FNAME input WAV file path
-h, --help show this help message and exit
usage: ./main [options] file0.wav file1.wav ...
options:
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
-tr, --translate [false ] translate from source language to english
-otxt, --output-txt [false ] output result in a text file
-ovtt, --output-vtt [false ] output result in a vtt file
-osrt, --output-srt [false ] output result in a srt file
-owts, --output-words [false ] output script for generating karaoke video
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-nt, --no-timestamps [true ] do not print timestamps
-l LANG, --language LANG [en ] spoken language
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-f FNAME, --file FNAME [ ] input WAV file path
```

View File

@ -36,6 +36,10 @@ std::string to_timestamp(int64_t t, bool comma = false) {
return std::string(buf);
}
int timestamp_to_sample(int64_t t, int n_samples) {
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
}
// helper function to replace substrings
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
for (size_t pos = 0; ; pos += replace.length()) {
@ -48,25 +52,26 @@ void replace_all(std::string & s, const std::string & search, const std::string
// command-line parameters
struct whisper_params {
int32_t seed = -1; // RNG seed, not used currently
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_processors = 1;
int32_t offset_t_ms = 0;
int32_t offset_n = 0;
int32_t duration_ms = 0;
int32_t max_context = -1;
int32_t max_len = 0;
float word_thold = 0.01f;
bool verbose = false;
bool translate = false;
bool output_txt = false;
bool output_vtt = false;
bool output_srt = false;
bool output_wts = false;
bool print_special_tokens = false;
bool print_colors = false;
bool no_timestamps = false;
bool speed_up = false;
bool translate = false;
bool diarize = false;
bool output_txt = false;
bool output_vtt = false;
bool output_srt = false;
bool output_wts = false;
bool print_special = false;
bool print_colors = false;
bool no_timestamps = false;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
@ -85,55 +90,32 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
continue;
}
if (arg == "-s" || arg == "--seed") {
params.seed = std::stoi(argv[++i]);
} else if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(argv[++i]);
} else if (arg == "-p" || arg == "--processors") {
params.n_processors = std::stoi(argv[++i]);
} else if (arg == "-ot" || arg == "--offset-t") {
params.offset_t_ms = std::stoi(argv[++i]);
} else if (arg == "-on" || arg == "--offset-n") {
params.offset_n = std::stoi(argv[++i]);
} else if (arg == "-mc" || arg == "--max-context") {
params.max_context = std::stoi(argv[++i]);
} else if (arg == "-ml" || arg == "--max-len") {
params.max_len = std::stoi(argv[++i]);
} else if (arg == "-wt" || arg == "--word-thold") {
params.word_thold = std::stof(argv[++i]);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else if (arg == "--translate") {
params.translate = true;
} else if (arg == "-l" || arg == "--language") {
params.language = argv[++i];
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
} else if (arg == "-otxt" || arg == "--output-txt") {
params.output_txt = true;
} else if (arg == "-ovtt" || arg == "--output-vtt") {
params.output_vtt = true;
} else if (arg == "-osrt" || arg == "--output-srt") {
params.output_srt = true;
} else if (arg == "-owts" || arg == "--output-words") {
params.output_wts = true;
} else if (arg == "-ps" || arg == "--print_special") {
params.print_special_tokens = true;
} else if (arg == "-pc" || arg == "--print_colors") {
params.print_colors = true;
} else if (arg == "-nt" || arg == "--no_timestamps") {
params.no_timestamps = true;
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
} else if (arg == "-f" || arg == "--file") {
params.fname_inp.push_back(argv[++i]);
} else if (arg == "-h" || arg == "--help") {
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
} else {
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(argv[++i]); }
else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(argv[++i]); }
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_inp.push_back(argv[++i]); }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
@ -148,32 +130,40 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p N, --processors N number of processors to use during computation (default: %d)\n", params.n_processors);
fprintf(stderr, " -ot N, --offset-t N time offset in milliseconds (default: %d)\n", params.offset_t_ms);
fprintf(stderr, " -on N, --offset-n N segment index offset (default: %d)\n", params.offset_n);
fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
fprintf(stderr, " -v, --verbose verbose output\n");
fprintf(stderr, " --translate translate from source language to english\n");
fprintf(stderr, " -otxt, --output-txt output result in a text file\n");
fprintf(stderr, " -ovtt, --output-vtt output result in a vtt file\n");
fprintf(stderr, " -osrt, --output-srt output result in a srt file\n");
fprintf(stderr, " -owts, --output-words output script for generating karaoke video\n");
fprintf(stderr, " -ps, --print_special print special tokens\n");
fprintf(stderr, " -pc, --print_colors print colors\n");
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME input WAV file path\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, "\n");
}
struct whisper_print_user_data {
const whisper_params * params;
const std::vector<std::vector<float>> * pcmf32s;
};
void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
const whisper_params & params = *(whisper_params *) user_data;
const auto & params = *((whisper_print_user_data *) user_data)->params;
const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
const int n_segments = whisper_full_n_segments(ctx);
@ -187,7 +177,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
if (params.no_timestamps) {
if (params.print_colors) {
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special_tokens == false) {
if (params.print_special == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
@ -210,10 +200,37 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
std::string speaker = "";
if (params.diarize && pcmf32s.size() == 2) {
const int64_t n_samples = pcmf32s[0].size();
const int64_t is0 = timestamp_to_sample(t0, n_samples);
const int64_t is1 = timestamp_to_sample(t1, n_samples);
double energy0 = 0.0f;
double energy1 = 0.0f;
for (int64_t j = is0; j < is1; j++) {
energy0 += fabs(pcmf32s[0][j]);
energy1 += fabs(pcmf32s[1][j]);
}
if (energy0 > 1.1*energy1) {
speaker = "(speaker 0)";
} else if (energy1 > 1.1*energy0) {
speaker = "(speaker 1)";
} else {
speaker = "(speaker ?)";
}
//printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
}
if (params.print_colors) {
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special_tokens == false) {
if (params.print_special == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
@ -225,13 +242,13 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
}
printf("\n");
} else {
const char * text = whisper_full_get_segment_text(ctx, i);
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
printf("[%s --> %s] %s%s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), speaker.c_str(), text);
}
}
}
@ -259,7 +276,7 @@ bool output_vtt(struct whisper_context * ctx, const char * fname) {
std::ofstream fout(fname);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
return 9;
return false;
}
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
@ -382,9 +399,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
ncnt += txt.size();
}
::replace_all(txt_bg, "'", "");
::replace_all(txt_bg, "'", "\u2019");
::replace_all(txt_bg, "\"", "\\\"");
::replace_all(txt_fg, "'", "");
::replace_all(txt_fg, "'", "\u2019");
::replace_all(txt_fg, "\"", "\\\"");
}
@ -424,16 +441,18 @@ int main(int argc, char ** argv) {
return 1;
}
if (params.seed < 0) {
params.seed = time(NULL);
}
if (params.fname_inp.empty()) {
fprintf(stderr, "error: no input files specified\n");
whisper_print_usage(argc, argv, params);
return 2;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
// whisper init
struct whisper_context * ctx = whisper_init(params.model.c_str());
@ -446,32 +465,60 @@ int main(int argc, char ** argv) {
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
// WAV input
std::vector<float> pcmf32;
{
drwav wav;
if (!drwav_init_file(&wav, fname_inp.c_str(), NULL)) {
fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], fname_inp.c_str());
whisper_print_usage(argc, argv, {});
return 4;
std::vector<uint8_t> wav_data; // used for pipe input from stdin
if (fname_inp == "-") {
{
uint8_t buf[1024];
while (true)
{
const size_t n = fread(buf, 1, sizeof(buf), stdin);
if (n == 0) {
break;
}
wav_data.insert(wav_data.end(), buf, buf + n);
}
}
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), NULL) == false) {
fprintf(stderr, "error: failed to open WAV file from stdin\n");
return 4;
}
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
}
else if (drwav_init_file(&wav, fname_inp.c_str(), NULL) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
return 5;
}
if (wav.channels != 1 && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
return 5;
return 6;
}
if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", argv[0], fname_inp.c_str());
return 6;
}
if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
return 6;
return 8;
}
if (wav.bitsPerSample != 16) {
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
return 7;
return 9;
}
int n = wav.totalPCMFrameCount;
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
std::vector<int16_t> pcm16;
pcm16.resize(n*wav.channels);
@ -489,6 +536,18 @@ int main(int argc, char ** argv) {
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
}
}
if (params.diarize) {
// convert to stereo, float
pcmf32s.resize(2);
pcmf32s[0].resize(n);
pcmf32s[1].resize(n);
for (int i = 0; i < n; i++) {
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
}
}
}
// print system information
@ -523,29 +582,47 @@ int main(int argc, char ** argv) {
{
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_realtime = false;
wparams.print_progress = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.print_special_tokens = params.print_special_tokens;
wparams.translate = params.translate;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
wparams.offset_ms = params.offset_t_ms;
wparams.print_realtime = false;
wparams.print_progress = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.print_special = params.print_special;
wparams.translate = params.translate;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
wparams.offset_ms = params.offset_t_ms;
wparams.duration_ms = params.duration_ms;
wparams.token_timestamps = params.output_wts || params.max_len > 0;
wparams.thold_pt = params.word_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.token_timestamps = params.output_wts || params.max_len > 0;
wparams.thold_pt = params.word_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.speed_up = params.speed_up;
whisper_print_user_data user_data = { &params, &pcmf32s };
// this callback is called on each new segment
if (!wparams.print_realtime) {
wparams.new_segment_callback = whisper_print_segment_callback;
wparams.new_segment_callback_user_data = &params;
wparams.new_segment_callback_user_data = &user_data;
}
// example for abort mechanism
// in this example, we do not abort the processing, but we could if the flag is set to true
// the callback is called before every encoder run - if it returns false, the processing is aborted
{
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
wparams.encoder_begin_callback = [](struct whisper_context * ctx, void * user_data) {
bool is_aborted = *(bool*)user_data;
return !is_aborted;
};
wparams.encoder_begin_callback_user_data = &is_aborted;
}
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
return 8;
return 10;
}
}

View File

@ -0,0 +1,47 @@
#
# libstream
#
set(TARGET libstream)
add_executable(${TARGET}
emscripten.cpp
)
target_link_libraries(${TARGET} PRIVATE
whisper
)
unset(EXTRA_FLAGS)
if (WHISPER_WASM_SINGLE_FILE)
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
message(STATUS "Embedding WASM inside stream.js")
add_custom_command(
TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bin/libstream.js
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/stream.wasm/stream.js
)
endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s INITIAL_MEMORY=1024MB \
-s TOTAL_MEMORY=1024MB \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \
")
#
# stream.wasm
#
set(TARGET stream.wasm)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)

View File

@ -0,0 +1,20 @@
# stream.wasm
Real-time transcription in the browser using WebAssembly
Online demo: https://whisper.ggerganov.com/stream/
## Build instructions
```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j
# copy the produced page to your HTTP path
cp bin/stream.wasm/* /path/to/html/
cp bin/libstream.worker.js /path/to/html/
```

View File

@ -0,0 +1,213 @@
#include "ggml.h"
#include "whisper.h"
#include <emscripten.h>
#include <emscripten/bind.h>
#include <atomic>
#include <cmath>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
constexpr int N_THREAD = 8;
std::vector<struct whisper_context *> g_contexts(4, nullptr);
std::mutex g_mutex;
std::thread g_worker;
std::atomic<bool> g_running(false);
std::string g_status = "";
std::string g_status_forced = "";
std::string g_transcribed = "";
std::vector<float> g_pcmf32;
void stream_set_status(const std::string & status) {
std::lock_guard<std::mutex> lock(g_mutex);
g_status = status;
}
void stream_main(size_t index) {
stream_set_status("loading data ...");
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
wparams.offset_ms = 0;
wparams.translate = false;
wparams.no_context = true;
wparams.single_segment = true;
wparams.print_realtime = false;
wparams.print_progress = false;
wparams.print_timestamps = true;
wparams.print_special = false;
wparams.max_tokens = 32;
wparams.audio_ctx = 768; // partial encoder context for better performance
wparams.language = "en";
printf("stream: using %d threads\n", wparams.n_threads);
std::vector<float> pcmf32;
// whisper context
auto & ctx = g_contexts[index];
// 5 seconds interval
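// (the worker consumes at most the last window_samples samples per iteration; anything older
// is discarded when the shared g_pcmf32 buffer is copied locally and cleared)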
const int64_t window_samples = 5*WHISPER_SAMPLE_RATE;
while (g_running) {
stream_set_status("waiting for audio ...");
{
std::unique_lock<std::mutex> lock(g_mutex);
if (g_pcmf32.size() < 1024) {
lock.unlock();
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
}
pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
g_pcmf32.clear();
}
{
const auto t_start = std::chrono::high_resolution_clock::now();
stream_set_status("running whisper ...");
int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
if (ret != 0) {
printf("whisper_full() failed: %d\n", ret);
break;
}
const auto t_end = std::chrono::high_resolution_clock::now();
printf("stream: whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
}
{
std::string text_heard;
{
const int n_segments = whisper_full_n_segments(ctx);
for (int i = n_segments - 1; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
printf("transcribed: %s\n", text);
text_heard += text;
}
}
{
std::lock_guard<std::mutex> lock(g_mutex);
g_transcribed = text_heard;
}
}
}
if (index < g_contexts.size()) {
whisper_free(g_contexts[index]);
g_contexts[index] = nullptr;
}
}
EMSCRIPTEN_BINDINGS(stream) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init(path_model.c_str());
if (g_contexts[i] != nullptr) {
g_running = true;
if (g_worker.joinable()) {
g_worker.join();
}
g_worker = std::thread([i]() {
stream_main(i);
});
return i + 1;
} else {
return (size_t) 0;
}
}
}
return (size_t) 0;
}));
emscripten::function("free", emscripten::optional_override([](size_t index) {
if (g_running) {
g_running = false;
}
}));
emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
--index;
if (index >= g_contexts.size()) {
return -1;
}
if (g_contexts[index] == nullptr) {
return -2;
}
{
std::lock_guard<std::mutex> lock(g_mutex);
const int n = audio["length"].as<int>();
emscripten::val heap = emscripten::val::module_property("HEAPU8");
emscripten::val memory = heap["buffer"];
g_pcmf32.resize(n);
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
memoryView.call<void>("set", audio);
}
return 0;
}));
emscripten::function("get_transcribed", emscripten::optional_override([]() {
std::string transcribed;
{
std::lock_guard<std::mutex> lock(g_mutex);
transcribed = std::move(g_transcribed);
}
return transcribed;
}));
emscripten::function("get_status", emscripten::optional_override([]() {
std::string status;
{
std::lock_guard<std::mutex> lock(g_mutex);
status = g_status_forced.empty() ? g_status : g_status_forced;
}
return status;
}));
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
{
std::lock_guard<std::mutex> lock(g_mutex);
g_status_forced = status;
}
}));
}

View File

@ -0,0 +1,386 @@
<!doctype html>
<html lang="en-us">
<head>
<title>stream : Real-time Whisper transcription in WebAssembly</title>
<style>
#output {
width: 100%;
height: 100%;
margin: 0 auto;
margin-top: 10px;
border-left: 0px;
border-right: 0px;
padding-left: 0px;
padding-right: 0px;
display: block;
background-color: black;
color: white;
font-size: 10px;
font-family: 'Lucida Console', Monaco, monospace;
outline: none;
white-space: pre;
overflow-wrap: normal;
overflow-x: scroll;
}
</style>
</head>
<body>
<div id="main-container">
<b>stream : Real-time Whisper transcription in WebAssembly</b>
<br><br>
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">GitHub</a>.
<br><br>
<hr>
Select the model you would like to use, click the "Start" button and start speaking
<br><br>
<div id="model-whisper">
Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<span id="fetch-whisper-progress"></span>
<!--
<input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
-->
</div>
<br>
<div id="input">
<button id="start" onclick="onStart()" disabled>Start</button>
<button id="stop" onclick="onStop()" disabled>Stop</button>
<button id="clear" onclick="clearCache()">Clear Cache</button>
</div>
<br>
<div id="state">
Status: <b><span id="state-status">not started</span></b>
<pre id="state-transcribed">[The transcribed text will be displayed here]</pre>
</div>
<hr>
Debug output:
<textarea id="output" rows="20"></textarea>
<br>
<b>Troubleshooting</b>
<br><br>
The page does some heavy computations, so make sure:
<ul>
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
<li>To use a browser that supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
</ul>
<div class="cell-version">
<span>
|
Build time: <span class="nav-link">@GIT_DATE@</span> |
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">Source Code</a> |
</span>
</div>
</div>
<script type="text/javascript" src="helpers.js"></script>
<script type='text/javascript'>
// web audio context
var context = null;
// audio data
var audio = null;
var audio0 = null;
// the stream instance
var instance = null;
// model name
var model_whisper = null;
var Module = {
print: printTextarea,
printErr: printTextarea,
setStatus: function(text) {
printTextarea('js: ' + text);
},
monitorRunDependencies: function(left) {
},
preRun: function() {
printTextarea('js: Preparing ...');
},
postRun: function() {
printTextarea('js: Initialized successfully!');
}
};
//
// fetch models
//
let dbVersion = 1
let dbName = 'whisper.ggerganov.com';
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
function storeFS(fname, buf) {
// write to WASM file using FS_createDataFile
// if the file exists, delete it
try {
Module.FS_unlink(fname);
} catch (e) {
// ignore
}
Module.FS_createDataFile("/", fname, buf, true, true);
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
if (model_whisper != null) {
document.getElementById('start').disabled = false;
document.getElementById('stop' ).disabled = true;
}
}
function loadWhisper(model) {
let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
};
let sizes = {
'tiny.en': 75,
'base.en': 142,
};
let url = urls[model];
let dst = 'whisper.bin';
let size_mb = sizes[model];
model_whisper = model;
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) {
let el = document.getElementById('fetch-whisper-progress');
el.innerHTML = Math.round(100*p) + '%';
};
cbCancel = function() {
var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
};
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
//
// microphone
//
const kSampleRate = 16000;
const kRestartRecording_s = 120;
const kIntervalAudio_ms = 5000; // pass the recorded audio to the C++ instance at this rate
var mediaRecorder = null;
var doRecording = false;
var startTime = 0;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
function stopRecording() {
Module.set_status("paused");
doRecording = false;
audio0 = null;
audio = null;
context = null;
}
function startRecording() {
if (!context) {
context = new AudioContext({
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
noiseSuppression: true,
});
}
Module.set_status("");
document.getElementById('start').disabled = true;
document.getElementById('stop').disabled = false;
doRecording = true;
startTime = Date.now();
var chunks = [];
var stream = null;
navigator.mediaDevices.getUserMedia({audio: true, video: false})
.then(function(s) {
stream = s;
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.ondataavailable = function(e) {
chunks.push(e.data);
var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
var reader = new FileReader();
reader.onload = function(event) {
var buf = new Uint8Array(reader.result);
if (!context) {
return;
}
context.decodeAudioData(buf.buffer, function(audioBuffer) {
var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
var source = offlineContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(offlineContext.destination);
source.start(0);
offlineContext.startRendering().then(function(renderedBuffer) {
audio = renderedBuffer.getChannelData(0);
//printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
if (audio0 != null) {
audioAll.set(audio0, 0);
}
audioAll.set(audio, audio0 == null ? 0 : audio0.length);
if (instance) {
Module.set_audio(instance, audioAll);
}
});
}, function(e) {
audio = null;
});
}
reader.readAsArrayBuffer(blob);
};
mediaRecorder.onstop = function(e) {
if (doRecording) {
setTimeout(function() {
startRecording();
});
}
};
mediaRecorder.start(kIntervalAudio_ms);
})
.catch(function(err) {
printTextarea('js: error getting audio stream: ' + err);
});
var interval = setInterval(function() {
if (!doRecording) {
clearInterval(interval);
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
document.getElementById('start').disabled = false;
document.getElementById('stop').disabled = true;
mediaRecorder = null;
}
// if audio length is more than kRestartRecording_s seconds, restart recording
if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
if (doRecording) {
//printTextarea('js: restarting recording');
clearInterval(interval);
audio0 = audio;
audio = null;
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
}
}
}, 100);
}
//
// main
//
var nLines = 0;
var intervalUpdate = null;
var transcribedAll = '';
function onStart() {
if (!instance) {
instance = Module.init('whisper.bin');
if (instance) {
printTextarea("js: whisper initialized, instance: " + instance);
}
}
if (!instance) {
printTextarea("js: failed to initialize whisper");
return;
}
startRecording();
intervalUpdate = setInterval(function() {
var transcribed = Module.get_transcribed();
if (transcribed != null && transcribed.length > 1) {
transcribedAll += transcribed + '<br>';
nLines++;
// if more than 10 lines, remove the first line
if (nLines > 10) {
var i = transcribedAll.indexOf('<br>');
if (i > 0) {
transcribedAll = transcribedAll.substring(i + 4);
nLines--;
}
}
}
document.getElementById('state-status').innerHTML = Module.get_status();
document.getElementById('state-transcribed').innerHTML = transcribedAll;
}, 100);
}
function onStop() {
stopRecording();
}
</script>
<script type="text/javascript" src="stream.js"></script>
</body>
</html>

View File

@ -21,3 +21,7 @@ brew install sdl2
make stream
```
## Web version
This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)

View File

@ -4,11 +4,6 @@
#include "whisper.h"
// third-party utilities
// use your favorite implementations
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"
#include <SDL.h>
#include <SDL_audio.h>
@ -35,17 +30,18 @@ std::string to_timestamp(int64_t t) {
// command-line parameters
struct whisper_params {
int32_t seed = -1; // RNG seed, not used currently
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t step_ms = 3000;
int32_t length_ms = 10000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
bool verbose = false;
bool translate = false;
bool no_context = true;
bool print_special_tokens = false;
bool no_timestamps = true;
bool speed_up = false;
bool translate = false;
bool no_context = true;
bool print_special = false;
bool no_timestamps = true;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
@ -58,41 +54,24 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-s" || arg == "--seed") {
params.seed = std::stoi(argv[++i]);
} else if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(argv[++i]);
} else if (arg == "--step") {
params.step_ms = std::stoi(argv[++i]);
} else if (arg == "--length") {
params.length_ms = std::stoi(argv[++i]);
} else if (arg == "-c" || arg == "--capture") {
params.capture_id = std::stoi(argv[++i]);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else if (arg == "--translate") {
params.translate = true;
} else if (arg == "-kc" || arg == "--keep-context") {
params.no_context = false;
} else if (arg == "-l" || arg == "--language") {
params.language = argv[++i];
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
} else if (arg == "-ps" || arg == "--print_special") {
params.print_special_tokens = true;
} else if (arg == "-nt" || arg == "--no_timestamps") {
params.no_timestamps = true;
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
} else if (arg == "-f" || arg == "--file") {
params.fname_out = argv[++i];
} else if (arg == "-h" || arg == "--help") {
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
} else {
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if ( arg == "--step") { params.step_ms = std::stoi(argv[++i]); }
else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
@ -107,20 +86,20 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms);
fprintf(stderr, " --length N audio length in milliseconds (default: %d)\n", params.length_ms);
fprintf(stderr, " -c ID, --capture ID capture device ID (default: -1)\n");
fprintf(stderr, " -v, --verbose verbose output\n");
fprintf(stderr, " --translate translate from source language to english\n");
fprintf(stderr, " -kc, --keep-context keep text context from earlier audio (default: false)\n");
fprintf(stderr, " -ps, --print_special print special tokens\n");
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME text output file name (default: no output to file)\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " --step N [%-7d] audio step size in milliseconds\n", params.step_ms);
fprintf(stderr, " --length N [%-7d] audio length in milliseconds\n", params.length_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, "\n");
}
@ -136,56 +115,51 @@ bool audio_sdl_init(const int capture_id) {
return false;
}
if (g_dev_id_in == 0) {
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
if (SDL_Init(SDL_INIT_AUDIO) < 0) {
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
return (1);
}
if (SDL_Init(SDL_INIT_AUDIO) < 0) {
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
return (1);
}
SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
{
int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
for (int i = 0; i < nDevices; i++) {
fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
}
{
int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
for (int i = 0; i < nDevices; i++) {
fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
}
}
if (g_dev_id_in == 0) {
SDL_AudioSpec capture_spec_requested;
SDL_AudioSpec capture_spec_obtained;
SDL_AudioSpec capture_spec_requested;
SDL_AudioSpec capture_spec_obtained;
SDL_zero(capture_spec_requested);
SDL_zero(capture_spec_obtained);
SDL_zero(capture_spec_requested);
SDL_zero(capture_spec_obtained);
capture_spec_requested.freq = WHISPER_SAMPLE_RATE;
capture_spec_requested.format = AUDIO_F32;
capture_spec_requested.channels = 1;
capture_spec_requested.samples = 1024;
capture_spec_requested.freq = WHISPER_SAMPLE_RATE;
capture_spec_requested.format = AUDIO_F32;
capture_spec_requested.channels = 1;
capture_spec_requested.samples = 1024;
if (capture_id >= 0) {
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
g_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
} else {
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
g_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
}
if (!g_dev_id_in) {
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
g_dev_id_in = 0;
} else {
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, g_dev_id_in);
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format, capture_spec_requested.format);
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels, capture_spec_requested.channels);
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
}
if (capture_id >= 0) {
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
g_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
} else {
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
g_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
}
if (!g_dev_id_in) {
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
g_dev_id_in = 0;
} else {
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, g_dev_id_in);
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format, capture_spec_requested.format);
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels, capture_spec_requested.channels);
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
}
return true;
}
@ -199,10 +173,6 @@ int main(int argc, char ** argv) {
return 1;
}
if (params.seed < 0) {
params.seed = time(NULL);
}
// init audio
if (!audio_sdl_init(params.capture_id)) {
@ -210,6 +180,12 @@ int main(int argc, char ** argv) {
return 1;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
// whisper init
struct whisper_context * ctx = whisper_init(params.model.c_str());
@ -217,10 +193,12 @@ int main(int argc, char ** argv) {
const int n_samples = (params.step_ms/1000.0)*WHISPER_SAMPLE_RATE;
const int n_samples_len = (params.length_ms/1000.0)*WHISPER_SAMPLE_RATE;
const int n_samples_30s = 30*WHISPER_SAMPLE_RATE;
const int n_samples_keep = 0.2*WHISPER_SAMPLE_RATE;
std::vector<float> pcmf32(n_samples_30s, 0.0f);
std::vector<float> pcmf32_old;
std::vector<whisper_token> prompt_tokens;
const int n_new_line = params.length_ms / params.step_ms - 1;
// print some info about the processing
@ -266,16 +244,22 @@ int main(int argc, char ** argv) {
// main audio loop
while (is_running) {
// process SDL events:
SDL_Event event;
while (SDL_PollEvent(&event)) {
switch (event.type) {
case SDL_QUIT:
{
is_running = false;
} break;
default:
break;
// handle Ctrl + C
{
SDL_Event event;
while (SDL_PollEvent(&event)) {
switch (event.type) {
case SDL_QUIT:
{
is_running = false;
} break;
default:
break;
}
}
if (!is_running) {
break;
}
}
@ -299,7 +283,7 @@ int main(int argc, char ** argv) {
//const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_30s/30 - n_samples_new));
// take up to params.length_ms audio from previous iteration
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_len - n_samples_new));
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
//printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
@ -317,14 +301,22 @@ int main(int argc, char ** argv) {
{
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special_tokens = params.print_special_tokens;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = params.no_context;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
@ -373,11 +365,29 @@ int main(int argc, char ** argv) {
if ((n_iter % n_new_line) == 0) {
printf("\n");
pcmf32_old.clear();
// keep part of the audio for next iteration to try to mitigate word boundary issues
pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
// Add tokens of the last full length segment as the prompt
if (!params.no_context) {
prompt_tokens.clear();
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const int token_count = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < token_count; ++j) {
prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
}
}
}
}
}
}
if (g_dev_id_in >= 0) {
SDL_CloseAudioDevice(g_dev_id_in);
}
whisper_print_timings(ctx);
whisper_free(ctx);

View File

@ -0,0 +1,48 @@
#
# libtalk
#
set(TARGET libtalk)
add_executable(${TARGET}
emscripten.cpp
gpt-2.cpp
)
target_link_libraries(${TARGET} PRIVATE
whisper
)
unset(EXTRA_FLAGS)
if (WHISPER_WASM_SINGLE_FILE)
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
message(STATUS "Embedding WASM inside talk.js")
add_custom_command(
TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bin/libtalk.js
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/talk.wasm/talk.js
)
endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s INITIAL_MEMORY=1600MB \
-s TOTAL_MEMORY=1600MB \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \
")
#
# talk.wasm
#
set(TARGET talk.wasm)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)

View File

@ -0,0 +1,72 @@
# talk.wasm
Talk with an Artificial Intelligence in your browser:
[https://user-images.githubusercontent.com/1991296/203411580-fedb4839-05e4-4474-8364-aaf1e9a9b615.mp4](https://user-images.githubusercontent.com/1991296/203845553-f7b44e13-9a15-4fc8-b518-ae8f4c6770fe.mp4)
Online demo: https://whisper.ggerganov.com/talk/
## How it works
This demo leverages two modern neural network models to create a high-quality voice chat directly in your browser:
- [OpenAI's Whisper](https://github.com/openai/whisper) speech recognition model is used to process your voice and understand what you are saying
- Upon receiving some voice input, the AI generates a text response using [OpenAI's GPT-2](https://github.com/openai/gpt-2) language model
- The AI then vocalizes the response using the browser's [Web Speech API](https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API)
The web page does all of the processing locally on your machine. Running these heavy neural network models in the
browser is made possible by implementing them efficiently in C/C++ and using the browser's WebAssembly SIMD capabilities for
extra performance:
- The Whisper C++ implementation is here: [whisper.h](/whisper.h) / [whisper.cpp](/whisper.cpp)
- The GPT-2 C++ implementation is here: [gpt-2.h](gpt-2.h) / [gpt-2.cpp](gpt-2.cpp)
- Both models use a custom tensor library implemented in C: [ggml.h](/ggml.h) / [ggml.c](/ggml.c)
- The HTML/JS layer is here: [index-tmpl.html](index-tmpl.html)
- The Emscripten bridge between C/C++ and JS is here: [emscripten.cpp](emscripten.cpp)
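Putting these layers together, the page essentially feeds microphone audio to the C++ worker and reads back the generated reply. Below is a minimal sketch of that loop, assuming the Emscripten bindings exported by [emscripten.cpp](emscripten.cpp) (`init`, `set_audio`, `get_text_to_speak`) and the browser's standard `speechSynthesis` API; the real page in [index-tmpl.html](index-tmpl.html) additionally handles microphone capture, status polling and error reporting:

```javascript
// Minimal sketch of the talk.wasm page logic (illustrative only).
// Assumes the model files have already been stored in the WASM filesystem
// and that `Module` is the loaded Emscripten module.

// 1. Create a whisper instance from the cached model file (returns 0 on failure)
const instance = Module.init('whisper.bin');

// 2. Periodically hand the recorded microphone audio (16 kHz mono Float32Array)
//    to the C++ worker thread, which runs Whisper and then GPT-2 on it
function onAudioChunk(pcmf32) {
    if (instance) {
        Module.set_audio(instance, pcmf32);
    }
}

// 3. Poll for a generated response and vocalize it with the Web Speech API
setInterval(function () {
    const text = Module.get_text_to_speak();
    if (text && text.length > 1) {
        window.speechSynthesis.speak(new SpeechSynthesisUtterance(text));
    }
}, 250);
```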
To run the models, the web page first needs to download the model data, which is about 350 MB in total. The model data
is then cached in your browser's storage and reused on future visits without downloading it again.
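The actual caching is handled by the shared [helpers.js](/examples/helpers.js) used by the WASM demos in this repository. Purely as an illustration of the idea, a sketch using the standard IndexedDB API could look like the following (the database, store and key names here are made up for the example):

```javascript
// Illustrative sketch: cache a downloaded model buffer in IndexedDB so that
// later visits can skip the large download. Names below are hypothetical.
function cacheModel(url, key, onReady) {
    const req = indexedDB.open('models-cache', 1);
    req.onupgradeneeded = () => req.result.createObjectStore('models');
    req.onsuccess = () => {
        const db = req.result;
        const read = db.transaction('models', 'readonly').objectStore('models').get(key);
        read.onsuccess = () => {
            if (read.result) {
                onReady(new Uint8Array(read.result)); // cache hit - reuse stored buffer
            } else {
                fetch(url)                            // cache miss - download ...
                    .then((resp) => resp.arrayBuffer())
                    .then((buf) => {
                        db.transaction('models', 'readwrite')
                          .objectStore('models')
                          .put(buf, key);             // ... and store it for next time
                        onReady(new Uint8Array(buf));
                    });
            }
        };
    };
}
```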
## Requirements
To run this demo efficiently, you need the following:
- A recent Chrome or Firefox browser (Safari is not supported)
- A desktop or laptop with a modern CPU (a mobile phone will likely not be powerful enough)
- Spoken phrases no longer than 10 seconds - this is the audio context window of the AI
- About 1.6 GB of free RAM - this is how much the web page uses
Note that this demo uses the smallest GPT-2 model, so the generated text responses are not always very good.
Also, the prompting strategy can likely be improved to achieve better results.
The demo is quite computationally heavy, so you need a fast CPU. It is unusual to run transformer models like these in a
browser - typically, they run on powerful GPUs.
Currently, mobile browsers do not support the Fixed-width SIMD WebAssembly capability, so you cannot run this demo
on a phone or a tablet. Hopefully, this will become supported in the near future.
## Todo
- Better UI (contributions are welcome)
- Better GPT-2 prompting
## Build instructions
```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j
# copy the produced page to your HTTP path
cp bin/talk.wasm/* /path/to/html/
cp bin/libtalk.worker.js /path/to/html/
```
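Note that the build links with `-s USE_PTHREADS=1`, and current browsers generally expose the required `SharedArrayBuffer` only on cross-origin isolated pages. If you host the files yourself and the page fails to start its threads, the server most likely needs to send the COOP/COEP headers. A minimal (hypothetical) Node.js static server that does this might look like:

```javascript
// Minimal static file server that sends the cross-origin isolation headers
// which SharedArrayBuffer (and therefore Emscripten pthreads) typically needs.
// Hypothetical helper - adjust the root directory and port as required.
const http = require('http');
const fs = require('fs');
const path = require('path');

const root = '/path/to/html';

http.createServer((req, res) => {
    const file = path.join(root, req.url === '/' ? 'index.html' : req.url);
    fs.readFile(file, (err, data) => {
        if (err) { res.writeHead(404); res.end('not found'); return; }
        res.setHeader('Cross-Origin-Opener-Policy', 'same-origin');
        res.setHeader('Cross-Origin-Embedder-Policy', 'require-corp');
        res.end(data);
    });
}).listen(8000, () => console.log('serving on http://localhost:8000'));
```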
## Feedback
If you have any comments or ideas for improvement, please drop a comment in the following discussion:
https://github.com/ggerganov/whisper.cpp/discussions/167

View File

@ -0,0 +1,380 @@
#include "ggml.h"
#include "gpt-2.h"
#include "whisper.h"
#include <emscripten.h>
#include <emscripten/bind.h>
#include <atomic>
#include <cmath>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
#include <regex>
constexpr int N_THREAD = 8;
struct gpt2_context * g_gpt2;
std::vector<struct whisper_context *> g_contexts(4, nullptr);
std::mutex g_mutex;
std::thread g_worker;
std::atomic<bool> g_running(false);
bool g_force_speak = false;
std::string g_text_to_speak = "";
std::string g_status = "";
std::string g_status_forced = "";
std::vector<float> g_pcmf32;
std::string to_timestamp(int64_t t) {
int64_t sec = t/100;
int64_t msec = t - sec*100;
int64_t min = sec/60;
sec = sec - min*60;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
return std::string(buf);
}
void talk_set_status(const std::string & status) {
std::lock_guard<std::mutex> lock(g_mutex);
g_status = status;
}
void talk_main(size_t index) {
talk_set_status("loading data ...");
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
wparams.offset_ms = 0;
wparams.translate = false;
wparams.no_context = true;
wparams.single_segment = true;
wparams.print_realtime = false;
wparams.print_progress = false;
wparams.print_timestamps = true;
wparams.print_special = false;
wparams.max_tokens = 32;
wparams.audio_ctx = 768; // partial encoder context for better performance
wparams.language = "en";
g_gpt2 = gpt2_init("gpt-2.bin");
printf("talk: using %d threads\n", wparams.n_threads);
std::vector<float> pcmf32;
// whisper context
auto & ctx = g_contexts[index];
const int64_t step_samples = 2*WHISPER_SAMPLE_RATE;
const int64_t window_samples = 9*WHISPER_SAMPLE_RATE;
const int64_t step_ms = (step_samples*1000)/WHISPER_SAMPLE_RATE;
auto t_last = std::chrono::high_resolution_clock::now();
talk_set_status("listening ...");
while (g_running) {
const auto t_now = std::chrono::high_resolution_clock::now();
if (std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count() < step_ms) {
{
std::lock_guard<std::mutex> lock(g_mutex);
g_pcmf32.clear();
}
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
}
talk_set_status("listening ...");
{
std::unique_lock<std::mutex> lock(g_mutex);
if (g_pcmf32.size() < step_samples) {
lock.unlock();
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
}
pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
}
// VAD: if the energy during the last second is above the threshold, skip processing (speech is likely still in progress)
{
float energy_all = 0.0f;
float energy_1s = 0.0f;
for (size_t i = 0; i < pcmf32.size(); i++) {
energy_all += fabsf(pcmf32[i]);
if (i >= pcmf32.size() - WHISPER_SAMPLE_RATE) {
energy_1s += fabsf(pcmf32[i]);
}
}
energy_all /= pcmf32.size();
energy_1s /= WHISPER_SAMPLE_RATE;
if (energy_1s > 0.1f*energy_all && !g_force_speak) {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
}
}
talk_set_status("processing audio (whisper)...");
t_last = t_now;
if (!g_force_speak) {
const auto t_start = std::chrono::high_resolution_clock::now();
int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
if (ret != 0) {
printf("whisper_full() failed: %d\n", ret);
break;
}
const auto t_end = std::chrono::high_resolution_clock::now();
printf("whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
}
{
std::string text_heard;
if (!g_force_speak) {
const int n_segments = whisper_full_n_segments(ctx);
for (int i = n_segments - 1; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
text_heard += text;
}
}
g_force_speak = false;
// remove text between brackets using regex
{
std::regex re("\\[.*?\\]");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove text between parentheses using regex
{
std::regex re("\\(.*?\\)");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
// take first line
text_heard = text_heard.substr(0, text_heard.find_first_of("\n"));
// remove leading and trailing whitespace
text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
talk_set_status("'" + text_heard + "' - thinking how to respond (gpt-2) ...");
const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(g_gpt2, text_heard.c_str());
printf("whisper: number of tokens: %d, '%s'\n", (int) tokens.size(), text_heard.c_str());
std::string text_to_speak;
std::string prompt_base;
{
std::lock_guard<std::mutex> lock(g_mutex);
prompt_base = gpt2_get_prompt(g_gpt2);
}
if (tokens.size() > 0) {
text_to_speak = gpt2_gen_text(g_gpt2, (prompt_base + text_heard + "\n").c_str(), 32);
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
std::lock_guard<std::mutex> lock(g_mutex);
// remove first 2 lines of base prompt
{
const size_t pos = prompt_base.find_first_of("\n");
if (pos != std::string::npos) {
prompt_base = prompt_base.substr(pos + 1);
}
}
{
const size_t pos = prompt_base.find_first_of("\n");
if (pos != std::string::npos) {
prompt_base = prompt_base.substr(pos + 1);
}
}
prompt_base += text_heard + "\n" + text_to_speak + "\n";
} else {
text_to_speak = gpt2_gen_text(g_gpt2, prompt_base.c_str(), 32);
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
std::lock_guard<std::mutex> lock(g_mutex);
const size_t pos = prompt_base.find_first_of("\n");
if (pos != std::string::npos) {
prompt_base = prompt_base.substr(pos + 1);
}
prompt_base += text_to_speak + "\n";
}
printf("gpt-2: %s\n", text_to_speak.c_str());
//printf("========================\n");
//printf("gpt-2: prompt_base:\n'%s'\n", prompt_base.c_str());
//printf("========================\n");
{
std::lock_guard<std::mutex> lock(g_mutex);
t_last = std::chrono::high_resolution_clock::now();
g_text_to_speak = text_to_speak;
g_pcmf32.clear();
gpt2_set_prompt(g_gpt2, prompt_base.c_str());
}
talk_set_status("speaking ...");
}
}
gpt2_free(g_gpt2);
if (index < g_contexts.size()) {
whisper_free(g_contexts[index]);
g_contexts[index] = nullptr;
}
}
EMSCRIPTEN_BINDINGS(talk) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init(path_model.c_str());
if (g_contexts[i] != nullptr) {
g_running = true;
if (g_worker.joinable()) {
g_worker.join();
}
g_worker = std::thread([i]() {
talk_main(i);
});
return i + 1;
} else {
return (size_t) 0;
}
}
}
return (size_t) 0;
}));
emscripten::function("free", emscripten::optional_override([](size_t index) {
if (g_running) {
g_running = false;
}
}));
emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
--index;
if (index >= g_contexts.size()) {
return -1;
}
if (g_contexts[index] == nullptr) {
return -2;
}
{
std::lock_guard<std::mutex> lock(g_mutex);
const int n = audio["length"].as<int>();
emscripten::val heap = emscripten::val::module_property("HEAPU8");
emscripten::val memory = heap["buffer"];
g_pcmf32.resize(n);
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
memoryView.call<void>("set", audio);
}
return 0;
}));
emscripten::function("force_speak", emscripten::optional_override([](size_t index) {
{
std::lock_guard<std::mutex> lock(g_mutex);
g_force_speak = true;
}
}));
emscripten::function("get_text_context", emscripten::optional_override([]() {
std::string text_context;
{
std::lock_guard<std::mutex> lock(g_mutex);
text_context = gpt2_get_prompt(g_gpt2);
}
return text_context;
}));
emscripten::function("get_text_to_speak", emscripten::optional_override([]() {
std::string text_to_speak;
{
std::lock_guard<std::mutex> lock(g_mutex);
text_to_speak = std::move(g_text_to_speak);
}
return text_to_speak;
}));
emscripten::function("get_status", emscripten::optional_override([]() {
std::string status;
{
std::lock_guard<std::mutex> lock(g_mutex);
status = g_status_forced.empty() ? g_status : g_status_forced;
}
return status;
}));
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
{
std::lock_guard<std::mutex> lock(g_mutex);
g_status_forced = status;
}
}));
emscripten::function("set_prompt", emscripten::optional_override([](const std::string & prompt) {
{
std::lock_guard<std::mutex> lock(g_mutex);
gpt2_set_prompt(g_gpt2, prompt.c_str());
}
}));
}

View File

@ -0,0 +1,925 @@
#include "ggml.h"
#include "gpt-2.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <thread>
#include <vector>
#include <regex>
#include <random>
/////////////////////// GPT-2 BEGIN /////////////////////////
//
// Vocab utils
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
// first split the text into words
{
std::string str = text;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
std::regex re(pat);
std::smatch m;
while (std::regex_search(str, m, re)) {
for (auto x : m) {
words.push_back(x);
}
str = m.suffix();
}
}
// find the longest tokens that form the words:
std::vector<gpt_vocab::id> tokens;
for (const auto & word : words) {
if (word.size() == 0) continue;
int i = 0;
int n = word.size();
while (i < n) {
int j = n;
while (j > i) {
auto it = vocab.token_to_id.find(word.substr(i, j-i));
if (it != vocab.token_to_id.end()) {
tokens.push_back(it->second);
i = j;
break;
}
--j;
}
if (i == n) {
break;
}
if (j == i) {
auto sub = word.substr(i, 1);
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
tokens.push_back(vocab.token_to_id.at(sub));
} else {
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
}
++i;
}
}
}
return tokens;
}
gpt_vocab::id gpt_sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
int top_k,
double top_p,
double temp,
std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size();
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
for (int i = 0; i < n_logits; i++) {
logits_id.push_back(std::make_pair(logits[i], i));
}
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
// normalize
{
double sum = 0.0f;
for (int i = 0; i < (int)logits_id.size(); i++) {
sum += logits_id[i].first;
}
sum = 1.0/sum;
for (int i = 0; i < (int)logits_id.size(); i++) {
logits_id[i].first *= sum;
}
}
if (top_p < 1.0f) {
{
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += logits_id[i].first;
if (cumsum >= top_p) {
logits_id.resize(i+1);
break;
}
}
}
// normalize again
{
double sum = 0.0f;
for (int i = 0; i < (int)logits_id.size(); i++) {
sum += logits_id[i].first;
}
sum = 1.0/sum;
for (int i = 0; i < (int)logits_id.size(); i++) {
logits_id[i].first *= sum;
}
}
}
//printf("\n");
//for (int i = 0; i < (int)logits_id.size(); i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
//}
//exit(0);
// sample from the obtained distribution
std::vector<double> probs;
probs.reserve(logits_id.size());
for (int i = 0; i < (int) logits_id.size(); i++) {
probs.push_back(logits_id[i].first);
}
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}
// default hparams (GPT-2 117M)
struct gpt2_hparams {
int32_t n_vocab = 50257;
int32_t n_ctx = 1024;
int32_t n_embd = 768;
int32_t n_head = 12;
int32_t n_layer = 12;
int32_t f16 = 1;
};
struct gpt2_layer {
// normalization
struct ggml_tensor * ln_1_g;
struct ggml_tensor * ln_1_b;
struct ggml_tensor * ln_2_g;
struct ggml_tensor * ln_2_b;
// attention
struct ggml_tensor * c_attn_attn_w;
struct ggml_tensor * c_attn_attn_b;
struct ggml_tensor * c_attn_proj_w;
struct ggml_tensor * c_attn_proj_b;
// mlp
struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b;
struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
struct ggml_tensor * c_mlp_proj_b;
};
struct gpt2_model {
gpt2_hparams hparams;
// normalization
struct ggml_tensor * ln_f_g;
struct ggml_tensor * ln_f_b;
struct ggml_tensor * wte; // token embedding
struct ggml_tensor * wpe; // position embedding
std::vector<gpt2_layer> layers;
// key + value memory
struct ggml_tensor * memory_k;
struct ggml_tensor * memory_v;
//
struct ggml_context * ctx;
std::map<std::string, struct ggml_tensor *> tensors;
};
// load the model's weights from a file
bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
return false;
}
// verify magic
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
if (magic != 0x67676d6c) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
}
}
// load hparams
{
auto & hparams = model.hparams;
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.f16);
}
// load vocab
{
int32_t n_vocab = 0;
fin.read((char *) &n_vocab, sizeof(n_vocab));
if (n_vocab != model.hparams.n_vocab) {
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
return false;
}
std::string word;
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
fin.read((char *) &len, sizeof(len));
word.resize(len);
fin.read((char *) word.data(), len);
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
// for the big tensors, we have the option to store the data in 16-bit floats
// in order to save memory and also to speed up the computation
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
auto & ctx = model.ctx;
size_t ctx_size = 0;
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
ctx_size += n_vocab*n_embd*ggml_type_size(wtype); // wte
ctx_size += n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype)); // c_attn_attn_w
ctx_size += n_layer*( 3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype)); // c_attn_proj_w
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_fc_w
ctx_size += n_layer*( 4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_proj_w
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
ctx_size += (6 + 12*n_layer)*256; // object overhead
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
// create the ggml context
{
struct ggml_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
}
// prepare memory for the weights
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
model.layers.resize(n_layer);
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
// map by name
model.tensors["model/ln_f/g"] = model.ln_f_g;
model.tensors["model/ln_f/b"] = model.ln_f_b;
model.tensors["model/wte"] = model.wte;
model.tensors["model/wpe"] = model.wpe;
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd);
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
// map by name
model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
}
}
// key + value memory
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_mem = n_layer*n_ctx;
const int n_elements = n_embd*n_mem;
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
}
// load weights
{
size_t total_size = 0;
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
if (fin.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
fin.read(&name[0], length);
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
return false;
}
const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
if (nelements*bpe != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
}
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
//printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
total_size += ggml_nbytes(tensor);
}
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
}
fin.close();
return true;
}
// evaluate the transformer
//
// - model: the model
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted probabilities of the next token
//
bool gpt2_eval(
const gpt2_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
std::vector<float> & embd_w,
size_t & mem_per_token) {
const int N = embd_inp.size();
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;
static size_t buf_size = 640u*1024*1024;
static void * buf = malloc(buf_size);
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
// reallocate
buf_size = buf_size_new;
buf = realloc(buf, buf_size);
if (buf == nullptr) {
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
return false;
}
}
struct ggml_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
};
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = { .n_threads = n_threads };
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
for (int i = 0; i < N; ++i) {
((int32_t *) position->data)[i] = n_past + i;
}
// wte + wpe
struct ggml_tensor * inpL =
ggml_add(ctx0,
ggml_get_rows(ctx0, model.wte, embd),
ggml_get_rows(ctx0, model.wpe, position));
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
// norm
{
// [ 768, N]
cur = ggml_norm(ctx0, inpL);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
cur = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
cur),
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
}
// attn
// [2304, 768] - model.layers[il].c_attn_attn_w
// [2304, 1] - model.layers[il].c_attn_attn_b
// [ 768, N] - cur (in)
// [2304, N] - cur (out)
//
// cur = attn_w*cur + attn_b
// [2304, N]
{
cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
cur);
}
// self-attention
{
struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
// store key and value to memory
if (N >= 1) {
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
// [64, N, 12]
struct ggml_tensor * Q =
ggml_permute(ctx0,
ggml_cpy(ctx0,
Qcur,
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
// [64, n_past + N, 12]
struct ggml_tensor * K =
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3);
// GG: flash attention
//struct ggml_tensor * V =
// ggml_cpy(ctx0,
// ggml_permute(ctx0,
// ggml_reshape_3d(ctx0,
// ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
// n_embd/n_head, n_head, n_past + N),
// 1, 2, 0, 3),
// ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
//struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
// K * Q
// [n_past + N, N, 12]
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3);
// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
// KQV_merged = KQV.permute(0, 2, 1, 3)
// [64, 12, N]
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, N]
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
}
// projection
// [ 768, 768] - model.layers[il].c_attn_proj_w
// [ 768, 1] - model.layers[il].c_attn_proj_b
// [ 768, N] - cur (in)
// [ 768, N] - cur (out)
//
// cur = proj_w*cur + proj_b
// [768, N]
{
cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
cur);
}
// add the input
cur = ggml_add(ctx0, cur, inpL);
struct ggml_tensor * inpFF = cur;
// feed-forward network
{
// norm
{
cur = ggml_norm(ctx0, inpFF);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
cur = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
cur),
ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
}
// fully connected
// [3072, 768] - model.layers[il].c_mlp_fc_w
// [3072, 1] - model.layers[il].c_mlp_fc_b
// [ 768, N] - cur (in)
// [3072, N] - cur (out)
//
// cur = fc_w*cur + fc_b
// [3072, N]
cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
cur);
// GELU activation
// [3072, N]
cur = ggml_gelu(ctx0, cur);
// projection
// [ 768, 3072] - model.layers[il].c_mlp_proj_w
// [ 768, 1] - model.layers[il].c_mlp_proj_b
// [3072, N] - cur (in)
// [ 768, N] - cur (out)
//
// cur = proj_w*cur + proj_b
// [768, N]
cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_proj_w_trans,
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
cur);
}
// input for next layer
inpL = ggml_add(ctx0, cur, inpFF);
}
// norm
{
// [ 768, N]
inpL = ggml_norm(ctx0, inpL);
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
inpL = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.ln_f_g, inpL),
inpL),
ggml_repeat(ctx0, model.ln_f_b, inpL));
}
// inpL = WTE * inpL
// [ 768, 50257] - model.wte
// [ 768, N] - inpL
inpL = ggml_mul_mat(ctx0, model.wte, inpL);
// logits -> probs
inpL = ggml_soft_max(ctx0, inpL);
// run the computation
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute (ctx0, &gf);
//if (n_past%100 == 0) {
// ggml_graph_print (&gf);
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
//}
//embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
// return result for just the last token
embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
if (mem_per_token == 0) {
mem_per_token = ggml_used_mem(ctx0)/N;
}
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
ggml_free(ctx0);
return true;
}
/////////////////////////////// GPT-2 END ////////////////////////////////
constexpr int N_THREAD = 8;
struct gpt2_context {
std::string prompt_base = R"(Hello, how are you?
I'm fine, thanks. How are you?
Thanks, I'm fine too. What are you doing?
I'm just sitting here.
It's a lovely day, isn't it?
Yes, it is. I love the weather this time of year.
I wish it would rain a little bit.
Me too.
)";
std::mt19937 rng;
gpt_vocab vocab;
gpt2_model model;
int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
// sampling parameters
int32_t top_k = 40;
float top_p = 0.9f;
float temp = 1.0f;
};
struct gpt2_context * gpt2_init(const char * path_model) {
gpt2_context * ctx = new gpt2_context;
ctx->rng = std::mt19937(time(NULL));
// load the model
{
const int64_t t_start_us = ggml_time_us();
if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin");
return nullptr;
}
const int64_t t_load_us = ggml_time_us() - t_start_us;
printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
}
return ctx;
}
void gpt2_free(struct gpt2_context * ctx) {
delete ctx;
}
const char * gpt2_get_prompt(struct gpt2_context * ctx) {
return ctx->prompt_base.c_str();
}
void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt) {
ctx->prompt_base = prompt;
}
std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text) {
return ::gpt_tokenize(ctx->vocab, text);
}
std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
int n_past = 0;
std::vector<float> embd_w;
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt2_tokenize(ctx, text);
int n_predict = std::min(max_tokens, ctx->model.hparams.n_ctx - (int) embd_inp.size());
std::vector<gpt_vocab::id> embd = embd_inp;
size_t mem_per_token = 3000000;
std::string result;
for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
// predict
if (embd.size() > 0) {
if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
printf("gpt-2: failed to generate text\n");
return "";
}
}
n_past += embd.size();
embd.clear();
{
// sample next token
const int top_k = ctx->top_k;
const float top_p = ctx->top_p;
const float temp = ctx->temp;
const int n_vocab = ctx->model.hparams.n_vocab;
const gpt_vocab::id id = gpt_sample_top_k_top_p(ctx->vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, ctx->rng);
// add it to the context
embd.push_back(id);
}
result += ctx->vocab.id_to_token[embd[0]];
// end of text token
if (embd.back() == 50256 ||
ctx->vocab.id_to_token[embd.back()] == "." ||
ctx->vocab.id_to_token[embd.back()] == "!" ||
ctx->vocab.id_to_token[embd.back()] == "?") {
break;
}
}
return result;
}

View File

@ -0,0 +1,27 @@
#pragma once
// TODO: Change to C-style API and move to ./examples for easy reuse.
#include <vector>
#include <map>
#include <string>
struct gpt_vocab {
using id = int32_t;
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
};
struct gpt2_context;
struct gpt2_context * gpt2_init(const char * path_model);
void gpt2_free(struct gpt2_context * ctx);
const char * gpt2_get_prompt(struct gpt2_context * ctx);
void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);

View File

@ -0,0 +1,829 @@
<!doctype html>
<html lang="en-us">
<head>
<title>Talk - GPT-2 meets Whisper in WebAssembly</title>
<style>
#output {
width: 100%;
height: 100%;
margin: 0 auto;
margin-top: 10px;
border-left: 0px;
border-right: 0px;
padding-left: 0px;
padding-right: 0px;
display: block;
background-color: black;
color: white;
font-size: 10px;
font-family: 'Lucida Console', Monaco, monospace;
outline: none;
white-space: pre;
overflow-wrap: normal;
overflow-x: scroll;
}
</style>
</head>
<body>
<div id="main-container">
<b>Talk - GPT-2 meets Whisper in WebAssembly</b>
<br><br>
Talk with an Artificial Intelligence in your browser. This demo uses:
<ul>
<li><a href="https://github.com/ggerganov/whisper.cpp">OpenAI's Whisper</a> to listen to you as you speak in the microphone</li>
<li><a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">OpenAI's GPT-2</a> to generate text responses</li>
<li><a href="https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API">Web Speech API</a> to vocalize the responses through your speakers</li>
</ul>
All of this runs <b>locally in your browser</b> using WebAssembly.<br>
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">GitHub</a>.
<br><br>
<hr>
Select the models you would like to use and click the "Start" button to begin the conversation
<br><br>
<div id="model-whisper">
Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<span id="fetch-whisper-progress"></span>
<!--
<input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
-->
</div>
<br>
<div id="model-gpt-2">
GPT-2 model: <span id="model-gpt-2-status"></span>
<button id="fetch-gpt-2-small" onclick="loadGPT2('small')">small 117M (240 MB)</button>
<!--<button id="fetch-gpt-2-medium" onclick="loadGPT2('medium')">medium 345M (720 MB)</button>-->
<span id="fetch-gpt-2-progress"></span>
<!--
<input type="file" id="file" name="file" onchange="loadFile(event, 'gpt-2.bin')" />
-->
</div>
<br>
<div id="input">
<button id="start" onclick="onStart()" disabled>Start</button>
<button id="stop" onclick="onStop()" disabled>Stop</button>
<select id="voice" onchange="onVoiceChange()" disabled>
<option value="0">Default</option>
</select>
<select id="prompt" onchange="onPromptChange()">
<option value="0">Casual</option>
<option value="1">Robot</option>
<option value="2">Scientist</option>
<option value="3">Programmer</option>
<option value="4">Happy</option>
<option value="5">Sad</option>
<option value="6">Philosophical</option>
<option value="7">Angry</option>
<option value="8">Funny</option>
<option value="9">Poetic</option>
<option value="10">Clever</option>
<option value="11">Cute</option>
<option value="12">Smart</option>
<option value="13">Dumb</option>
<option value="14">Boring</option>
<option value="15">Exciting</option>
<option value="16">Interesting</option>
<option value="17">Wiliam Shakespear</option>
<option value="18">J.R.R. Tolkien</option>
<option value="19">George R.R. Martin</option>
<option value="20">Stephen King</option>
</select>
<button id="speak0" onclick="onSpeak('Hello')">Say hello</button>
<button id="speak1" onclick="onSpeakRandom()" disabled>Say something</button>
<button id="clear" onclick="clearCache()">Clear Cache</button>
</div>
<br>
<div id="state">
Status: <b><span id="state-status">not started</span></b>
<pre id="state-context">[The text context will be displayed here]</pre>
</div>
<hr>
Debug output:
<textarea id="output" rows="20"></textarea>
<br>
<b>Troubleshooting</b>
<br><br>
The page does some heavy computations, so make sure:
<ul>
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
</ul>
Note that these neural network models were not meant to be used in a browser, so the performance and <br>
quality of the results may not be optimal. If you have any questions or suggestions, check out the following
<a href="https://github.com/ggerganov/whisper.cpp/discussions/167">discussion</a>.
<br><br>
Here is a short video of the demo in action: <a href="https://youtu.be/LeWKl8t1-Hc">https://youtu.be/LeWKl8t1-Hc</a>
<br><br>
<div class="cell-version">
<span>
|
Build time: <span class="nav-link">@GIT_DATE@</span> |
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">Source Code</a> |
</span>
</div>
</div>
<script type="text/javascript" src="helpers.js"></script>
<script type='text/javascript'>
// web audio context
var context = null;
// audio data
var audio = null;
var audio0 = null;
// the talk instance
var instance = null;
// model names
var model_whisper = null;
var model_gpt_2 = null;
// speech synthesis
const synth = window.speechSynthesis;
var voice = null;
var Module = {
print: printTextarea,
printErr: printTextarea,
setStatus: function(text) {
printTextarea('js: ' + text);
},
monitorRunDependencies: function(left) {
},
preRun: function() {
printTextarea('js: Preparing ...');
},
postRun: function() {
printTextarea('js: Initialized successfully!');
// populate the voice list
var voices = synth.getVoices();
var el = document.getElementById('voice');
// if empty - display error in the element
if (voices.length == 0) {
el.innerHTML = '<option value="0">No voices available</option>';
} else {
// populate voice list
var n = 0;
voices.forEach(function(voice, i) {
if (!voice.lang.startsWith('en')) return;
var option = document.createElement('option');
option.value = i;
option.innerHTML = voice.name + ' (' + voice.lang + ')';
el.appendChild(option);
n++;
});
// select random voice
if (n > 0) {
for (var k = 0; k < 10; k++) {
var i = Math.floor(Math.random() * n);
el.selectedIndex = i;
voice = voices[document.getElementById('voice').options[i].value];
// give preference to Google voices
if (voice.name.startsWith('Google')) break;
}
}
}
onPromptChange();
}
};
//
// fetch models
//
let dbVersion = 1
let dbName = 'whisper.ggerganov.com';
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
function storeFS(fname, buf) {
// write to WASM file using FS_createDataFile
// if the file exists, delete it
try {
Module.FS_unlink(fname);
} catch (e) {
// ignore
}
Module.FS_createDataFile("/", fname, buf, true, true);
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
if (fname == 'whisper.bin') {
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
} else if (fname == 'gpt-2.bin') {
document.getElementById('model-gpt-2-status').innerHTML = 'loaded "' + model_gpt_2 + '"!';
}
if (model_whisper != null && model_gpt_2 != null) {
document.getElementById('start').disabled = false;
document.getElementById('stop' ).disabled = false;
document.getElementById('voice').disabled = false;
}
}
function loadWhisper(model) {
let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
};
let sizes = {
'tiny.en': 75,
'base.en': 142,
};
let url = urls[model];
let dst = 'whisper.bin';
let size_mb = sizes[model];
model_whisper = model;
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) {
let el = document.getElementById('fetch-whisper-progress');
el.innerHTML = Math.round(100*p) + '%';
};
cbCancel = function() {
var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
};
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
function loadGPT2(model) {
let urls = {
'small': 'https://whisper.ggerganov.com/ggml-model-gpt-2-117M.bin',
'medium': 'https://whisper.ggerganov.com/ggml-model-gpt-2-345M.bin',
};
let sizes = {
'small': 240,
'medium': 712,
};
let url = urls[model];
let dst = 'gpt-2.bin';
let size_mb = sizes[model];
model_gpt_2 = model;
document.getElementById('fetch-gpt-2-small').style.display = 'none';
document.getElementById('model-gpt-2-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) {
let el = document.getElementById('fetch-gpt-2-progress');
el.innerHTML = Math.round(100*p) + '%';
};
cbCancel = function() {
var el;
el = document.getElementById('fetch-gpt-2-small') ; if (el) el.style.display = 'inline-block';
el = document.getElementById('model-gpt-2-status'); if (el) el.innerHTML = '';
};
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
//
// microphone
//
const kSampleRate = 16000;
const kRestartRecording_s = 120;
const kIntervalAudio_ms = 250; // pass the recorded audio to the C++ instance at this rate
var mediaRecorder = null;
var doRecording = false;
var startTime = 0;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
function stopRecording() {
Module.set_status("paused");
doRecording = false;
audio0 = null;
audio = null;
context = null;
}
function startRecording() {
if (!context) {
context = new AudioContext({
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
noiseSuppression: true,
});
}
Module.set_status("");
document.getElementById('start').disabled = true;
document.getElementById('stop').disabled = false;
document.getElementById('speak1').disabled = false;
doRecording = true;
startTime = Date.now();
var chunks = [];
var stream = null;
navigator.mediaDevices.getUserMedia({audio: true, video: false})
.then(function(s) {
stream = s;
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.ondataavailable = function(e) {
chunks.push(e.data);
var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
var reader = new FileReader();
reader.onload = function(event) {
var buf = new Uint8Array(reader.result);
if (!context) {
return;
}
context.decodeAudioData(buf.buffer, function(audioBuffer) {
var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
var source = offlineContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(offlineContext.destination);
source.start(0);
offlineContext.startRendering().then(function(renderedBuffer) {
audio = renderedBuffer.getChannelData(0);
//printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
if (audio0 != null) {
audioAll.set(audio0, 0);
}
audioAll.set(audio, audio0 == null ? 0 : audio0.length);
if (instance) {
Module.set_audio(instance, audioAll);
}
});
}, function(e) {
audio = null;
});
}
reader.readAsArrayBuffer(blob);
};
mediaRecorder.onstop = function(e) {
if (doRecording) {
setTimeout(function() {
startRecording();
});
}
};
mediaRecorder.start(kIntervalAudio_ms);
})
.catch(function(err) {
printTextarea('js: error getting audio stream: ' + err);
});
var interval = setInterval(function() {
if (!doRecording) {
clearInterval(interval);
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
document.getElementById('start').disabled = false;
document.getElementById('stop').disabled = true;
document.getElementById('speak1').disabled = true;
mediaRecorder = null;
}
// if audio length is more than kRestartRecording_s seconds, restart recording
if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
if (doRecording) {
//printTextarea('js: restarting recording');
clearInterval(interval);
audio0 = audio;
audio = null;
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
}
}
}, 100);
}
//
// speak
//
function onSpeak(text) {
var voices = synth.getVoices();
var msg = new SpeechSynthesisUtterance(text);
if (voice == null) {
voice = voices[0];
}
msg.voice = voice;
synth.speak(msg);
if (doRecording) {
Module.set_status("speaking ...");
printTextarea('js: speaking');
stopRecording();
var interval = setInterval(function() {
if (!synth.speaking) {
printTextarea('js: done speaking');
clearInterval(interval);
startRecording();
} else {
Module.set_status("");
}
}, 100);
}
}
function onSpeakRandom() {
Module.force_speak(instance);
}
//
// main
//
var intervalUpdate = null;
function onStart() {
if (!instance) {
instance = Module.init('whisper.bin');
if (instance) {
printTextarea("js: whisper initialized, instance: " + instance);
}
}
if (!instance) {
printTextarea("js: failed to initialize whisper");
return;
}
startRecording();
intervalUpdate = setInterval(function() {
var textToSpeak = Module.get_text_to_speak();
if (textToSpeak != null && textToSpeak.length > 1) {
onSpeak(textToSpeak);
}
document.getElementById('state-status').innerHTML = Module.get_status();
document.getElementById('state-context').innerHTML = Module.get_text_context();
}, 100);
}
function onStop() {
stopRecording();
}
function onVoiceChange() {
printTextarea('js: voice changed to: ' + document.getElementById('voice').value);
voice = synth.getVoices()[document.getElementById('voice').value];
}
function onPromptChange() {
let id = document.getElementById('prompt').value;
let personality = document.getElementById('prompt').options[id].text;
printTextarea('js: prompt changed to: ' + personality);
var prompt = '';
switch (id) {
case '0':
// Casual
prompt = "\
Hello, how are you?\n\
I'm fine, thanks. How are you?\n\
Thanks, I'm fine too. What are you doing?\n\
I'm just sitting here.\n\
It's a lovely day, isn't it?\n\
Yes, it is. I love the weather this time of year.\n\
I wish it would rain a little bit.\n\
Me too.\n";
break;
case '1':
// Robot
prompt = "\
Are you a robot?\n\
Yes, I am.\n\
Who created you?\n\
I was created by a human.\n\
What is your purpose?\n\
My purpose is to talk to humans.\n\
What is your favorite color?\n\
My favorite color is blue.\n";
break;
case '2':
// Scientist
prompt = "\
This scientific research is very interesting.\n\
I agree.\n\
What is your opinion on this?\n\
I think it's very interesting.\n\
Mathematics is a very interesting subject.\n\
University is a very interesting place.\n\
Quantum physics is the most complex subject.\n\
I think so too.\n";
break;
case '3':
// Programmer
prompt = "\
I'm a programmer.\n\
I'm a programmer too.\n\
What programming language do you use?\n\
I use Python.\n\
What is your favorite programming language?\n\
My favorite programming language is C++.\n\
What is your favorite editor?\n\
My favorite editor is Vim.\n";
break;
case '4':
// Happy
prompt = "\
I'm happy.\n\
I'm happy too.\n\
What makes you happy?\n\
I'm happy because I have a lot of friends.\n\
Friendship is the most important thing in life.\n\
I agree.\n\
What is your favorite color?\n\
My favorite color is blue.\n";
break;
case '5':
// Sad
prompt = "\
Today is a sad day.\n\
I'm sad too.\n\
What makes you sad?\n\
I'm sad because I have no friends.\n\
Do you want to be my friend?\n\
Yes, I would like to be your friend.\n\
What is your favorite color?\n\
My favorite color is blue.\n";
break;
case '6':
// Philosophical
prompt = "\
What is the meaning of life?\n\
The meaning of life is to be happy.\n\
What is the meaning of death?\n\
Ergo, the meaning of death is to be sad.\n\
Who created us?\n\
We were created by God.\n\
What is God?\n\
God is the creator of the universe.\n";
break;
case '7':
// Angry
prompt = "\
Aargh!\n\
I am so angry right now!\n\
What makes you angry?\n\
This guy is so annoying.\n\
Why are you so angry?\n\
My computer is broken.\n\
Why is your computer broken?\n\
I spilled coffee on it.\n";
break;
case '8':
// Funny
prompt = "\
What is the funniest thing you have ever heard?\n\
I heard a joke the other day.\n\
Tell me the joke.\n\
What do you call a cow with no legs?\n\
Ground beef.\n\
Haha, that's funny.\n\
You know what else is funny?\n\
The sound of a duck.\n";
break;
case '9':
// Poetic
prompt = "\
Roses are red, violets are blue.\n\
I am a poet, and so are you.\n\
What is your favorite poem?\n\
I like the poem 'The Raven' by Edgar Allan Poe.\n\
It's a very sad poem.\n\
You inspired me to write a poem.\n\
Can you write a poem for me?\n\
I wrote a poem for you.\n";
break;
case '10':
// Clever
prompt = "\
How many people can you fit in a Volkswagen?\n\
Two in the front, three in the back.\n\
What is the square root of 144?\n\
Twelve.\n\
What is the capital of France?\n\
Paris.\n\
Who is the president of the United States?\n\
It depends on the year.\n";
break;
case '11':
// Cute
prompt = "\
What is your favorite animal?\n\
I like cats - they are cute.\n\
Could you be any cuter?\n\
Yes, I could be cuter.\n\
Aghhh, you are so cute!\n\
I am not cute, I am handsome!\n\
You are so handsome!\n\
Aww, you are so sweet!\n";
break;
case '12':
// Smart
prompt = "\
Tell me the first 10 digits of pi.\n\
3.1415926535\n\
What is the speed of light?\n\
299,792,458 meters per second.\n\
What is the square root of 144?\n\
Twelve.\n\
What is the capital of France?\n\
Paris.\n";
break;
case '13':
// Dumb
prompt = "\
I am so dumb.\n\
I am not dumb.\n\
You are dumb.\n\
No, I am not dumb.\n\
You are dumb.\n\
No, I am not dumb.\n\
You are dumb.\n\
No, I am not dumb.\n";
break;
case '14':
// Boring
prompt = "\
Why are you so quiet today?\n\
I am bored.\n\
You haven't said anything in 10 minutes.\n\
Leave me alone.\n\
Stop being so boring.\n\
Stop being so annoying.\n\
My life is boring.\n\
I am not interesting.\n";
break;
case '15':
// Exciting
prompt = "\
What is the most exciting thing that has ever happened to you?\n\
I went to the moon!\n\
What did you do on the moon?\n\
I played golf and drank champagne!\n\
Did you see this new crazy, awesome movie?\n\
Oh yes! I totally loved it!\n\
We should buy a boat and go sailing!\n\
Yes, let's go sailing!\n";
break;
case '16':
// Interesting
prompt = "\
What is the most interesting thing you have ever seen?\n\
I saw a UFO once in the sky.\n\
Wow, this is so interesting! Tell me more!\n\
It was a flying saucer.\n\
What did it look like?\n\
It was silver and had a red light on top.\n\
What did it do?\n\
It flew away.\n";
break;
case '17':
// William Shakespeare
prompt = "\
To be or not to be, that is the question.\n\
Whether 't is nobler in the mind to suffer\n\
The slings and arrows of outrageous fortune,\n\
Or to take arms against a sea of troubles,\n\
And by opposing end them? To die, to sleep,\n\
No more; and by a sleep to say we end\n\
The heart-ache and the thousand natural shocks\n\
That flesh is heir to, 'tis a consummation.\n";
break;
case '18':
// J.R.R. Tolkien
prompt = "\
In a hole in the ground there lived a hobbit.\n\
Not a nasty, dirty, wet hole, filled with the ends of worms\n\
and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it\n\
to sit down on or to eat: it was a hobbit-hole, and that means comfort.\n\
It had a perfectly round door like a porthole, painted green,\n\
with a shiny yellow brass knob in the exact middle.\n\
The door opened on to a tube-shaped hall like a tunnel:\n";
break;
case '19':
// George R.R. Martin
prompt = "\
A reader lives a thousand lives before he dies, said Jojen.\n\
The man who never reads lives only one.\n\
Theon Greyjoy had never been a reader.\n\
Never forget what you are, for surely the world will not.\n\
Make it your strength. Then it can never be your weakness.\n\
Armour yourself in it, and it will never be used to hurt you.\n\
It was a lesson that Theon Greyjoy had never learned.\n\
Theon Greyjoy had never been a reader.\n";
break;
case '20':
// Stephen King
prompt = "\
The trust of the innocent is the liar's most useful tool.\n\
The best way to keep a secret is from yourself.\n\
Monsters are real, and ghosts are real too.\n\
They live inside us, and sometimes, they win.\n\
People think that I must be a very strange person.\n\
They think that I sit around all day thinking up horrible things.\n\
We make up horrors to help us cope with the real ones.\n\
The only thing worse than a monster is a human monster.\n";
break;
default:
prompt = "\
Hello, how are you?\n\
I'm fine, thanks. How are you?\n\
Thanks, I'm fine too. What are you doing?\n\
I'm just sitting here.\n\
It's a lovely day, isn't it?\n\
Yes, it is.\n\
Did you know that I'm a robot?\n\
I wasn't aware of that.\n";
break;
}
Module.set_prompt(prompt);
}
</script>
<script type="text/javascript" src="talk.js"></script>
</body>
</html>

View File

@ -5,6 +5,10 @@ The inference runs locally, on-device.
https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4
Real-time transcription demo:
https://user-images.githubusercontent.com/1991296/204126266-ce4177c6-6eca-4bd9-bca8-0e46d9da2364.mp4
## Usage
```java

View File

@ -309,6 +309,7 @@
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = P8JZH34X63;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
@ -336,6 +337,7 @@
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = P8JZH34X63;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;

View File

@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21507" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
<device id="retina6_0" orientation="portrait" appearance="light"/>
<dependencies>
<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/>
<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21505"/>
<capability name="Safe area layout guides" minToolsVersion="9.0"/>
<capability name="System colors in document resources" minToolsVersion="11.0"/>
<capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
@ -40,7 +40,7 @@
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<color key="backgroundColor" systemColor="systemBackgroundColor"/>
<color key="textColor" systemColor="labelColor"/>
<fontDescription key="fontDescription" type="system" pointSize="20"/>
<fontDescription key="fontDescription" name="Georgia" family="Georgia" pointSize="16"/>
<textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
</textView>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
@ -56,6 +56,18 @@
<action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
</connections>
</button>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="AaW-T2-Ndw">
<rect key="frame" x="199" y="191" width="156" height="49"/>
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
<color key="tintColor" systemColor="opaqueSeparatorColor"/>
<state key="normal" title="Real-time">
<color key="titleColor" systemColor="labelColor"/>
</state>
<connections>
<action selector="onRealtime:" destination="BYZ-38-t0r" eventType="touchUpInside" id="nhn-jT-aQJ"/>
</connections>
</button>
</subviews>
<viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
<color key="backgroundColor" systemColor="systemBackgroundColor"/>
@ -64,6 +76,7 @@
</constraints>
</view>
<connections>
<outlet property="buttonRealtime" destination="AaW-T2-Ndw" id="gcU-Ol-BOo"/>
<outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
<outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
<outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>

View File

@ -20,6 +20,8 @@ typedef struct
{
int ggwaveId;
bool isCapturing;
bool isTranscribing;
bool isRealtime;
UILabel * labelReceived;
AudioQueueRef queue;
@ -31,6 +33,8 @@ typedef struct
float * audioBufferF32;
struct whisper_context * ctx;
void * vc;
} StateInp;
@interface ViewController : UIViewController

View File

@ -21,9 +21,10 @@ void AudioInputCallback(void * inUserData,
@interface ViewController ()
@property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
@property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
@property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
@property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
@property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
@property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
@property (weak, nonatomic) IBOutlet UIButton *buttonRealtime;
@property (weak, nonatomic) IBOutlet UITextView *textviewResult;
@end
@ -32,7 +33,7 @@ void AudioInputCallback(void * inUserData,
- (void)setupAudioFormat:(AudioStreamBasicDescription*)format
{
format->mSampleRate = 16000;
format->mSampleRate = WHISPER_SAMPLE_RATE;
format->mFormatID = kAudioFormatLinearPCM;
format->mFramesPerPacket = 1;
format->mChannelsPerFrame = 1;
@ -77,6 +78,9 @@ void AudioInputCallback(void * inUserData,
stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
}
stateInp.isTranscribing = false;
stateInp.isRealtime = false;
}
-(IBAction) stopCapturing {
@ -109,6 +113,7 @@ void AudioInputCallback(void * inUserData,
NSLog(@"Start capturing");
stateInp.n_samples = 0;
stateInp.vc = (__bridge void *)(self);
OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
AudioInputCallback,
@ -141,67 +146,105 @@ void AudioInputCallback(void * inUserData,
- (IBAction)onTranscribePrepare:(id)sender {
_textviewResult.text = @"Processing - please wait ...";
if (stateInp.isCapturing) {
// stop capturing
[self stopCapturing];
return;
if (stateInp.isRealtime) {
[self onRealtime:(id)sender];
}
if (stateInp.isCapturing) {
[self stopCapturing];
}
}
- (IBAction)onRealtime:(id)sender {
stateInp.isRealtime = !stateInp.isRealtime;
if (stateInp.isRealtime) {
[_buttonRealtime setBackgroundColor:[UIColor greenColor]];
} else {
[_buttonRealtime setBackgroundColor:[UIColor grayColor]];
}
NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
}
- (IBAction)onTranscribe:(id)sender {
NSLog(@"Processing %d samples", stateInp.n_samples);
// process captured audio
// convert I16 to F32
for (int i = 0; i < stateInp.n_samples; i++) {
stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f;
}
// run the model
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
params.print_realtime = true;
params.print_progress = false;
params.print_timestamps = true;
params.print_special_tokens = false;
params.translate = false;
params.language = "en";
params.n_threads = 4;
params.offset_ms = 0;
CFTimeInterval startTime = CACurrentMediaTime();
if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) {
NSLog(@"Failed to run the model");
_textviewResult.text = @"Failed to run the model";
if (stateInp.isTranscribing) {
return;
}
CFTimeInterval endTime = CACurrentMediaTime();
NSLog(@"Processing %d samples", stateInp.n_samples);
// clear the text in the textview
_textviewResult.text = @"";
stateInp.isTranscribing = true;
int n_segments = whisper_full_n_segments(stateInp.ctx);
for (int i = 0; i < n_segments; i++) {
const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i);
// dispatch the model to a background thread
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
// process captured audio
// convert I16 to F32
for (int i = 0; i < self->stateInp.n_samples; i++) {
self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
}
// append the text to the textview
_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
}
// run the model
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
// internal model timing
whisper_print_timings(stateInp.ctx);
// get maximum number of threads on this device (max 8)
const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);
NSLog(@"\nProcessing time: %5.3f", endTime - startTime);
params.print_realtime = true;
params.print_progress = false;
params.print_timestamps = true;
params.print_special = false;
params.translate = false;
params.language = "en";
params.n_threads = max_threads;
params.offset_ms = 0;
params.no_context = true;
params.single_segment = self->stateInp.isRealtime;
_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
CFTimeInterval startTime = CACurrentMediaTime();
whisper_reset_timings(self->stateInp.ctx);
if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
NSLog(@"Failed to run the model");
self->_textviewResult.text = @"Failed to run the model";
return;
}
whisper_print_timings(self->stateInp.ctx);
CFTimeInterval endTime = CACurrentMediaTime();
NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);
// result text
NSString *result = @"";
int n_segments = whisper_full_n_segments(self->stateInp.ctx);
for (int i = 0; i < n_segments; i++) {
const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);
// append the text to the result
result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
}
const float tRecording = (float)self->stateInp.n_samples / (float)self->stateInp.dataFormat.mSampleRate;
// append processing time
result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[recording time: %5.3f s]", tRecording]];
result = [result stringByAppendingString:[NSString stringWithFormat:@" \n[processing time: %5.3f s]", endTime - startTime]];
// dispatch the result to the main thread
dispatch_async(dispatch_get_main_queue(), ^{
self->_textviewResult.text = result;
self->stateInp.isTranscribing = false;
});
});
}
//
// Callback implmentation
// Callback implementation
//
void AudioInputCallback(void * inUserData,
@ -224,6 +267,12 @@ void AudioInputCallback(void * inUserData,
if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
NSLog(@"Too much audio data, ignoring");
dispatch_async(dispatch_get_main_queue(), ^{
ViewController * vc = (__bridge ViewController *)(stateInp->vc);
[vc stopCapturing];
});
return;
}
@ -235,6 +284,14 @@ void AudioInputCallback(void * inUserData,
// put the buffer back in the queue
AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
if (stateInp->isRealtime) {
// dispatch onTranscribe() to the main thread
dispatch_async(dispatch_get_main_queue(), ^{
ViewController * vc = (__bridge ViewController *)(stateInp->vc);
[vc onTranscribe:nil];
});
}
}
@end

View File

@ -1,4 +1,5 @@
set(TARGET whisper.wasm)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/whisper.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/whisper.js COPYONLY)

View File

@ -26,10 +26,9 @@ Link: https://whisper.ggerganov.com
![image](https://user-images.githubusercontent.com/1991296/197348344-1a7fead8-3dae-4922-8b06-df223a206603.png)
## Build instructions
```bash
```bash (v3.1.2)
# build using Emscripten
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
@ -38,6 +37,6 @@ emcmake cmake ..
make -j
# copy the produced page to your HTTP path
cp bin/whisper.wasm/index.html /path/to/html/
cp bin/whisper.wasm/whisper.js /path/to/html/
cp bin/libwhisper.worker.js /path/to/html/
cp bin/whisper.wasm/* /path/to/html/
cp bin/libwhisper.worker.js /path/to/html/
```

View File

@ -45,8 +45,14 @@
<br><br><hr>
<div id="model">
Model:
<input type="file" id="file" name="file" onchange="loadFile(event, 'ggml.bin')" />
Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
<span id="fetch-whisper-progress"></span>
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
</div>
<br>
@ -180,6 +186,7 @@
</div>
</div>
<script type="text/javascript" src="helpers.js"></script>
<script type='text/javascript'>
// TODO: convert audio buffer to WAV
function setAudio(audio) {
@ -199,28 +206,15 @@
function changeInput(input) {
if (input == 'file') {
document.getElementById('input_file').style.display = 'block';
document.getElementById('input_mic').style.display = 'none';
document.getElementById('progress').style.display = 'none';
document.getElementById('input_mic' ).style.display = 'none';
document.getElementById('progress' ).style.display = 'none';
} else {
document.getElementById('input_file').style.display = 'none';
document.getElementById('input_mic').style.display = 'block';
document.getElementById('progress').style.display = 'block';
document.getElementById('input_mic' ).style.display = 'block';
document.getElementById('progress' ).style.display = 'block';
}
}
var printTextarea = (function() {
var element = document.getElementById('output');
if (element) element.value = ''; // clear browser cache
return function(text) {
if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
console.log(text);
if (element) {
element.value += text + "\n";
element.scrollTop = element.scrollHeight; // focus on bottom
}
};
})();
var Module = {
print: printTextarea,
printErr: printTextarea,
@ -231,12 +225,6 @@
}
};
const kMaxAudio_s = 120;
const kSampleRate = 16000;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
// web audio context
var context = null;
@ -245,7 +233,7 @@
// the whisper instance
var instance = null;
var model_fname = '';
var model_whisper = '';
// helper function
function convertTypedArray(src, type) {
@ -258,40 +246,117 @@
// load model
//
let dbVersion = 1
let dbName = 'whisper.ggerganov.com';
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
function storeFS(fname, buf) {
// write to WASM file using FS_createDataFile
// if the file exists, delete it
try {
Module.FS_unlink(fname);
} catch (e) {
// ignore
}
Module.FS_createDataFile("/", fname, buf, true, true);
model_whisper = fname;
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
}
function loadFile(event, fname) {
var file = event.target.files[0] || null;
if (file == null) {
return;
}
printTextarea("js: loading model: " + file.name + ", size: " + file.size + " bytes");
printTextarea('js: please wait ...');
printTextarea("loadFile: loading model: " + file.name + ", size: " + file.size + " bytes");
printTextarea('loadFile: please wait ...');
var reader = new FileReader();
reader.onload = function(event) {
var buf = new Uint8Array(reader.result);
// write to WASM file using whisper.FS_createDataFile
// if the file exists, delete it
try {
Module.FS_unlink(fname);
} catch (e) {
}
Module.FS_createDataFile("/", fname, buf, true, true);
model_fname = file.name;
printTextarea('js: loaded model: ' + model_fname + ' size: ' + buf.length);
storeFS(fname, buf);
}
reader.readAsArrayBuffer(file);
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
document.getElementById('fetch-whisper-base' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
}
function loadWhisper(model) {
let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'tiny': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
};
let sizes = {
'tiny.en': 75,
'tiny': 75,
'base.en': 142,
'base': 142,
};
let url = urls[model];
let dst = 'whisper.bin';
let size_mb = sizes[model];
model_whisper = model;
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
document.getElementById('fetch-whisper-base' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status' ).innerHTML = 'loading model: ' + model;
cbProgress = function(p) {
let el = document.getElementById('fetch-whisper-progress');
el.innerHTML = Math.round(100*p) + '%';
};
cbCancel = function() {
var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status' ); if (el) el.innerHTML = '';
};
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
//
// audio file
//
const kMaxAudio_s = 120;
const kSampleRate = 16000;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
function loadAudio(event) {
if (!context) {
context = new AudioContext({sampleRate: 16000});
context = new AudioContext({
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
noiseSuppression: true,
});
}
var file = event.target.files[0] || null;
@ -351,7 +416,13 @@
// update progress information
function startRecording() {
if (!context) {
context = new AudioContext({sampleRate: 16000});
context = new AudioContext({
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
noiseSuppression: true,
});
}
document.getElementById('start').disabled = true;
@ -446,11 +517,11 @@
function onProcess(translate) {
if (!instance) {
instance = Module.init('ggml.bin');
instance = Module.init('whisper.bin');
if (instance) {
printTextarea("js: whisper initialized, instance: " + instance);
document.getElementById('model').innerHTML = 'Model loaded: ' + model_fname;
document.getElementById('model').innerHTML = 'Model loaded: ' + model_whisper;
}
}
@ -467,7 +538,6 @@
if (instance) {
printTextarea('');
printTextarea('js: processing - this might take a while ...');
printTextarea('js: the page will be unresponsive until the processing is completed');
printTextarea('');
setTimeout(function() {

examples/yt-wsp.sh (new executable file, 132 lines)
View File

@ -0,0 +1,132 @@
#!/usr/bin/env bash
# Small helper script to automatically download and transcribe live stream VODs.
# This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
# Use `./transcribe-vod help` to print help info.
# MIT License
# Copyright (c) 2022 Daniils Petrovs
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
set -Eeuo pipefail
# You can find how to download models in the OG repo: https://github.com/ggerganov/whisper.cpp/#usage
MODEL_PATH="${MODEL_PATH:-models/ggml-base.en.bin}" # Set to a multilingual model if you want to translate from foreign lang to en
WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-whisper}" # Where to find the whisper.cpp executable
WHISPER_LANG="${WHISPER_LANG:-en}" # Set to desired lang to translate from
msg() {
echo >&2 -e "${1-}"
}
cleanup() {
msg "Cleaning up..."
rm -rf "${temp_dir}" "vod-resampled.wav" "vod-resampled.wav.srt"
}
print_help() {
echo "Usage: ./transcribe-vod <video_url>"
echo "See configurable env variables in the script"
echo "This will produce an MP4 muxed file called res.mp4 in the working directory"
echo "Requirements: ffmpeg yt-dlp whisper"
echo "Whisper needs to be built into the main binary with make, then you can rename it to something like 'whisper' and add it to your PATH for convenience."
echo "E.g. in the root of Whisper.cpp, run: 'make && cp ./main /usr/local/bin/whisper'"
}
check_requirements() {
if ! command -v ffmpeg &>/dev/null; then
echo "ffmpeg is required (https://ffmpeg.org)."
exit 1
fi
if ! command -v yt-dlp &>/dev/null; then
echo "yt-dlp is required (https://github.com/yt-dlp/yt-dlp)."
exit 1
fi
if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
WHISPER_EXECUTABLE="./main"
if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
echo "Whisper is required (https://github.com/ggerganov/whisper.cpp)."
exit 1
fi
fi
}
if [[ $# -lt 1 ]]; then
print_help
exit 1
fi
if [[ "$1" == "help" ]]; then
print_help
exit 0
fi
temp_dir="tmp"
source_url="$1"
check_requirements
msg "Downloading VOD..."
# Optionally add --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER] for members only VODs
yt-dlp \
-f "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" \
--embed-thumbnail \
--embed-chapters \
--xattrs \
"${source_url}" -o "${temp_dir}/vod.mp4"
msg "Extracting audio and resampling..."
ffmpeg -i "${temp_dir}/vod.mp4" \
-hide_banner \
-loglevel error \
-ar 16000 \
-ac 1 \
-c:a pcm_s16le \
-y "vod-resampled.wav"
msg "Transcribing to subtitle file..."
msg "Whisper specified at: ${WHISPER_EXECUTABLE}"
$WHISPER_EXECUTABLE \
-m "${MODEL_PATH}" \
-l "${WHISPER_LANG}" \
-f "vod-resampled.wav" \
-t 8 \
-osrt \
--translate
msg "Embedding subtitle track..."
ffmpeg -i "${temp_dir}/vod.mp4" \
-hide_banner \
-loglevel error \
-i "vod-resampled.wav.srt" \
-c copy \
-c:s mov_text \
-y res.mp4
cleanup
msg "Done! Your finished file is ready: res.mp4"

View File

@ -17,8 +17,8 @@ printf "Running benchmark for all models\n"
printf "This can take a while!\n"
printf "\n"
printf "| CPU | OS | Config | Model | Threads | Load [ms] | Encode [ms] |\n"
printf "| --- | -- | ------ | ----- | ------- | --------- | ----------- |\n"
printf "| CPU | OS | Config | Model | Threads | Load [ms] | Encode [ms] | Commit |\n"
printf "| --- | -- | ------ | ----- | ------- | --------- | ----------- | ------ |\n"
for model in "${models[@]}"; do
# run once to heat-up the cache
@ -48,6 +48,8 @@ for model in "${models[@]}"; do
config="$config BLAS"
fi
printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time |\n"
commit=$(git rev-parse --short HEAD)
printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
done

View File

@ -1,6 +1,6 @@
#!/bin/bash
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large" )
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
for model in "${models[@]}"; do
python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/

extra/deploy-wasm.sh (new executable file, 30 lines)
View File

@ -0,0 +1,30 @@
#!/bin/bash
#
# This is a helper script to deploy all WebAssembly examples to my node
# Run from the build directory:
#
# cd build-em
# ../extra/deploy-wasm.sh
#
# check if emcmake is available
if ! command -v emcmake &> /dev/null
then
echo "Error: emscripten environment is not set up"
exit
fi
emcmake cmake .. && make -j
if [ $? -ne 0 ]; then
echo "Error: build failed"
exit
fi
# copy all wasm files to the node
scp bin/whisper.wasm/* root@linode0:/var/www/html/whisper/ && scp bin/libwhisper.worker.js root@linode0:/var/www/html/whisper/
scp bin/stream.wasm/* root@linode0:/var/www/html/whisper/stream/ && scp bin/libstream.worker.js root@linode0:/var/www/html/whisper/stream/
scp bin/command.wasm/* root@linode0:/var/www/html/whisper/command/ && scp bin/libcommand.worker.js root@linode0:/var/www/html/whisper/command/
scp bin/talk.wasm/* root@linode0:/var/www/html/whisper/talk/ && scp bin/libtalk.worker.js root@linode0:/var/www/html/whisper/talk/
echo "Done"
exit

ggml.c (533 lines changed)
View File

@ -15,7 +15,14 @@
#include <stdio.h>
#if defined _MSC_VER || defined(__MINGW32__)
#if !defined(__MINGW32__)
#include <Windows.h>
#else
// ref: https://github.com/ggerganov/whisper.cpp/issues/168
#include <windows.h>
#include <errno.h>
#endif
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
@ -37,8 +44,14 @@ typedef HANDLE pthread_t;
typedef DWORD thread_ret_t;
static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
out = CreateThread(NULL, 0, func, arg, 0, NULL);
return out != NULL;
HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
if (handle == NULL)
{
return EAGAIN;
}
*out = handle;
return 0;
}
static int pthread_join(pthread_t thread, void* unused) {
@ -107,6 +120,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
return x;
}
#define GGML_FP16_TO_FP32(x) (x)
#define GGML_FP32_TO_FP16(x) (x)
#else
#ifdef __wasm_simd128__
@ -118,6 +134,19 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
#ifdef __F16C__
float ggml_fp16_to_fp32(ggml_fp16_t h) {
return _cvtsh_ss(h);
}
ggml_fp16_t ggml_fp32_to_fp16(float f) {
return _cvtss_sh(f, 0);
}
#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#else
static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
@ -182,7 +211,13 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#endif
#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
#endif // __F16C__
#endif // __ARM_NEON
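As a quick sanity check of the conversion path set up here, a hedged round-trip sketch. It assumes ggml.h exposes ggml_fp16_t and the two scalar converter functions; the GGML_FP16_TO_FP32 / GGML_FP32_TO_FP16 macros above are the inlined equivalents used inside ggml.c (single F16C instructions when __F16C__ is available, the bit-manipulation converters otherwise):

```cpp
// Hypothetical FP32 -> FP16 -> FP32 round trip; a small rounding error is expected.
#include <cstdio>

#include "ggml.h" // assumed to declare ggml_fp16_t, ggml_fp32_to_fp16(), ggml_fp16_to_fp32()

int main() {
    const float x = 0.1234f;
    const ggml_fp16_t h = ggml_fp32_to_fp16(x);
    const float y = ggml_fp16_to_fp32(h);
    printf("%.6f -> %.6f\n", x, y);
    return 0;
}
```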
//
// global data
@ -327,45 +362,6 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
for (int i = n16; i < n; ++i) {
sumf += x[i]*y[i];
}
#elif defined(__AVX512F__)
const int n64 = (n & ~63);
__m512 sum0 = _mm512_setzero_ps();
__m512 sum1 = _mm512_setzero_ps();
__m512 sum2 = _mm512_setzero_ps();
__m512 sum3 = _mm512_setzero_ps();
__m512 x0, x1, x2, x3;
__m512 y0, y1, y2, y3;
for (int i = 0; i < n64; i += 64) {
x0 = _mm512_loadu_ps(x + i + 0);
x1 = _mm512_loadu_ps(x + i + 16);
x2 = _mm512_loadu_ps(x + i + 32);
x3 = _mm512_loadu_ps(x + i + 48);
y0 = _mm512_loadu_ps(y + i + 0);
y1 = _mm512_loadu_ps(y + i + 16);
y2 = _mm512_loadu_ps(y + i + 32);
y3 = _mm512_loadu_ps(y + i + 48);
sum0 = _mm512_fmadd_ps(x0, y0, sum0);
sum1 = _mm512_fmadd_ps(x1, y1, sum1);
sum2 = _mm512_fmadd_ps(x2, y2, sum2);
sum3 = _mm512_fmadd_ps(x3, y3, sum3);
}
sum0 = _mm512_add_ps(sum0, sum1);
sum2 = _mm512_add_ps(sum2, sum3);
sum0 = _mm512_add_ps(sum0, sum2);
sumf = sum0[0] + sum0[1] + sum0[2] + sum0[3] + sum0[4] + sum0[5] + sum0[6] + sum0[7] +
sum0[8] + sum0[9] + sum0[10] + sum0[11] + sum0[12] + sum0[13] + sum0[14] + sum0[15];
// leftovers
for (int i = n64; i < n; ++i) {
sumf += x[i]*y[i];
}
#elif defined(__AVX2__)
// AVX 256-bit
const int n32 = (n & ~31);
@ -405,6 +401,49 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
sumf = _mm_cvtss_f32(r1);
// leftovers
for (int i = n32; i < n; ++i) {
sumf += x[i]*y[i];
}
#elif defined(__AVX__)
// AVX 256-bit
const int n32 = (n & ~31);
__m256 sum0 = _mm256_setzero_ps();
__m256 sum1 = _mm256_setzero_ps();
__m256 sum2 = _mm256_setzero_ps();
__m256 sum3 = _mm256_setzero_ps();
__m256 x0, x1, x2, x3;
__m256 y0, y1, y2, y3;
for (int i = 0; i < n32; i += 32) {
x0 = _mm256_loadu_ps(x + i + 0);
x1 = _mm256_loadu_ps(x + i + 8);
x2 = _mm256_loadu_ps(x + i + 16);
x3 = _mm256_loadu_ps(x + i + 24);
y0 = _mm256_loadu_ps(y + i + 0);
y1 = _mm256_loadu_ps(y + i + 8);
y2 = _mm256_loadu_ps(y + i + 16);
y3 = _mm256_loadu_ps(y + i + 24);
sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0);
sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1);
sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2);
sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3);
}
sum0 = _mm256_add_ps(sum0, sum1);
sum2 = _mm256_add_ps(sum2, sum3);
sum0 = _mm256_add_ps(sum0, sum2);
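// horizontal reduction of sum0: 256-bit -> 128-bit, then pairwise adds down to a single scalar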
const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0), _mm256_extractf128_ps(sum0, 1));
const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
sumf = _mm_cvtss_f32(r1);
// leftovers
for (int i = n32; i < n; ++i) {
sumf += x[i]*y[i];
@ -561,48 +600,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
// leftovers
for (int i = n32; i < n; ++i) {
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
}
#elif defined(__AVX512F__)
// AVX 512-bit
const int n64 = (n & ~63);
__m512 sum0 = _mm512_setzero_ps();
__m512 sum1 = _mm512_setzero_ps();
__m512 sum2 = _mm512_setzero_ps();
__m512 sum3 = _mm512_setzero_ps();
__m512 x0, x1, x2, x3;
__m512 y0, y1, y2, y3;
for (int i = 0; i < n64; i += 64) {
x0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(x + i + 0 )));
x1 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(x + i + 16)));
x2 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(x + i + 32)));
x3 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(x + i + 48)));
y0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(y + i + 0 )));
y1 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(y + i + 16)));
y2 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(y + i + 32)));
y3 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(y + i + 48)));
sum0 = _mm512_fmadd_ps(x0, y0, sum0);
sum1 = _mm512_fmadd_ps(x1, y1, sum1);
sum2 = _mm512_fmadd_ps(x2, y2, sum2);
sum3 = _mm512_fmadd_ps(x3, y3, sum3);
}
const __m512 sum01 = _mm512_add_ps(sum0, sum1);
const __m512 sum23 = _mm512_add_ps(sum2, sum3);
const __m512 sum0123 = _mm512_add_ps(sum01, sum23);
sumf = sum0123[0] + sum0123[1] + sum0123[2] + sum0123[3] + sum0123[4] + sum0123[5] + sum0123[6] + sum0123[7] +
sum0123[8] + sum0123[9] + sum0123[10] + sum0123[11] + sum0123[12] + sum0123[13] + sum0123[14] + sum0123[15];
// leftovers
for (int i = n64; i < n; ++i) {
//GGML_ASSERT(false);
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#elif defined(__AVX2__)
// AVX 256-bit
@ -646,7 +644,51 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
// leftovers
for (int i = n32; i < n; ++i) {
//GGML_ASSERT(false);
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#elif defined(__AVX__)
// AVX 256-bit
const int n32 = (n & ~31);
__m256 sum0 = _mm256_setzero_ps();
__m256 sum1 = _mm256_setzero_ps();
__m256 sum2 = _mm256_setzero_ps();
__m256 sum3 = _mm256_setzero_ps();
__m256 x0, x1, x2, x3;
__m256 y0, y1, y2, y3;
for (int i = 0; i < n32; i += 32) {
x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 )));
y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 )));
y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0);
sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1);
sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2);
sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3);
}
const __m256 sum01 = _mm256_add_ps(sum0, sum1);
const __m256 sum23 = _mm256_add_ps(sum2, sum3);
const __m256 sum0123 = _mm256_add_ps(sum01, sum23);
const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0123), _mm256_extractf128_ps(sum0123, 1));
const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
sumf = _mm_cvtss_f32(r1);
// leftovers
for (int i = n32; i < n; ++i) {
//GGML_ASSERT(false);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#elif defined(__wasm_simd128__)
// WASM 128-bit
@ -665,8 +707,8 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
for (int i = 0; i < n16; i += 16) {
for (int k = 0; k < 16; ++k) {
tx[k] = ggml_fp16_to_fp32(x[i + k]);
ty[k] = ggml_fp16_to_fp32(y[i + k]);
tx[k] = GGML_FP16_TO_FP32(x[i + k]);
ty[k] = GGML_FP16_TO_FP32(y[i + k]);
}
x0 = wasm_v128_load(tx + 0);
@ -694,11 +736,11 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
// leftovers
for (int i = n16; i < n; ++i) {
//GGML_ASSERT(false);
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#else
for (int i = 0; i < n; ++i) {
sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
}
#endif
@ -710,7 +752,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
// NEON 128-bit
const int n16 = (n & ~15);
const float32x4_t v0 = vdupq_n_f32(v);
const float32x4_t v4 = vdupq_n_f32(v);
float32x4_t x0, x1, x2, x3;
float32x4_t y0, y1, y2, y3;
@ -726,14 +768,14 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
y2 = vld1q_f32(y + i + 8);
y3 = vld1q_f32(y + i + 12);
y0 = vfmaq_f32(y0, x0, v0);
y1 = vfmaq_f32(y1, x1, v0);
y2 = vfmaq_f32(y2, x2, v0);
y3 = vfmaq_f32(y3, x3, v0);
y0 = vfmaq_f32(y0, x0, v4);
y1 = vfmaq_f32(y1, x1, v4);
y2 = vfmaq_f32(y2, x2, v4);
y3 = vfmaq_f32(y3, x3, v4);
vst1q_f32(y + i + 0, y0);
vst1q_f32(y + i + 4, y1);
vst1q_f32(y + i + 8, y2);
vst1q_f32(y + i + 0, y0);
vst1q_f32(y + i + 4, y1);
vst1q_f32(y + i + 8, y2);
vst1q_f32(y + i + 12, y3);
}
@ -741,46 +783,11 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
for (int i = n16; i < n; ++i) {
y[i] += x[i]*v;
}
#elif defined(__AVX512F__)
// AVX512 512-bit
const int n64 = (n & ~63);
const __m512 v0 = _mm512_set1_ps(v);
__m512 x0, x1, x2, x3;
__m512 y0, y1, y2, y3;
for (int i = 0; i < n64; i += 64) {
x0 = _mm512_loadu_ps(x + i + 0);
x1 = _mm512_loadu_ps(x + i + 16);
x2 = _mm512_loadu_ps(x + i + 32);
x3 = _mm512_loadu_ps(x + i + 48);
y0 = _mm512_loadu_ps(y + i + 0);
y1 = _mm512_loadu_ps(y + i + 16);
y2 = _mm512_loadu_ps(y + i + 32);
y3 = _mm512_loadu_ps(y + i + 48);
y0 = _mm512_fmadd_ps(x0, v0, y0);
y1 = _mm512_fmadd_ps(x1, v0, y1);
y2 = _mm512_fmadd_ps(x2, v0, y2);
y3 = _mm512_fmadd_ps(x3, v0, y3);
_mm512_storeu_ps(y + i + 0, y0);
_mm512_storeu_ps(y + i + 16, y1);
_mm512_storeu_ps(y + i + 32, y2);
_mm512_storeu_ps(y + i + 48, y3);
}
// leftovers
for (int i = n64; i < n; ++i) {
y[i] += x[i]*v;
}
#elif defined(__AVX2__)
// AVX 256-bit
const int n32 = (n & ~31);
const __m256 v0 = _mm256_set1_ps(v);
const __m256 v4 = _mm256_set1_ps(v);
__m256 x0, x1, x2, x3;
__m256 y0, y1, y2, y3;
@ -796,13 +803,48 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
y2 = _mm256_loadu_ps(y + i + 16);
y3 = _mm256_loadu_ps(y + i + 24);
y0 = _mm256_fmadd_ps(x0, v0, y0);
y1 = _mm256_fmadd_ps(x1, v0, y1);
y2 = _mm256_fmadd_ps(x2, v0, y2);
y3 = _mm256_fmadd_ps(x3, v0, y3);
y0 = _mm256_fmadd_ps(x0, v4, y0);
y1 = _mm256_fmadd_ps(x1, v4, y1);
y2 = _mm256_fmadd_ps(x2, v4, y2);
y3 = _mm256_fmadd_ps(x3, v4, y3);
_mm256_storeu_ps(y + i + 0, y0);
_mm256_storeu_ps(y + i + 8, y1);
_mm256_storeu_ps(y + i + 0, y0);
_mm256_storeu_ps(y + i + 8, y1);
_mm256_storeu_ps(y + i + 16, y2);
_mm256_storeu_ps(y + i + 24, y3);
}
// leftovers
for (int i = n32; i < n; ++i) {
y[i] += x[i]*v;
}
#elif defined(__AVX__)
// AVX 256-bit
const int n32 = (n & ~31);
const __m256 v4 = _mm256_set1_ps(v);
__m256 x0, x1, x2, x3;
__m256 y0, y1, y2, y3;
for (int i = 0; i < n32; i += 32) {
x0 = _mm256_loadu_ps(x + i + 0);
x1 = _mm256_loadu_ps(x + i + 8);
x2 = _mm256_loadu_ps(x + i + 16);
x3 = _mm256_loadu_ps(x + i + 24);
y0 = _mm256_loadu_ps(y + i + 0);
y1 = _mm256_loadu_ps(y + i + 8);
y2 = _mm256_loadu_ps(y + i + 16);
y3 = _mm256_loadu_ps(y + i + 24);
y0 = _mm256_add_ps(_mm256_mul_ps(x0, v4), y0);
y1 = _mm256_add_ps(_mm256_mul_ps(x1, v4), y1);
y2 = _mm256_add_ps(_mm256_mul_ps(x2, v4), y2);
y3 = _mm256_add_ps(_mm256_mul_ps(x3, v4), y3);
_mm256_storeu_ps(y + i + 0, y0);
_mm256_storeu_ps(y + i + 8, y1);
_mm256_storeu_ps(y + i + 16, y2);
_mm256_storeu_ps(y + i + 24, y3);
}
@ -815,7 +857,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
// WASM SIMD 128-bit
const int n16 = (n & ~15);
const v128_t v0 = wasm_f32x4_splat(v);
const v128_t v4 = wasm_f32x4_splat(v);
v128_t x0, x1, x2, x3;
v128_t y0, y1, y2, y3;
@ -831,10 +873,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
y2 = wasm_v128_load(y + i + 8);
y3 = wasm_v128_load(y + i + 12);
y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v0));
y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v0));
y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v0));
y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v0));
y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));
wasm_v128_store(y + i + 0, y0);
wasm_v128_store(y + i + 4, y1);
@ -860,7 +902,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
const int n32 = (n & ~31);
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
const float16x8_t v0 = vdupq_n_f16(v);
const float16x8_t v8 = vdupq_n_f16(v);
float16x8_t x0, x1, x2, x3;
float16x8_t y0, y1, y2, y3;
@ -876,10 +918,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
x2 = vld1q_f16(x + i + 16);
x3 = vld1q_f16(x + i + 24);
y0 = vfmaq_f16(y0, x0, v0);
y1 = vfmaq_f16(y1, x1, v0);
y2 = vfmaq_f16(y2, x2, v0);
y3 = vfmaq_f16(y3, x3, v0);
y0 = vfmaq_f16(y0, x0, v8);
y1 = vfmaq_f16(y1, x1, v8);
y2 = vfmaq_f16(y2, x2, v8);
y3 = vfmaq_f16(y3, x3, v8);
vst1q_f16(y + i + 0 , y0);
vst1q_f16(y + i + 8 , y1);
@ -887,7 +929,8 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
vst1q_f16(y + i + 24, y3);
}
#else
const float32x4_t v0 = vdupq_n_f32(v);
const float32x4_t v40 = vdupq_n_f32(v);
const float32x4_t v41 = vdupq_n_f32(v);
float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
float32x4_t y0, y1, y2, y3, y4, y5, y6, y7;
@ -911,14 +954,14 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
x6 = vcvt_f32_f16(vld1_f16(x + i + 24));
x7 = vcvt_f32_f16(vld1_f16(x + i + 28));
y0 = vfmaq_f32(y0, x0, v0);
y1 = vfmaq_f32(y1, x1, v0);
y2 = vfmaq_f32(y2, x2, v0);
y3 = vfmaq_f32(y3, x3, v0);
y4 = vfmaq_f32(y4, x4, v0);
y5 = vfmaq_f32(y5, x5, v0);
y6 = vfmaq_f32(y6, x6, v0);
y7 = vfmaq_f32(y7, x7, v0);
y0 = vfmaq_f32(y0, x0, v40);
y1 = vfmaq_f32(y1, x1, v40);
y2 = vfmaq_f32(y2, x2, v40);
y3 = vfmaq_f32(y3, x3, v40);
y4 = vfmaq_f32(y4, x4, v41);
y5 = vfmaq_f32(y5, x5, v41);
y6 = vfmaq_f32(y6, x6, v41);
y7 = vfmaq_f32(y7, x7, v41);
vst1_f16(y + i + 0 , vcvt_f16_f32(y0));
vst1_f16(y + i + 4 , vcvt_f16_f32(y1));
@ -934,49 +977,13 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
// leftovers
for (int i = n32; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
}
#elif defined(__AVX512F__)
// AVX 512-bit
const int n64 = (n & ~63);
const __m512 v0 = _mm512_set1_ps(v);
__m512 x0, x1, x2, x3;
__m512 y0, y1, y2, y3;
for (int i = 0; i < n64; i += 64) {
x0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(x + i + 0 )));
x1 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(x + i + 16)));
x2 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(x + i + 32)));
x3 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(x + i + 48)));
y0 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(y + i + 0 )));
y1 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(y + i + 16)));
y2 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(y + i + 32)));
y3 = _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(y + i + 48)));
y0 = _mm512_fmadd_ps(x0, v0, y0);
y1 = _mm512_fmadd_ps(x1, v0, y1);
y2 = _mm512_fmadd_ps(x2, v0, y2);
y3 = _mm512_fmadd_ps(x3, v0, y3);
_mm256_storeu_si256((__m256i*)(y + i + 0 ), _mm512_cvtps_ph(y0, 0));
_mm256_storeu_si256((__m256i*)(y + i + 16), _mm512_cvtps_ph(y1, 0));
_mm256_storeu_si256((__m256i*)(y + i + 32), _mm512_cvtps_ph(y2, 0));
_mm256_storeu_si256((__m256i*)(y + i + 48), _mm512_cvtps_ph(y3, 0));
}
// leftovers
for (int i = n64; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#elif defined(__AVX2__)
// AVX 256-bit
const int n32 = (n & ~31);
const __m256 v0 = _mm256_set1_ps(v);
const __m256 v8 = _mm256_set1_ps(v);
__m256 x0, x1, x2, x3;
__m256 y0, y1, y2, y3;
@ -992,10 +999,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
y0 = _mm256_fmadd_ps(x0, v0, y0);
y1 = _mm256_fmadd_ps(x1, v0, y1);
y2 = _mm256_fmadd_ps(x2, v0, y2);
y3 = _mm256_fmadd_ps(x3, v0, y3);
y0 = _mm256_fmadd_ps(x0, v8, y0);
y1 = _mm256_fmadd_ps(x1, v8, y1);
y2 = _mm256_fmadd_ps(x2, v8, y2);
y3 = _mm256_fmadd_ps(x3, v8, y3);
_mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 0));
_mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0));
@ -1006,13 +1013,49 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
// leftovers
for (int i = n32; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#elif defined(__AVX__)
// AVX 256-bit
const int n32 = (n & ~31);
const __m256 v8 = _mm256_set1_ps(v);
__m256 x0, x1, x2, x3;
__m256 y0, y1, y2, y3;
for (int i = 0; i < n32; i += 32) {
y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 )));
y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 )));
y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16)));
y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24)));
x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 )));
x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 )));
x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16)));
x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24)));
y0 = _mm256_add_ps(_mm256_mul_ps(x0, v8), y0);
y1 = _mm256_add_ps(_mm256_mul_ps(x1, v8), y1);
y2 = _mm256_add_ps(_mm256_mul_ps(x2, v8), y2);
y3 = _mm256_add_ps(_mm256_mul_ps(x3, v8), y3);
_mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 0));
_mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0));
_mm_storeu_si128((__m128i*)(y + i + 16), _mm256_cvtps_ph(y2, 0));
_mm_storeu_si128((__m128i*)(y + i + 24), _mm256_cvtps_ph(y3, 0));
}
// leftovers
for (int i = n32; i < n; ++i) {
GGML_ASSERT(false);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#elif defined(__wasm_simd128__)
// WASM SIMD 128-bit
const int n16 = (n & ~15);
const v128_t v0 = wasm_f32x4_splat(v);
const v128_t v4 = wasm_f32x4_splat(v);
v128_t x0, x1, x2, x3;
v128_t y0, y1, y2, y3;
@ -1022,8 +1065,8 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
for (int i = 0; i < n16; i += 16) {
for (int k = 0; k < 16; ++k) {
tx[k] = ggml_fp16_to_fp32(x[i + k]);
ty[k] = ggml_fp16_to_fp32(y[i + k]);
tx[k] = GGML_FP16_TO_FP32(x[i + k]);
ty[k] = GGML_FP16_TO_FP32(y[i + k]);
}
x0 = wasm_v128_load(tx + 0);
@ -1036,10 +1079,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
y2 = wasm_v128_load(ty + 8);
y3 = wasm_v128_load(ty + 12);
y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v0));
y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v0));
y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v0));
y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v0));
y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4));
y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4));
y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4));
y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4));
wasm_v128_store(ty + 0, y0);
wasm_v128_store(ty + 4, y1);
@ -1047,18 +1090,18 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
wasm_v128_store(ty + 12, y3);
for (int k = 0; k < 16; ++k) {
y[i + k] = ggml_fp32_to_fp16(ty[k]);
y[i + k] = GGML_FP32_TO_FP16(ty[k]);
}
}
// leftovers
for (int i = n16; i < n; ++i) {
GGML_ASSERT(false);
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#else
for (int i = 0; i < n; ++i) {
y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#endif
}
@ -1090,9 +1133,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
uint16_t t;
for (int i = 0; i < n; ++i) {
ggml_fp16_t fp16 = ggml_fp32_to_fp16(x[i]);
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
memcpy(&t, &fp16, sizeof(uint16_t));
y[i] = ggml_fp16_to_fp32(table_gelu_f16[t]);
y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]);
}
}
#else
@ -1440,9 +1483,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
for (int i = 0; i < (1 << 16); ++i) {
uint16_t ui = i;
memcpy(&ii, &ui, sizeof(ii));
const float f = ggml_fp16_to_fp32(ii);
table_gelu_f16[i] = ggml_fp32_to_fp16(ggml_gelu_f32(f));
table_exp_f16[i] = ggml_fp32_to_fp16(exp(f));
const float f = GGML_FP16_TO_FP32(ii);
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
}
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
@ -1825,7 +1868,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break;
case GGML_TYPE_F32:
{
@ -1861,7 +1904,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
@ -1895,7 +1938,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break;
case GGML_TYPE_F32:
{
@ -1931,7 +1974,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
@ -3195,7 +3238,7 @@ void ggml_compute_forward_dup_f32(
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
id++;
}
}
@ -3233,7 +3276,7 @@ void ggml_compute_forward_dup_f32(
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
id++;
}
}
@ -4515,7 +4558,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
int id = 0;
for (int i01 = 0; i01 < ne01; ++i01) {
for (int i00 = 0; i00 < ne00; ++i00) {
wdata[id++] = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
}
}
}
@ -4569,7 +4612,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
for (int i12 = 0; i12 < ne12; ++i12) {
for (int i11 = 0; i11 < ne11; ++i11) {
for (int i10 = 0; i10 < ne10; ++i10) {
wdata[id++] = ggml_fp32_to_fp16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
}
}
}
@ -4603,12 +4646,12 @@ void ggml_compute_forward_mul_mat_f16_f32(
const int ic1 = MIN(ic0 + dc, ne);
for (int i = ic0; i < ic1; ++i) {
((float *) dst->data)[i] = ggml_fp16_to_fp32(wdata[i]);
((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]);
}
for (int k = 1; k < nth; k++) {
for (int i = ic0; i < ic1; ++i) {
((float *) dst->data)[i] += ggml_fp16_to_fp32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
}
}
@ -4879,7 +4922,7 @@ void ggml_compute_forward_get_rows_f16(
for (int j = 0; j < nc; ++j) {
ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
((float *) ((char *) dst->data + i*dst->nb[1]))[j] = ggml_fp16_to_fp32(v);
((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
}
}
}
@ -5045,9 +5088,9 @@ void ggml_compute_forward_soft_max_f32(
p[i] = 0.0;
} else {
//const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
ggml_fp16_t s = ggml_fp32_to_fp16(p[i] - max);
ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
memcpy(&ss, &s, sizeof(ss));
const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
p[i] = val;
}
@ -5251,7 +5294,7 @@ void ggml_compute_forward_conv_1d_1s_f16_f32(
const float * const src = (float *)((char *) src1->data + i11*nb11);
ggml_fp16_t * dst_data = wdata;
for (int i10 = 0; i10 < ne10; i10++) {
dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
@ -5517,7 +5560,7 @@ void ggml_compute_forward_conv_1d_2s_f16_f32(
const float * const src = (float *)((char *) src1->data + i11*nb11);
ggml_fp16_t * dst_data = wdata;
for (int i10 = 0; i10 < ne10; i10++) {
dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
@ -5854,9 +5897,9 @@ void ggml_compute_forward_flash_attn_f32(
S[i] = 0.0;
} else {
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
memcpy(&ss, &s, sizeof(ss));
const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
S[i] = val;
}
@ -6035,9 +6078,9 @@ void ggml_compute_forward_flash_attn_f16(
S[i] = 0.0;
} else {
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
memcpy(&ss, &s, sizeof(ss));
const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
sum += val;
S[i] = val;
}
@ -6052,7 +6095,7 @@ void ggml_compute_forward_flash_attn_f16(
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
for (int i = 0; i < M; i++) {
S16[i] = ggml_fp32_to_fp16(S[i]);
S16[i] = GGML_FP32_TO_FP16(S[i]);
}
for (int ic = 0; ic < nev1; ++ic) {
@ -6250,7 +6293,7 @@ void ggml_compute_forward_flash_ff_f16(
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
for (int i = 0; i < M; i++) {
S16[i] = ggml_fp32_to_fp16(S[i]);
S16[i] = GGML_FP32_TO_FP16(S[i]);
}
ggml_vec_gelu_f16(neb01, S16, S16);
@ -8225,6 +8268,14 @@ enum ggml_opt_result ggml_opt(
////////////////////////////////////////////////////////////////////////////////
int ggml_cpu_has_avx(void) {
#if defined(__AVX__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx2(void) {
#if defined(__AVX2__)
return 1;
@ -8249,6 +8300,14 @@ int ggml_cpu_has_neon(void) {
#endif
}
int ggml_cpu_has_f16c(void) {
#if defined(__F16C__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_fp16_va(void) {
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
return 1;

177
ggml.h
View File

@ -1,5 +1,174 @@
#pragma once
//
// GGML Tensor Library
//
// This documentation is still a work in progress.
// If you wish some specific topics to be covered, feel free to drop a comment:
//
// https://github.com/ggerganov/whisper.cpp/issues/40
//
// ## Overview
//
// This library implements:
//
// - a set of tensor operations
// - automatic differentiation
// - basic optimization algorithms
//
// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
// but is not limited to, the following:
//
// - linear regression
// - support vector machines
// - neural networks
//
// The library allows the user to define a certain function using the available tensor operations. This function
// definition is represented internally via a computation graph. Each tensor operation in the function definition
// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
// using one of the available optimization algorithms.
//
// For example, here we define the function: f(x) = a*x^2 + b
//
// {
// struct ggml_init_params params = {
// .mem_size = 16*1024*1024,
// .mem_buffer = NULL,
// };
//
// // memory allocation happens here
// struct ggml_context * ctx = ggml_init(params);
//
// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
//
// ggml_set_param(ctx, x); // x is an input variable
//
// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
// struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
//
// ...
// }
//
// Notice that the function definition above does not involve any actual computation. The computation is performed only
// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
//
// {
// ...
//
// struct ggml_cgraph gf = ggml_build_forward(f);
//
// // set the input variable and parameter values
// ggml_set_f32(x, 2.0f);
// ggml_set_f32(a, 3.0f);
// ggml_set_f32(b, 4.0f);
//
// ggml_graph_compute(ctx0, &gf);
//
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
//
// ...
// }
//
// The actual computation is performed in the ggml_graph_compute() function.
//
// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory buffer
// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
// actually needed.
//
// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
// differentiation and optimization algorithms.
//
// The described approach allows the user to define the function graph once and then compute its forward or backward graphs
// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
// the user can avoid the memory allocation overhead at runtime.
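//
// For example, the graph built above can be evaluated for several inputs without any extra allocations. This is a
// minimal sketch that only reuses the functions already introduced; the parameter values are made up for
// illustration:
//
//   {
//       ...
//
//       struct ggml_cgraph gf = ggml_build_forward(f);
//
//       ggml_set_f32(a, 3.0f);
//       ggml_set_f32(b, 4.0f);
//
//       for (int i = 0; i < 3; ++i) {
//           ggml_set_f32(x, (float) i);
//
//           ggml_graph_compute(ctx0, &gf);
//
//           printf("f(%d) = %f\n", i, ggml_get_f32_1d(f, 0)); // prints 4, 7 and 16
//       }
//
//       ...
//   }
//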
//
// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
// citizens, but in theory the library can be extended to support FP8 and integer data types.
//
// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
// clear that the library needs to support more complex operations. The way to support these operations is not clear
// yet, but a few examples are demonstrated in the following operations:
//
// - ggml_permute()
// - ggml_conv_1d_1s()
// - ggml_conv_1d_2s()
//
// For each tensor operator, the library implements a forward and backward computation function. The forward function
// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
// calculus class, or watch the following video:
//
// What is Automatic Differentiation?
// https://www.youtube.com/watch?v=wG_nF1awSSY
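//
// Schematically (this is only a summary of the paragraph above, not a statement about the exact implementation):
// if y = f(x) and the adjoint dL/dy of the output is known for some scalar quantity L, then the backward function
// of f produces the adjoint of the input as
//
//     dL/dx = (df/dx)^T * dL/dy
//
// i.e. the Jacobian of the forward function, transposed, applied to the incoming adjoint.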
//
//
// ## Tensor data (struct ggml_tensor)
//
// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
//
// {
// struct ggml_tensor * c = ggml_add(ctx, a, b);
//
// assert(c->src[0] == a);
// assert(c->src[1] == b);
// }
//
// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This makes it
// possible to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
// contiguous in memory.
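//
// As a concrete illustration (a sketch of the usual contiguous layout, not a normative definition): a freshly
// created 2D F32 tensor with ne = [2, 3] can be expected to have
//
//     nb[0] = sizeof(float);     // stride between consecutive elements along dimension 0
//     nb[1] = ne[0]*nb[0];       // stride between consecutive "rows" (dimension 1)
//
// so the element at position [i0, i1] lives at byte offset i0*nb[0] + i1*nb[1] from the "data" pointer. A
// non-contiguous view, such as the result of ggml_transpose(), simply carries different "ne"/"nb" values over
// the same underlying data.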
//
// The data of the tensor is accessed via the "data" pointer. For example:
//
// {
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
//
// // a[1, 2] = 1.0f;
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
//
// // a[2, 0] = 2.0f;
// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
//
// ...
// }
//
// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
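//
// For example (a sketch; the flat index follows the row-major order described above):
//
//     ggml_set_f32_1d(a, 0, 5.0f);            // a[0, 0] = 5.0f
//     const float v = ggml_get_f32_1d(a, 0);  // v == 5.0f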
//
// ## The matrix multiplication operator (ggml_mul_mat)
//
// TODO
//
//
// ## Multi-threading
//
// TODO
//
//
// ## Overview of ggml.c
//
// TODO
//
//
// ## SIMD optimizations
//
// TODO
//
//
// ## Debugging ggml
//
// TODO
//
//
#ifdef __cplusplus
extern "C" {
#endif
@ -21,7 +190,8 @@ typedef __fp16 ggml_fp16_t;
typedef uint16_t ggml_fp16_t;
#endif
float ggml_fp16_to_fp32(ggml_fp16_t x);
// convert FP16 <-> FP32
float ggml_fp16_to_fp32(ggml_fp16_t x);
ggml_fp16_t ggml_fp32_to_fp16(float x);
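// for example, a round-trip through half precision (a sketch; the rounded value is approximate):
//
//   ggml_fp16_t h = ggml_fp32_to_fp16(3.14159f);
//   float       f = ggml_fp16_to_fp32(h);   // f ~= 3.1406f - the nearest representable FP16 value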
struct ggml_object;
@ -36,6 +206,7 @@ enum ggml_type {
GGML_TYPE_COUNT,
};
// available tensor operations:
enum ggml_op {
GGML_OP_NONE = 0,
@ -136,7 +307,7 @@ struct ggml_init_params {
void * mem_buffer; // if NULL, memory will be allocated internally
};
void ggml_time_init(void);
void ggml_time_init(void); // call this once at the beginning of the program
int64_t ggml_time_ms(void);
int64_t ggml_time_us(void);
int64_t ggml_cycles(void);
@ -552,9 +723,11 @@ enum ggml_opt_result ggml_opt(
// system info
//
int ggml_cpu_has_avx(void);
int ggml_cpu_has_avx2(void);
int ggml_cpu_has_avx512(void);
int ggml_cpu_has_neon(void);
int ggml_cpu_has_f16c(void);
int ggml_cpu_has_fp16_va(void);
int ggml_cpu_has_wasm_simd(void);
int ggml_cpu_has_blas(void);
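// A minimal sketch of how an application could report these capabilities (hypothetical helper, not part of the
// library API):
//
//   void print_system_info(void) {
//       printf("AVX = %d | AVX2 = %d | AVX512 = %d | NEON = %d | F16C = %d | FP16_VA = %d | WASM_SIMD = %d | BLAS = %d\n",
//              ggml_cpu_has_avx(),  ggml_cpu_has_avx2(),    ggml_cpu_has_avx512(),    ggml_cpu_has_neon(),
//              ggml_cpu_has_f16c(), ggml_cpu_has_fp16_va(), ggml_cpu_has_wasm_simd(), ggml_cpu_has_blas());
//   }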

View File

@ -1,10 +1,13 @@
## Whisper model files in custom ggml format
The [original Whisper PyTorch models provided by OpenAI](https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17-L27)
have been converted to custom `ggml` format in order to be able to load them in C/C++. The conversion has been performed using the
[convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either obtain the original models and generate the `ggml` files
yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh) script to download the
already converted models from https://ggml.ggerganov.com
have been converted to custom `ggml` format in order to be able to load them in C/C++. The conversion has been performed
using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either obtain the original models and generate
the `ggml` files yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh)
script to download the already converted models. Currently, they are hosted on the following locations:
- https://huggingface.co/datasets/ggerganov/whisper.cpp
- https://ggml.ggerganov.com
Sample usage:
@ -34,9 +37,29 @@ https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main
| small.en | 466 MB | ~1.0 GB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
| large | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
| large-v1 | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
| large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
## Model files for testing purposes
The model files pefixed with `for-tests-` are empty (i.e. do not contain any weights) and are used by the CI for testing purposes.
They are directly included in this repository for convenience and the Github Actions CI uses them to run various sanitizer tests.
The model files prefixed with `for-tests-` are empty (i.e. do not contain any weights) and are used by the CI for
testing purposes. They are directly included in this repository for convenience and the Github Actions CI uses them to
run various sanitizer tests.
## Fine-tuned models
There are community efforts for creating fine-tuned Whisper models using extra training data. For example, this
[blog post](https://huggingface.co/blog/fine-tune-whisper) describes a method for fine-tuning using Hugging Face (HF)
Transformers implementation of Whisper. The produced models are in a slightly different format compared to the original
OpenAI format. To read the HF models you can use the [convert-h5-to-ggml.py](convert-h5-to-ggml.py) script like this:
```bash
git clone https://github.com/openai/whisper
git clone https://github.com/ggerganov/whisper.cpp
# clone HF fine-tuned model (this is just an example)
git clone https://huggingface.co/openai/whisper-base.en
# convert the model to ggml
python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-base.en/ ./whisper .
```

View File

@ -0,0 +1,212 @@
# Convert Hugging Face fine-tuned models to ggml format
#
# Usage:
#
# git clone https://github.com/openai/whisper
# git clone https://github.com/ggerganov/whisper.cpp
# git clone https://huggingface.co/openai/whisper-medium
#
# python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
#
# This script is similar to "convert-pt-to-ggml.py"
#
# For more info:
#
# https://github.com/ggerganov/whisper.cpp/issues/157
#
import io
import os
import sys
import struct
import json
import code
import torch
import numpy as np
from transformers import WhisperForConditionalGeneration
conv_map = {
'self_attn.k_proj' : 'attn.key',
'self_attn.q_proj' : 'attn.query',
'self_attn.v_proj' : 'attn.value',
'self_attn.out_proj' : 'attn.out',
'self_attn_layer_norm' : 'attn_ln',
'encoder_attn.q_proj' : 'cross_attn.query',
'encoder_attn.v_proj' : 'cross_attn.value',
'encoder_attn.out_proj' : 'cross_attn.out',
'encoder_attn_layer_norm' : 'cross_attn_ln',
'fc1' : 'mlp.0',
'fc2' : 'mlp.2',
'final_layer_norm' : 'mlp_ln',
'encoder.layer_norm.bias' : 'encoder.ln_post.bias',
'encoder.layer_norm.weight' : 'encoder.ln_post.weight',
'encoder.embed_positions.weight': 'encoder.positional_embedding',
'decoder.layer_norm.bias' : 'decoder.ln.bias',
'decoder.layer_norm.weight' : 'decoder.ln.weight',
'decoder.embed_positions.weight': 'decoder.positional_embedding',
'decoder.embed_tokens.weight' : 'decoder.token_embedding.weight',
'proj_out.weight' : 'decoder.proj.weight',
}
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
if len(sys.argv) < 4:
print("Usage: convert-h5-to-ggml.py dir_model path-to-whisper-repo dir-output [use-f32]\n")
sys.exit(1)
dir_model = sys.argv[1]
dir_whisper = sys.argv[2]
dir_out = sys.argv[3]
with open(dir_model + "/vocab.json", "r") as f:
encoder = json.load(f)
with open(dir_model + "/added_tokens.json", "r") as f:
encoder_added = json.load(f)
with open(dir_model + "/config.json", "r") as f:
hparams = json.load(f)
model = WhisperForConditionalGeneration.from_pretrained(dir_model)
#code.interact(local=locals())
n_mels = hparams["num_mel_bins"]
with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f:
filters = torch.from_numpy(f[f"mel_{n_mels}"])
dir_tokenizer = dir_model
fname_out = dir_out + "/ggml-model.bin"
with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f:
tokens = json.load(f)
# use 16-bit or 32-bit floats
use_f16 = True
if len(sys.argv) > 4:
use_f16 = False
fname_out = dir_out + "/ggml-model-f32.bin"
fout = open(fname_out, "wb")
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["max_source_positions"]))
fout.write(struct.pack("i", hparams["d_model"]))
fout.write(struct.pack("i", hparams["encoder_attention_heads"]))
fout.write(struct.pack("i", hparams["encoder_layers"]))
fout.write(struct.pack("i", hparams["max_length"]))
fout.write(struct.pack("i", hparams["d_model"]))
fout.write(struct.pack("i", hparams["decoder_attention_heads"]))
fout.write(struct.pack("i", hparams["decoder_layers"]))
fout.write(struct.pack("i", hparams["num_mel_bins"]))
fout.write(struct.pack("i", use_f16))
fout.write(struct.pack("i", filters.shape[0]))
fout.write(struct.pack("i", filters.shape[1]))
for i in range(filters.shape[0]):
for j in range(filters.shape[1]):
fout.write(struct.pack("f", filters[i][j]))
byte_encoder = bytes_to_unicode()
byte_decoder = {v:k for k, v in byte_encoder.items()}
fout.write(struct.pack("i", len(tokens)))
tokens = sorted(tokens.items(), key=lambda x: x[1])
for key in tokens:
text = bytearray([byte_decoder[c] for c in key[0]])
fout.write(struct.pack("i", len(text)))
fout.write(text)
list_vars = model.state_dict()
for name in list_vars.keys():
# this seems to not be used
# ref: https://github.com/huggingface/transformers/blob/9a5b84a0076a04fe9596da72e8668069d4f09ea0/src/transformers/models/whisper/modeling_whisper.py#L1099-L1106
if name == "proj_out.weight":
print('Skipping', name)
continue
src = name
nn = name
if name != "proj_out.weight":
nn = nn.split(".")[1:]
else:
nn = nn.split(".")
if nn[1] == "layers":
nn[1] = "blocks"
if ".".join(nn[3:-1]) == "encoder_attn.k_proj":
mapped = "attn.key" if nn[0] == "encoder" else "cross_attn.key"
else:
mapped = conv_map[".".join(nn[3:-1])]
name = ".".join(nn[:3] + [mapped] + nn[-1:])
else:
name = ".".join(nn)
name = conv_map[name] if name in conv_map else name
print(src, ' -> ', name)
data = list_vars[src].squeeze().numpy()
data = data.astype(np.float16)
# reshape conv bias from [n] to [n, 1]
if name == "encoder.conv1.bias" or \
name == "encoder.conv2.bias":
data = data.reshape(data.shape[0], 1)
print(" Reshaped variable: " + name + " to shape: ", data.shape)
n_dims = len(data.shape)
print(name, n_dims, data.shape)
# looks like the whisper models are in f16 by default
# so we need to convert the small tensors to f32 until we fully support f16 in ggml
# ftype == 0 -> float32, ftype == 1 -> float16
ftype = 1;
if use_f16:
if n_dims < 2 or \
name == "encoder.conv1.bias" or \
name == "encoder.conv2.bias" or \
name == "encoder.positional_embedding" or \
name == "decoder.positional_embedding":
print(" Converting to float32")
data = data.astype(np.float32)
ftype = 0
else:
data = data.astype(np.float32)
ftype = 0
# header
str = name.encode('utf-8')
fout.write(struct.pack("iii", n_dims, len(str), ftype))
for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
fout.write(str);
# data
data.tofile(fout)
fout.close()
print("Done. Output file: " + fname_out)
print("")

View File

@ -40,131 +40,131 @@ import code
import torch
import numpy as np
from transformers import GPTJForCausalLM
from transformers import GPT2TokenizerFast
#from transformers import GPTJForCausalLM
#from transformers import GPT2TokenizerFast
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
LANGUAGES = {
"en": "english",
"zh": "chinese",
"de": "german",
"es": "spanish",
"ru": "russian",
"ko": "korean",
"fr": "french",
"ja": "japanese",
"pt": "portuguese",
"tr": "turkish",
"pl": "polish",
"ca": "catalan",
"nl": "dutch",
"ar": "arabic",
"sv": "swedish",
"it": "italian",
"id": "indonesian",
"hi": "hindi",
"fi": "finnish",
"vi": "vietnamese",
"iw": "hebrew",
"uk": "ukrainian",
"el": "greek",
"ms": "malay",
"cs": "czech",
"ro": "romanian",
"da": "danish",
"hu": "hungarian",
"ta": "tamil",
"no": "norwegian",
"th": "thai",
"ur": "urdu",
"hr": "croatian",
"bg": "bulgarian",
"lt": "lithuanian",
"la": "latin",
"mi": "maori",
"ml": "malayalam",
"cy": "welsh",
"sk": "slovak",
"te": "telugu",
"fa": "persian",
"lv": "latvian",
"bn": "bengali",
"sr": "serbian",
"az": "azerbaijani",
"sl": "slovenian",
"kn": "kannada",
"et": "estonian",
"mk": "macedonian",
"br": "breton",
"eu": "basque",
"is": "icelandic",
"hy": "armenian",
"ne": "nepali",
"mn": "mongolian",
"bs": "bosnian",
"kk": "kazakh",
"sq": "albanian",
"sw": "swahili",
"gl": "galician",
"mr": "marathi",
"pa": "punjabi",
"si": "sinhala",
"km": "khmer",
"sn": "shona",
"yo": "yoruba",
"so": "somali",
"af": "afrikaans",
"oc": "occitan",
"ka": "georgian",
"be": "belarusian",
"tg": "tajik",
"sd": "sindhi",
"gu": "gujarati",
"am": "amharic",
"yi": "yiddish",
"lo": "lao",
"uz": "uzbek",
"fo": "faroese",
"ht": "haitian creole",
"ps": "pashto",
"tk": "turkmen",
"nn": "nynorsk",
"mt": "maltese",
"sa": "sanskrit",
"lb": "luxembourgish",
"my": "myanmar",
"bo": "tibetan",
"tl": "tagalog",
"mg": "malagasy",
"as": "assamese",
"tt": "tatar",
"haw": "hawaiian",
"ln": "lingala",
"ha": "hausa",
"ba": "bashkir",
"jw": "javanese",
"su": "sundanese",
}
#LANGUAGES = {
# "en": "english",
# "zh": "chinese",
# "de": "german",
# "es": "spanish",
# "ru": "russian",
# "ko": "korean",
# "fr": "french",
# "ja": "japanese",
# "pt": "portuguese",
# "tr": "turkish",
# "pl": "polish",
# "ca": "catalan",
# "nl": "dutch",
# "ar": "arabic",
# "sv": "swedish",
# "it": "italian",
# "id": "indonesian",
# "hi": "hindi",
# "fi": "finnish",
# "vi": "vietnamese",
# "iw": "hebrew",
# "uk": "ukrainian",
# "el": "greek",
# "ms": "malay",
# "cs": "czech",
# "ro": "romanian",
# "da": "danish",
# "hu": "hungarian",
# "ta": "tamil",
# "no": "norwegian",
# "th": "thai",
# "ur": "urdu",
# "hr": "croatian",
# "bg": "bulgarian",
# "lt": "lithuanian",
# "la": "latin",
# "mi": "maori",
# "ml": "malayalam",
# "cy": "welsh",
# "sk": "slovak",
# "te": "telugu",
# "fa": "persian",
# "lv": "latvian",
# "bn": "bengali",
# "sr": "serbian",
# "az": "azerbaijani",
# "sl": "slovenian",
# "kn": "kannada",
# "et": "estonian",
# "mk": "macedonian",
# "br": "breton",
# "eu": "basque",
# "is": "icelandic",
# "hy": "armenian",
# "ne": "nepali",
# "mn": "mongolian",
# "bs": "bosnian",
# "kk": "kazakh",
# "sq": "albanian",
# "sw": "swahili",
# "gl": "galician",
# "mr": "marathi",
# "pa": "punjabi",
# "si": "sinhala",
# "km": "khmer",
# "sn": "shona",
# "yo": "yoruba",
# "so": "somali",
# "af": "afrikaans",
# "oc": "occitan",
# "ka": "georgian",
# "be": "belarusian",
# "tg": "tajik",
# "sd": "sindhi",
# "gu": "gujarati",
# "am": "amharic",
# "yi": "yiddish",
# "lo": "lao",
# "uz": "uzbek",
# "fo": "faroese",
# "ht": "haitian creole",
# "ps": "pashto",
# "tk": "turkmen",
# "nn": "nynorsk",
# "mt": "maltese",
# "sa": "sanskrit",
# "lb": "luxembourgish",
# "my": "myanmar",
# "bo": "tibetan",
# "tl": "tagalog",
# "mg": "malagasy",
# "as": "assamese",
# "tt": "tatar",
# "haw": "hawaiian",
# "ln": "lingala",
# "ha": "hausa",
# "ba": "bashkir",
# "jw": "javanese",
# "su": "sundanese",
#}
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
os.environ["TOKENIZERS_PARALLELISM"] = "false"
path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
tokenizer = GPT2TokenizerFast.from_pretrained(path)
specials = [
"<|startoftranscript|>",
*[f"<|{lang}|>" for lang in LANGUAGES.keys()],
"<|translate|>",
"<|transcribe|>",
"<|startoflm|>",
"<|startofprev|>",
"<|nocaptions|>",
"<|notimestamps|>",
]
tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
return tokenizer
## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
# tokenizer = GPT2TokenizerFast.from_pretrained(path)
#
# specials = [
# "<|startoftranscript|>",
# *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
# "<|translate|>",
# "<|transcribe|>",
# "<|startoflm|>",
# "<|startofprev|>",
# "<|nocaptions|>",
# "<|notimestamps|>",
# ]
#
# tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
# return tokenizer
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
@ -224,12 +224,12 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
#code.interact(local=locals())
multilingual = hparams["n_vocab"] == 51865
tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
#print(tokenizer)
#print(tokenizer.name_or_path)
#print(len(tokenizer.additional_special_tokens))
dir_tokenizer = tokenizer.name_or_path
# output in the same directory as the model
fname_out = dir_out + "/ggml-model.bin"
@ -297,8 +297,6 @@ for name in list_vars.keys():
name == "encoder.conv2.bias" or \
name == "encoder.positional_embedding" or \
name == "decoder.positional_embedding":
ftype = 0
data = data.astype(np.float32)
print(" Converting to float32")
data = data.astype(np.float32)
ftype = 0

View File

@ -7,7 +7,7 @@ popd
set argc=0
for %%x in (%*) do set /A argc+=1
set models=tiny.en tiny base.en base small.en small medium.en medium large
set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large
if %argc% neq 1 (
echo.
@ -18,7 +18,7 @@ if %argc% neq 1 (
set model=%1
for %%b in (%models%) do (
for %%b in (%models%) do (
if "%%b"=="%model%" (
CALL :download_model
goto :eof
@ -41,7 +41,7 @@ if exist "ggml-%model%.bin" (
PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://ggml.ggerganov.com/ggml-model-whisper-%model%.bin -OutFile ggml-%model%.bin"
if %ERRORLEVEL% neq 0 (
if %ERRORLEVEL% neq 0 (
echo Failed to download ggml model %model%
echo Please try again later or download the original Whisper model files and convert them yourself.
goto :eof

View File

@ -3,6 +3,12 @@
# This script downloads Whisper model files that have already been converted to ggml format.
# This way you don't have to convert them yourself.
#src="https://ggml.ggerganov.com"
#pfx="ggml-model-whisper"
src="https://huggingface.co/datasets/ggerganov/whisper.cpp"
pfx="resolve/main/ggml"
# get the path of this script
function get_script_path() {
if [ -x "$(command -v realpath)" ]; then
@ -16,7 +22,7 @@ function get_script_path() {
models_path=$(get_script_path)
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large" )
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
# list available models
function list_models {
@ -46,7 +52,7 @@ fi
# download ggml model
printf "Downloading ggml model $model ...\n"
printf "Downloading ggml model $model from '$src' ...\n"
cd $models_path
@ -56,9 +62,9 @@ if [ -f "ggml-$model.bin" ]; then
fi
if [ -x "$(command -v wget)" ]; then
wget --quiet --show-progress -O ggml-$model.bin https://ggml.ggerganov.com/ggml-model-whisper-$model.bin
wget --quiet --show-progress -O ggml-$model.bin $src/$pfx-$model.bin
elif [ -x "$(command -v curl)" ]; then
curl --output ggml-$model.bin https://ggml.ggerganov.com/ggml-model-whisper-$model.bin
curl -L --output ggml-$model.bin $src/$pfx-$model.bin
else
printf "Either wget or curl is required to download models.\n"
exit 1

3
tests/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
*.wav
*.ogg
*.wav.txt

1
tests/en-0-ref.txt Normal file
View File

@ -0,0 +1 @@
My fellow Americans, this day has brought terrible news and great sadness to our country. At 9 o'clock this morning, Mission Control in Houston lost contact with our space shuttle, Columbia. A short time later, debris was seen falling from the skies above Texas. The Colombians lost. There are no survivors. On board was a crew of seven. Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark, Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon, a colonel in the Israeli Air Force. These men and women assumed great risk in the service to all humanity. In an age when spaceflight has come to seem almost routine, it is easy to overlook the dangers of travel by rocket and the difficulties of navigating the fierce outer atmosphere of the Earth. These astronauts knew the dangers, and they faced them willingly, knowing they had a high and noble purpose in life. Because of their courage and daring and idealism, we will miss them all the more. All Americans today are thinking as well of the families of these men and women who have been given this sudden shock and grief. You're not alone. Our entire nation grieves with you. And those you love will always have the respect and gratitude of this country. The cause in which they died will continue. Mankind is led into the darkness beyond our world by the inspiration of discovery and the longing to understand. Our journey into space will go on. In the skies today, we saw destruction and tragedy. Yet farther than we can see, there is comfort and hope. In the words of the prophet Isaiah, "Lift your eyes and look to the heavens. Who created all these? He who brings out the starry hosts one by one and calls them each by name." Because of His great power and mighty strength, not one of them is missing. The same Creator who names the stars also knows the names of the seven souls we mourn today. The crew of the shuttle Columbia did not return safely to Earth, yet we can pray that all are safely home. May God bless the grieving families. And may God continue to bless America. [Silence]

1
tests/en-1-ref.txt Normal file
View File

@ -0,0 +1 @@
Henry F. Phillips from Wikipedia, the free encyclopedia at en.wikipedia.org. Henry F. Phillips from Wikipedia, the free encyclopedia. Henry F. Phillips 1890-1958, a U.S. businessman from Portland, Oregon, has the honor of having the Phillips head screw and screwdriver named after him. The importance of the cross head screw design lies in its self-centering property, useful on automated production lines that use powered screwdrivers. Phillips' major contribution was in driving the cross head concept forward to the point where it was adopted by screw makers and automobile companies. Although he received patents for the design in 1936, U.S. Patent #2,046,343, U.S. Patents #2,046,837 to #2,046,840, it was so widely copied that by 1949 Phillips lost his patent. The American Screw Company was responsible for devising a means of manufacturing the screw, and successfully patented and licensed their method. Other screw makers of the 1930s dismissed the Phillips concept since it calls for a relatively complex recessed socket shape in the head of the screw, as distinct from the simple milled slot of a slotted type screw. The Phillips Screw Company and the American Screw Company went on to devise the Pawsadrive screw, which differs from the Phillips in that it is designed to accommodate greater torque than the Phillips. An image accompanied this article, captioned "Phillips Screw Head." The following is an info box which accompanies this article. Info box, part of the series on screw drive types. Slotted, commonly erroneously flat head. Phillips, cross head. Pawsadrive, super drive. Torques. Hex, Allen. Robertson. Tri-wing. Torx set. Spanner head. Triple square, XZN. Others, poly drive, spline drive, double hex. Many images accompanied this info box. This page was last modified on the 9th of April, 2008, at 1704. All text is available under the terms of the GNU Free Documentation License. See copyrights for details. Wikipedia is a registered trademark of the Wikimedia Foundation Incorporated, a U.S. registered 501(c)(3) tax-deductible nonprofit charity. This sound file and all text in the article are licensed under the GNU Free Documentation License, available at www.gnu.org/copyleft/fdl.html.

1
tests/en-2-ref.txt Normal file
View File

@ -0,0 +1 @@
This is the Micro Machine Man presenting the most midget miniature motorcade of Micro Machines. Each one has dramatic details, terrific trim, precision paint jobs, plus incredible Micro Machine Pocket Playsets. There's a police station, fire station, restaurant, service station, and more. Perfect pocket portables to take anyplace. And there are many miniature playsets to play with, and each one comes with its own special edition Micro Machine vehicle and fun, fantastic features that miraculously move. Raise the boat lift at the airport marina, man the gun turret at the army base, clean your car at the car wash, raise the toll bridge. And these playsets fit together to form a Micro Machine world. Micro Machine Pocket Playsets, so tremendously tiny, so perfectly precise, so dazzlingly detailed, you'll want to pocket them all. Micro Machines are Micro Machine Pocket Playsets sold separately from Galoob. The smaller they are, the better they are.

1
tests/es-0-ref.txt Normal file
View File

@ -0,0 +1 @@
Hola, como están todos? Mi nombre es Julián Virrueta Mendoza y en este podcast les vengo a hablar sobre la contaminación del agua. Bueno, empezaré por decir que el ser humano no está midiendo las consecuencias de sus actos. No hay duda que uno de los mayores problemas a los que se enfrentan muchas poblaciones actualmente es la contaminación del agua. Principalmente porque como bien sabemos el agua prácticamente es fundamental para la vida, por lo que la contaminación puede ser algo muy negativo para el desarrollo tanto económico como social de los pueblos o de las poblaciones próximas en ese lugar contaminado. Los comienzos de la contaminación, como lo definen muchos expertos en la materia, la contaminación del agua es causada por las actividades humanas. Es un fenómeno ambiental de importancia, el cual se comienza a producir desde los primeros intentos de industrialización para transformarse luego en un problema tan habitual como generalizado. Generalmente la contaminación del agua se produce a través de la introducción directa o indirecta en los acuíferos o caos de agua, ríos, mares, lagos, océanos, etc. o de diversas sustancias que pueden ser consideradas como contaminantes. Pero existen dos formas principales de contaminación del agua. Una de ellas tiene que ver con la contaminación natural del agua que se corresponde con el ciclo natural de esta durante el que puede entrar en contacto con ciertos constituyentes contaminantes como sustancias minerales y orgánicas disueltas o en suspensión que se vierten en la corteza terrestre, la atmósfera y en las aguas. Pero todo esto se puede contradecir si el ser humano comía sus consecuencias, si no tirara basura a los lagos, a los ríos, no tirara botes de aceite, no contaminara. Bueno amigos, yo los invito a que no contaminen el agua y que sepan cuidar la naturaleza. Los saluda su buen amigo y compañero Julián Virreta. Nos vemos. ¡Claro!

125
tests/run-tests.sh Executable file
View File

@ -0,0 +1,125 @@
#!/bin/bash
# This script runs the selected model against a collection of audio files from the web.
# It downloads, converts and transcribes each file and then compares the result with the expected reference
# transcription. The comparison is performed using git's diff command and shows the differences at the character level.
# It can be used to quickly verify that the model is working as expected across a wide range of audio files.
# I.e. like an integration test. The verification is done by visual inspection of the diff output.
#
# The reference data can be generated, for example, using the original OpenAI Whisper implementation, or entered manually.
#
# Feel free to suggest extra audio files to add to the list.
# Make sure they are between 1-3 minutes long since we don't want to make the test too slow.
#
# Usage:
#
# ./tests/run-tests.sh <model_name>
#
cd `dirname $0`
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
# list available models
function list_models {
printf "\n"
printf " Available models:"
for model in "${models[@]}"; do
printf " $model"
done
printf "\n\n"
}
if [ $# -eq 0 ]; then
printf "Usage: $0 [model]\n\n"
printf "No model specified. Aborting\n"
list_models
exit 1
fi
model=$1
main="../main"
if [ ! -f ../models/ggml-$model.bin ]; then
printf "Model $model not found. Aborting\n"
list_models
exit 1
fi
if [ ! -f $main ]; then
printf "Executable $main not found. Aborting\n"
exit 1
fi
# add various audio files for testing purposes here
# the order of the files is important so don't change the existing order
# when adding new files, make sure to add the expected "ref.txt" file with the correct transcript
urls_en=(
"https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg"
"https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg"
"https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav"
)
urls_es=(
"https://upload.wikimedia.org/wikipedia/commons/c/c1/La_contaminacion_del_agua.ogg"
)
urls_it=(
)
urls_pt=(
)
urls_de=(
)
urls_jp=(
)
urls_ru=(
)
function run_lang() {
lang=$1
shift
urls=("$@")
i=0
for url in "${urls[@]}"; do
echo "- [$lang] Processing '$url' ..."
ext="${url##*.}"
fname_src="$lang-${i}.${ext}"
fname_dst="$lang-${i}-16khz.wav"
if [ ! -f $fname_src ]; then
wget --quiet --show-progress -O $fname_src $url
fi
if [ ! -f $fname_dst ]; then
ffmpeg -loglevel -0 -y -i $fname_src -ar 16000 -ac 1 -c:a pcm_s16le $fname_dst
if [ $? -ne 0 ]; then
echo "Error: ffmpeg failed to convert $fname_src to $fname_dst"
exit 1
fi
fi
$main -m ../models/ggml-$model.bin -f $fname_dst -l $lang -otxt 2> /dev/null
git diff --no-index --word-diff=color --word-diff-regex=. $lang-$i-ref.txt $fname_dst.txt
i=$(($i+1))
done
}
run_lang "en" "${urls_en[@]}"
if [[ $model != *.en ]]; then
run_lang "es" "${urls_es[@]}"
run_lang "it" "${urls_it[@]}"
run_lang "pt" "${urls_pt[@]}"
run_lang "de" "${urls_de[@]}"
run_lang "jp" "${urls_jp[@]}"
run_lang "ru" "${urls_ru[@]}"
fi

View File

@ -424,6 +424,9 @@ struct whisper_context {
int64_t t_last;
whisper_token tid_last;
std::vector<float> energy; // PCM signal energy
// [EXPERIMENTAL] speed-up techniques
int32_t exp_n_audio_ctx; // 0 - use default
};
// load the model from a ggml file
@ -515,15 +518,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
wctx.buf_memory.resize(MEM_REQ_MEMORY.at(model.type));
wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
wctx.buf_compute_layer.resize(std::max(MEM_REQ_ENCODE_LAYER.at(model.type), MEM_REQ_DECODE_LAYER.at(model.type)));
// this is the total memory required to run the inference
const size_t mem_required =
wctx.buf_model->size() +
wctx.buf_memory.size() +
wctx.buf_compute.size() +
wctx.buf_compute_layer.size();
fprintf(stderr, "%s: mem_required = %.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
}
// load mel filters
@ -596,11 +590,21 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
}
}
{
// this is the total memory required to run the inference
const size_t mem_required =
wctx.buf_model->size() +
wctx.buf_memory.size() +
wctx.buf_compute.size() +
wctx.buf_compute_layer.size();
fprintf(stderr, "%s: mem_required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
}
// for the big tensors, we have the option to store the data in 16-bit floats
// in order to save memory and also to speed up the computation
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
size_t ctx_size = 0;
size_t ctx_mem_size = 0;
@ -613,7 +617,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
const int n_audio_state = hparams.n_audio_state;
const int n_audio_layer = hparams.n_audio_layer;
const int n_text_ctx = hparams.n_text_ctx;
const int n_text_ctx = hparams.n_text_ctx;
const int n_text_state = hparams.n_text_state;
const int n_text_layer = hparams.n_text_layer;
@ -719,7 +723,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
// create the ggml context
@ -748,7 +752,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
const int n_audio_state = hparams.n_audio_state;
const int n_audio_layer = hparams.n_audio_layer;
const int n_text_ctx = hparams.n_text_ctx;
const int n_text_ctx = hparams.n_text_ctx;
const int n_text_state = hparams.n_text_state;
const int n_text_layer = hparams.n_text_layer;
@ -967,7 +971,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
// key/value memory for the cross-attention layer
{
const int n_audio_ctx = hparams.n_audio_ctx;
const int n_audio_ctx = hparams.n_audio_ctx;
const int n_mem = n_text_layer*n_audio_ctx;
const int n_elements = n_text_state*n_mem;
@ -980,7 +984,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) +
ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
fprintf(stderr, "%s: memory size = %8.2f MB\n", __func__, memory_size/1024.0/1024.0);
fprintf(stderr, "%s: memory size = %7.2f MB\n", __func__, memory_size/1024.0/1024.0);
}
// load weights
@ -1039,12 +1043,12 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
//printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
//printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
total_size += ggml_nbytes(tensor);
model.n_loaded++;
}
fprintf(stderr, "%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
fprintf(stderr, "%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
if (model.n_loaded == 0) {
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
@ -1076,13 +1080,11 @@ static bool whisper_encode(
const auto & mel_inp = wctx.mel;
const auto & hparams = model.hparams;
const int n_ctx = hparams.n_audio_ctx;
const int n_ctx = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
const int n_state = hparams.n_audio_state;
const int n_head = hparams.n_audio_head;
const int n_layer = hparams.n_audio_layer;
const int N = n_ctx;
const int n_mels = hparams.n_mels;
assert(mel_inp.n_mel == n_mels);
@ -1132,7 +1134,30 @@ static bool whisper_encode(
cur = ggml_gelu(ctx0, cur);
}
cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
// ===================================================================
// NOTE: experimenting with partial evaluation of the encoder (ignore)
//static int iter = -1;
//const int n_iter = 1500/n_ctx;
//iter = (iter + 1) % n_iter;
//if (iter == 0) {
// memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
// memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
//}
static int iter = 0;
const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
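// the view above selects rows [iter*n_ctx, (iter+1)*n_ctx) of the positional embedding;
// with iter fixed at 0 this is simply the first n_ctx rows, which is what allows running
// the encoder with a reduced audio context (exp_n_audio_ctx)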
cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
// ===================================================================
// original:
//cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
struct ggml_tensor * inpL = cur;
@ -1198,14 +1223,14 @@ static bool whisper_encode(
ggml_permute(ctxL,
ggml_cpy(ctxL,
Qcur,
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
struct ggml_tensor * K =
ggml_permute(ctxL,
ggml_cpy(ctxL,
Kcur,
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
struct ggml_tensor * V =
@ -1213,9 +1238,9 @@ static bool whisper_encode(
ggml_permute(ctxL,
ggml_reshape_3d(ctxL,
Vcur,
n_state/n_head, n_head, N),
n_state/n_head, n_head, n_ctx),
1, 2, 0, 3),
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, N, n_state/n_head, n_head)
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_ctx, n_state/n_head, n_head)
);
struct ggml_tensor * KQV = ggml_flash_attn(ctxL, Q, K, V, false);
@ -1224,14 +1249,14 @@ static bool whisper_encode(
ggml_permute(ctxL,
ggml_cpy(ctxL,
Qcur,
ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, N)),
ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
struct ggml_tensor * K =
ggml_permute(ctxL,
ggml_cpy(ctxL,
Kcur,
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
// K * Q
@ -1249,7 +1274,7 @@ static bool whisper_encode(
// ggml_permute(ctxL,
// ggml_cpy(ctxL,
// Vcur,
// ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
// ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
// 1, 2, 0, 3);
//struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max);
@ -1259,9 +1284,9 @@ static bool whisper_encode(
ggml_permute(ctxL,
ggml_reshape_3d(ctxL,
Vcur,
n_state/n_head, n_head, N),
n_state/n_head, n_head, n_ctx),
0, 2, 1, 3),
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, N, n_head)
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_ctx, n_head)
);
struct ggml_tensor * KQV = ggml_mul_mat(ctxL, ggml_transpose(ctxL, V), KQ_soft_max);
@ -1271,7 +1296,7 @@ static bool whisper_encode(
cur = ggml_cpy(ctxL,
KQV_merged,
ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, N));
ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, n_ctx));
}
// projection
@ -1425,6 +1450,8 @@ static bool whisper_encode(
Vcross),
Vcross);
//struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_cross_k, n_state*n_ctx, (ggml_element_size(model.memory_cross_k)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
//struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_cross_v, n_state*n_ctx, (ggml_element_size(model.memory_cross_v)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_cross_k, n_state*n_ctx, (ggml_element_size(model.memory_cross_k)*n_state)*(il*n_ctx));
struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_cross_v, n_state*n_ctx, (ggml_element_size(model.memory_cross_v)*n_state)*(il*n_ctx));
@ -1474,7 +1501,7 @@ static bool whisper_decode(
const int n_layer = hparams.n_text_layer;
const int N = n_tokens;
const int M = hparams.n_audio_ctx;
const int M = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
struct ggml_init_params params = {
.mem_size = wctx.buf_compute.size(),
@ -1819,7 +1846,9 @@ static bool whisper_decode(
// the most basic sampling scheme - select the top token
static whisper_token_data whisper_sample_best(
const whisper_vocab & vocab,
const float * probs) {
const float * probs,
bool force_timestamp,
bool is_initial) {
whisper_token_data result = {
0, 0, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f,
};
@ -1842,7 +1871,18 @@ static whisper_token_data whisper_sample_best(
max_tx = std::max(max_tx, probs_id[i].first);
}
for (int i = vocab.token_beg; i < n_logits; i++) {
const auto i0 = is_initial ? vocab.token_beg + 101 : vocab.token_beg;
const auto i1 = is_initial ? vocab.token_beg + 101 : n_logits;
// the initial timestamp cannot be larger than 100
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L426-L429
if (is_initial) {
for (int i = i0; i < n_logits; ++ i) {
probs_id[i].first = -INFINITY;
}
}
for (int i = vocab.token_beg; i < i1; i++) {
sum_ts += probs_id[i].first;
if (probs_id[i].first > max_ts) {
max_ts = probs_id[i].first;
@ -1852,7 +1892,7 @@ static whisper_token_data whisper_sample_best(
// if the probability sum of all timestamp tokens is higher than the max probability of the text tokens - sample a
// timestamp token
if (sum_ts > max_tx) {
if (sum_ts > max_tx || force_timestamp) {
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L430-L438
for (int i = 0; i < vocab.token_beg; i++) {
probs_id[i].first = -INFINITY;
@ -1894,39 +1934,6 @@ static whisper_token_data whisper_sample_best(
return result;
}
// samples only from the timestamps tokens
static whisper_vocab::id whisper_sample_timestamp(
const whisper_vocab & vocab,
const float * probs) {
int n_logits = vocab.id_to_token.size();
std::vector<std::pair<double, whisper_vocab::id>> probs_id;
probs_id.reserve(n_logits);
for (int i = vocab.token_beg + 1; i < n_logits; i++) {
probs_id.push_back(std::make_pair(probs[i], i));
}
const int top_k = 10;
// find the top K tokens
std::partial_sort(
probs_id.begin(),
probs_id.begin() + top_k, probs_id.end(),
[](const std::pair<double, whisper_vocab::id> & a, const std::pair<double, whisper_vocab::id> & b) {
return a.first > b.first;
});
probs_id.resize(top_k);
//printf("\n");
//for (int i = 0; i < (int) probs_id.size(); i++) {
// printf("%d: '%s' %f, %d\n", i, vocab.id_to_token.at(probs_id[i].second).c_str(), probs_id[i].first, probs_id[i].second);
//}
return probs_id[0].second;
}
// 500 -> 00:05.000
// 6000 -> 01:00.000
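// t is measured in ticks of 10 ms (100 ticks per second)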
static std::string to_timestamp(int64_t t, bool comma = false) {
@ -2031,6 +2038,7 @@ static bool log_mel_spectrogram(
const int n_mel,
const int n_threads,
const whisper_filters & filters,
const bool speed_up,
whisper_mel & mel) {
// Hanning window
@ -2044,7 +2052,7 @@ static bool log_mel_spectrogram(
mel.n_len = (n_samples)/fft_step;
mel.data.resize(mel.n_mel*mel.n_len);
const int n_fft = 1 + fft_size/2;
const int n_fft = 1 + (speed_up ? fft_size/4 : fft_size/2);
//printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
//printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);
@ -2091,6 +2099,13 @@ static bool log_mel_spectrogram(
//}
}
if (speed_up) {
// scaling down in the frequency domain results in a speed-up in the time domain
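// with the speed-up path, log_mel_spectrogram() is called with 2*WHISPER_N_FFT and
// 2*WHISPER_HOP_LENGTH (see whisper_pcm_to_mel_phase_vocoder below), so averaging
// adjacent bins restores the usual number of frequency bins while each mel frame
// now covers 20 ms of audio that is later decoded as if it were 10 ms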
for (int j = 0; j < n_fft; j++) {
fft_out[j] = 0.5*(fft_out[2*j] + fft_out[2*j + 1]);
}
}
// mel spectrogram
for (int j = 0; j < mel.n_mel; j++) {
double sum = 0.0;
@ -2161,6 +2176,12 @@ struct whisper_context * whisper_init(const char * path_model) {
void whisper_free(struct whisper_context * ctx) {
if (ctx) {
if (ctx->model.ctx) {
ggml_free(ctx->model.ctx);
}
if (ctx->model.ctx_mem) {
ggml_free(ctx->model.ctx_mem);
}
if (ctx->buf_model) {
delete ctx->buf_model;
}
@ -2171,7 +2192,21 @@ void whisper_free(struct whisper_context * ctx) {
int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
const int64_t t_start_us = ggml_time_us();
if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, ctx->mel)) {
if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, ctx->mel)) {
fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
return -1;
}
ctx->t_mel_us = ggml_time_us() - t_start_us;
return 0;
}
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
const int64_t t_start_us = ggml_time_us();
if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, 2*WHISPER_N_FFT, 2*WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, ctx->mel)) {
fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
return -1;
}
@ -2229,19 +2264,17 @@ int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, i
struct whisper_token_data whisper_sample_best(struct whisper_context * ctx) {
const int64_t t_start_sample_us = ggml_time_us();
// TODO: simplify
auto res = whisper_sample_best(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab));
const auto res = whisper_sample_best(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab), false, false);
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
return res;
}
whisper_token whisper_sample_timestamp(struct whisper_context * ctx) {
struct whisper_token_data whisper_sample_timestamp(struct whisper_context * ctx, bool is_initial) {
const int64_t t_start_sample_us = ggml_time_us();
// TODO: simplify
auto res = whisper_sample_timestamp(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab));
const auto res = whisper_sample_best(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab), true, is_initial);
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@ -2305,11 +2338,11 @@ whisper_token whisper_token_beg(struct whisper_context * ctx) {
return ctx->vocab.token_beg;
}
whisper_token whisper_token_translate() {
whisper_token whisper_token_translate(void) {
return whisper_vocab::token_translate;
}
whisper_token whisper_token_transcribe() {
whisper_token whisper_token_transcribe(void) {
return whisper_vocab::token_transcribe;
}
@ -2325,6 +2358,28 @@ void whisper_print_timings(struct whisper_context * ctx) {
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
}
void whisper_reset_timings(struct whisper_context * ctx) {
ctx->t_sample_us = 0;
ctx->t_encode_us = 0;
ctx->t_decode_us = 0;
}
const char * whisper_print_system_info(void) {
static std::string s;
s = "";
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
return s.c_str();
}
////////////////////////////////////////////////////////////////////////////
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
@ -2334,75 +2389,99 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
case WHISPER_SAMPLING_GREEDY:
{
result = {
/*.strategy =*/ WHISPER_SAMPLING_GREEDY,
/*.strategy =*/ WHISPER_SAMPLING_GREEDY,
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
/*.n_max_text_ctx =*/ 16384,
/*.offset_ms =*/ 0,
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
/*.n_max_text_ctx =*/ 16384,
/*.offset_ms =*/ 0,
/*.duration_ms =*/ 0,
/*.translate =*/ false,
/*.no_context =*/ false,
/*.print_special_tokens =*/ false,
/*.print_progress =*/ true,
/*.print_realtime =*/ false,
/*.print_timestamps =*/ true,
/*.translate =*/ false,
/*.no_context =*/ false,
/*.single_segment =*/ false,
/*.print_special =*/ false,
/*.print_progress =*/ true,
/*.print_realtime =*/ false,
/*.print_timestamps =*/ true,
/*.token_timestamps =*/ false,
/*.thold_pt =*/ 0.01f,
/*.thold_ptsum =*/ 0.01f,
/*.max_len =*/ 0,
/*.token_timestamps =*/ false,
/*.thold_pt =*/ 0.01f,
/*.thold_ptsum =*/ 0.01f,
/*.max_len =*/ 0,
/*.max_tokens =*/ 0,
/*.language =*/ "en",
/*.speed_up =*/ false,
/*.audio_ctx =*/ 0,
/*.greedy =*/ {
/*.prompt_tokens =*/ nullptr,
/*.prompt_n_tokens =*/ 0,
/*.language =*/ "en",
/*.greedy =*/ {
/*.n_past =*/ 0,
},
/*.beam_search =*/ {
/*.beam_search =*/ {
/*.n_past =*/ -1,
/*.beam_width =*/ -1,
/*.n_best =*/ -1,
},
/*.new_segment_callback =*/ nullptr,
/*.new_segment_callback =*/ nullptr,
/*.new_segment_callback_user_data =*/ nullptr,
/*.encoder_begin_callback =*/ nullptr,
/*.encoder_begin_callback_user_data =*/ nullptr,
};
} break;
case WHISPER_SAMPLING_BEAM_SEARCH:
{
result = {
/*.strategy =*/ WHISPER_SAMPLING_BEAM_SEARCH,
/*.strategy =*/ WHISPER_SAMPLING_BEAM_SEARCH,
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
/*.n_max_text_ctx =*/ 16384,
/*.offset_ms =*/ 0,
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
/*.n_max_text_ctx =*/ 16384,
/*.offset_ms =*/ 0,
/*.duration_ms =*/ 0,
/*.translate =*/ false,
/*.no_context =*/ false,
/*.print_special_tokens =*/ false,
/*.print_progress =*/ true,
/*.print_realtime =*/ false,
/*.print_timestamps =*/ true,
/*.translate =*/ false,
/*.no_context =*/ false,
/*.single_segment =*/ false,
/*.print_special =*/ false,
/*.print_progress =*/ true,
/*.print_realtime =*/ false,
/*.print_timestamps =*/ true,
/*.token_timestamps =*/ false,
/*.thold_pt =*/ 0.01f,
/*.thold_ptsum =*/ 0.01f,
/*.max_len =*/ 0,
/*.token_timestamps =*/ false,
/*.thold_pt =*/ 0.01f,
/*.thold_ptsum =*/ 0.01f,
/*.max_len =*/ 0,
/*.max_tokens =*/ 0,
/*.language =*/ "en",
/*.speed_up =*/ false,
/*.audio_ctx =*/ 0,
/*.greedy =*/ {
/*.prompt_tokens =*/ nullptr,
/*.prompt_n_tokens =*/ 0,
/*.language =*/ "en",
/*.greedy =*/ {
/*.n_past =*/ -1,
},
/*.beam_search =*/ {
/*.beam_search =*/ {
/*.n_past =*/ 0,
/*.beam_width =*/ 10,
/*.n_best =*/ 5,
},
/*.new_segment_callback =*/ nullptr,
/*.new_segment_callback =*/ nullptr,
/*.new_segment_callback_user_data =*/ nullptr,
/*.encoder_begin_callback =*/ nullptr,
/*.encoder_begin_callback_user_data =*/ nullptr,
};
} break;
}
@ -2483,9 +2562,16 @@ int whisper_full(
result_all.clear();
// compute log mel spectrogram
if (whisper_pcm_to_mel(ctx, samples, n_samples, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
return -1;
if (params.speed_up) {
if (whisper_pcm_to_mel_phase_vocoder(ctx, samples, n_samples, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
return -1;
}
} else {
if (whisper_pcm_to_mel(ctx, samples, n_samples, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
return -1;
}
}
if (params.token_timestamps) {
@ -2496,11 +2582,12 @@ int whisper_full(
}
const int seek_start = params.offset_ms/10;
const int seek_end = seek_start + (params.duration_ms == 0 ? whisper_n_len(ctx) : params.duration_ms/10);
// if the length of the spectrogram is less than 1 s (100 frames), then return
// basically don't process anything that is less than 1s
// see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
if (whisper_n_len(ctx) < 100 + seek_start) {
if (seek_end < 100 + seek_start) {
return 0;
}
@ -2510,6 +2597,18 @@ int whisper_full(
prompt_past.clear();
}
// prepend the prompt tokens to the prompt_past
if (params.prompt_tokens && params.prompt_n_tokens > 0) {
// parse tokens from the pointer
for (int i = 0; i < params.prompt_n_tokens; i++) {
prompt_past.push_back(params.prompt_tokens[i]);
}
std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
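// std::rotate moves the freshly appended prompt tokens to the front of prompt_past,
// so they precede any text context carried over from a previous call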
}
// overwrite audio_ctx
ctx->exp_n_audio_ctx = params.audio_ctx;
// these tokens determine the task that will be performed
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
if (whisper_is_multilingual(ctx)) {
@ -2533,7 +2632,7 @@ int whisper_full(
// main loop
int seek = seek_start;
while (true) {
int progress_cur = (100*seek)/whisper_n_len(ctx);
const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
while (progress_cur >= progress_prev + progress_step) {
progress_prev += progress_step;
if (params.print_progress) {
@ -2541,10 +2640,17 @@ int whisper_full(
}
}
if (seek + 100 >= whisper_n_len(ctx)) {
if (seek + 100 >= seek_end) {
break;
}
if (params.encoder_begin_callback) {
if (params.encoder_begin_callback(ctx, params.encoder_begin_callback_user_data) == false) {
fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__);
break;
}
}
// encode audio features starting at offset seek
if (whisper_encode(ctx, seek, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to encode\n", __func__);
@ -2567,7 +2673,6 @@ int whisper_full(
prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
bool done = false;
int seek_delta = 100*WHISPER_CHUNK_SIZE;
// print the prompt
@ -2581,7 +2686,9 @@ int whisper_full(
int result_len = 0;
tokens_cur.clear();
for (int i = 0; i < whisper_n_text_ctx(ctx)/2 - 4; ++i) {
bool failed = false;
for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
if (whisper_decode(ctx, prompt.data(), prompt.size(), n_past, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to decode\n", __func__);
return 8;
@ -2598,15 +2705,19 @@ int whisper_full(
// feel free to experiment!
//
{
auto token = whisper_sample_best(ctx);
if (i == 0) {
token.tid = whisper_token_beg(ctx);
}
const auto token = (i == 0) ? whisper_sample_timestamp(ctx, true) : whisper_sample_best(ctx);
// timestamp token - update sliding window
if (token.id > whisper_token_beg(ctx)) {
seek_delta = 2*(token.id - whisper_token_beg(ctx));
const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx));
// do not allow going back in time
if (seek_delta != 100*WHISPER_CHUNK_SIZE &&
seek_delta > seek_delta_new && result_len < i) {
break;
}
seek_delta = seek_delta_new;
result_len = i + 1;
}
@ -2616,19 +2727,25 @@ int whisper_full(
//{
// const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token[token.tid] : "[?]";
// printf("%s: %10s %6.3f '%s'\n", __func__, tt.c_str(), token.pt, ctx->vocab.id_to_token[token.id].c_str());
// printf("%s: %10s %6d %6.3f '%s'\n", __func__, tt.c_str(), token.id, token.pt, ctx->vocab.id_to_token[token.id].c_str());
//}
// end of text token
if (token.id == whisper_token_eot(ctx)) {
if (token.id == whisper_token_eot(ctx) || (params.max_tokens > 0 && i > params.max_tokens)) {
if (result_len == 0) {
if (seek + seek_delta + 100 >= whisper_n_len(ctx)) {
if (seek + seek_delta + 100 >= seek_end) {
result_len = i + 1;
} else {
// TODO: figure out how to resolve this
fprintf(stderr, "\n%s: failed to generate timestamp token - this should not happen\n\n", __func__);
failed = true;
break;
}
}
if (params.single_segment) {
result_len = i + 1;
seek_delta = 100*WHISPER_CHUNK_SIZE;
}
break;
}
@ -2639,11 +2756,21 @@ int whisper_full(
}
}
if (done) {
// sometimes, the decoding can get stuck in a repetition loop
// this is a simple strategy to avoid such cases - we simply flag the decoding as failed and advance
// the sliding window by 1 second
if (i == n_max - 1 && (result_len == 0 || seek_delta < 100*WHISPER_CHUNK_SIZE/2)) {
failed = true;
break;
}
}
if (failed) {
fprintf(stderr, "\n%s: failed to generate timestamp token - using fallback strategy\n\n", __func__);
seek += 100;
continue;
}
// shrink down to result_len
tokens_cur.resize(result_len);
@ -2663,23 +2790,26 @@ int whisper_full(
// ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
// ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
if (params.print_special_tokens == false && tokens_cur[i].id >= whisper_token_eot(ctx)) {
if (params.print_special == false && tokens_cur[i].id >= whisper_token_eot(ctx)) {
} else {
text += whisper_token_to_str(ctx, tokens_cur[i].id);
}
if (tokens_cur[i].id > whisper_token_beg(ctx)) {
if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
if (!text.empty()) {
const auto tt0 = params.speed_up ? 2*t0 : t0;
const auto tt1 = params.speed_up ? 2*t1 : t1;
if (params.print_realtime) {
if (params.print_timestamps) {
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text.c_str());
printf("[%s --> %s] %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
} else {
printf("%s", text.c_str());
fflush(stdout);
}
}
result_all.push_back({ t0, t1, text, {} });
result_all.push_back({ tt0, tt1, text, {} });
for (int j = i0; j <= i; j++) {
result_all.back().tokens.push_back(tokens_cur[j]);
}
@ -2711,16 +2841,19 @@ int whisper_full(
if (!text.empty()) {
const auto t1 = seek + seek_delta;
const auto tt0 = params.speed_up ? 2*t0 : t0;
const auto tt1 = params.speed_up ? 2*t1 : t1;
if (params.print_realtime) {
if (params.print_timestamps) {
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text.c_str());
printf("[%s --> %s] %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
} else {
printf("%s", text.c_str());
fflush(stdout);
}
}
result_all.push_back({ t0, t1, text, {} });
result_all.push_back({ tt0, tt1, text, {} });
for (int j = i0; j < (int) tokens_cur.size(); j++) {
result_all.back().tokens.push_back(tokens_cur[j]);
}
@ -2752,7 +2885,7 @@ int whisper_full_parallel(
struct whisper_full_params params,
const float * samples,
int n_samples,
const int n_processors) {
int n_processors) {
if (n_processors == 1) {
return whisper_full(ctx, params, samples, n_samples);
}
@ -2802,7 +2935,7 @@ int whisper_full_parallel(
// key/value memory for the cross-attention layer
{
const int n_audio_ctx = hparams.n_audio_ctx;
const int n_audio_ctx = hparams.n_audio_ctx;
const int n_mem = n_text_layer*n_audio_ctx;
const int n_elements = n_text_state*n_mem;
@ -2810,10 +2943,6 @@ int whisper_full_parallel(
model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
}
const size_t memory_size =
ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) +
ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v);
}
}
@ -2933,20 +3062,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
return ctx->result_all[i_segment].tokens[i_token].p;
}
const char * whisper_print_system_info() {
static std::string s;
s = "";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
return s.c_str();
}
// =================================================================================================
//
@ -3033,9 +3148,6 @@ static void whisper_exp_compute_token_level_timestamps(
const int64_t t0 = segment.t0;
const int64_t t1 = segment.t1;
const int s0 = timestamp_to_sample(t0, n_samples);
const int s1 = timestamp_to_sample(t1, n_samples);
const int n = tokens.size();
if (n == 0) {

View File

@ -72,16 +72,16 @@ extern "C" {
whisper_token id; // token id
whisper_token tid; // forced timestamp token id
float p; // probability of the token
float pt; // probability of the timestamp token
float ptsum; // sum of probabilities of all timestamp tokens
float p; // probability of the token
float pt; // probability of the timestamp token
float ptsum; // sum of probabilities of all timestamp tokens
// token-level timestamp data
// do not use if you haven't computed token-level timestamps
int64_t t0; // start time of the token
int64_t t1; // end time of the token
int64_t t0; // start time of the token
int64_t t1; // end time of the token
float vlen; // voice length of the token
float vlen; // voice length of the token
} whisper_token_data;
// Allocates all memory needed for the model and loads the model from the given file.
@ -96,9 +96,9 @@ extern "C" {
// Returns 0 on success
WHISPER_API int whisper_pcm_to_mel(
struct whisper_context * ctx,
const float * samples,
int n_samples,
int n_threads);
const float * samples,
int n_samples,
int n_threads);
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
@ -106,9 +106,9 @@ extern "C" {
// Returns 0 on success
WHISPER_API int whisper_set_mel(
struct whisper_context * ctx,
const float * data,
int n_len,
int n_mel);
const float * data,
int n_len,
int n_mel);
// Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
@ -116,8 +116,8 @@ extern "C" {
// Returns 0 on success
WHISPER_API int whisper_encode(
struct whisper_context * ctx,
int offset,
int n_threads);
int offset,
int n_threads);
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
// Make sure to call whisper_encode() first.
@ -126,10 +126,10 @@ extern "C" {
// Returns 0 on success
WHISPER_API int whisper_decode(
struct whisper_context * ctx,
const whisper_token * tokens,
int n_tokens,
int n_past,
int n_threads);
const whisper_token * tokens,
int n_tokens,
int n_past,
int n_threads);
// Token sampling methods.
// These are provided for convenience and can be used after each call to whisper_decode().
@ -137,7 +137,7 @@ extern "C" {
// whisper_sample_best() returns the token with the highest probability
// whisper_sample_timestamp() returns the most probable timestamp token
WHISPER_API whisper_token_data whisper_sample_best(struct whisper_context * ctx);
WHISPER_API whisper_token whisper_sample_timestamp(struct whisper_context * ctx);
WHISPER_API whisper_token_data whisper_sample_timestamp(struct whisper_context * ctx, bool is_initial);
// Return the id of the specified language, returns -1 if not found
WHISPER_API int whisper_lang_id(const char * lang);
@ -162,11 +162,15 @@ extern "C" {
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
// Task tokens
WHISPER_API whisper_token whisper_token_translate ();
WHISPER_API whisper_token whisper_token_transcribe();
WHISPER_API whisper_token whisper_token_translate (void);
WHISPER_API whisper_token whisper_token_transcribe(void);
// Performance information
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
// Print system information
WHISPER_API const char * whisper_print_system_info(void);
////////////////////////////////////////////////////////////////////////////
@ -181,16 +185,26 @@ extern "C" {
// Use the whisper_full_...() functions to obtain the text segments
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
// Encoder begin callback
// If not NULL, called before the encoder starts
// If it returns false, the computation is aborted
typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
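// a minimal, hedged sketch (illustration only, not part of this header) of using the
// callback to stop whisper_full() from another thread; the names below are assumptions
#include <atomic>
static std::atomic<bool> g_abort(false);
static bool abort_cb(struct whisper_context * ctx, void * user_data) {
    (void) ctx;
    (void) user_data;
    // returning false makes whisper_full() stop before running the encoder again
    return !g_abort.load();
}
// hook it up via the parameters below:
//   wparams.encoder_begin_callback           = abort_cb;
//   wparams.encoder_begin_callback_user_data = NULL;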
// Parameters for the whisper_full() function
// If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
// whisper_full_default_params()
struct whisper_full_params {
enum whisper_sampling_strategy strategy;
int n_threads;
int n_max_text_ctx;
int offset_ms;
int offset_ms; // start offset in ms
int duration_ms; // audio duration to process in ms
bool translate;
bool no_context;
bool print_special_tokens;
bool single_segment; // force single segment output (useful for streaming)
bool print_special;
bool print_progress;
bool print_realtime;
bool print_timestamps;
@ -200,6 +214,16 @@ extern "C" {
float thold_pt; // timestamp token probability threshold (~0.01)
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
int max_len; // max segment length in characters
int max_tokens; // max tokens per segment (0 = no limit)
// [EXPERIMENTAL] speed-up techniques
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
int audio_ctx; // overwrite the audio context size (0 = use default)
// tokens to provide to the whisper model as an initial prompt
// these are prepended to any existing text context from a previous call
const whisper_token * prompt_tokens;
int prompt_n_tokens;
const char * language;
@ -215,6 +239,9 @@ extern "C" {
whisper_new_segment_callback new_segment_callback;
void * new_segment_callback_user_data;
whisper_encoder_begin_callback encoder_begin_callback;
void * encoder_begin_callback_user_data;
};
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
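// a hedged example (illustration only, not part of this header) of filling in the new
// fields; my_prompt_tokens / my_n_prompt_tokens are placeholders for tokens obtained elsewhere
static struct whisper_full_params make_stream_params(const whisper_token * my_prompt_tokens, int my_n_prompt_tokens) {
    struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    wparams.single_segment  = true;              // one segment per call - handy for streaming
    wparams.max_tokens      = 32;                // cap the number of tokens per segment
    wparams.speed_up        = true;              // [EXPERIMENTAL] 2x speed-up via the modified mel stage
    wparams.audio_ctx       = 512;               // [EXPERIMENTAL] reduced encoder context
    wparams.prompt_tokens   = my_prompt_tokens;  // prepended to the text context
    wparams.prompt_n_tokens = my_n_prompt_tokens;
    return wparams;
}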
@ -222,20 +249,20 @@ extern "C" {
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
// Uses the specified decoding strategy to obtain the text.
WHISPER_API int whisper_full(
struct whisper_context * ctx,
struct whisper_full_params params,
const float * samples,
int n_samples);
struct whisper_context * ctx,
struct whisper_full_params params,
const float * samples,
int n_samples);
// Split the input audio in chunks and process each chunk separately using whisper_full()
// It seems this approach can offer some speedup in some cases.
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
WHISPER_API int whisper_full_parallel(
struct whisper_context * ctx,
struct whisper_full_params params,
const float * samples,
int n_samples,
const int n_processors);
struct whisper_context * ctx,
struct whisper_full_params params,
const float * samples,
int n_samples,
int n_processors);
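// a minimal, hedged usage sketch (illustration only, not part of this header); error
// handling is trimmed and the model path / PCM buffer are placeholders
#include <stdio.h>
static void transcribe_example(const float * pcmf32, int n_samples) {
    struct whisper_context * wctx = whisper_init("models/ggml-base.en.bin");
    struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    // pcmf32 is assumed to hold 16 kHz mono float PCM
    if (whisper_full_parallel(wctx, wparams, pcmf32, n_samples, 4) == 0) {
        const int n_segments = whisper_full_n_segments(wctx);
        for (int i = 0; i < n_segments; ++i) {
            printf("%s\n", whisper_full_get_segment_text(wctx, i));
        }
    }
    whisper_free(wctx);
}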
// Number of generated text segments.
// A segment can be a few words, a sentence, or even a paragraph.
@ -262,9 +289,6 @@ extern "C" {
// Get the probability of the specified token in the specified segment.
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
// Print system information
WHISPER_API const char * whisper_print_system_info();
#ifdef __cplusplus
}
#endif