Mirror of https://github.com/ggerganov/whisper.cpp.git — synced 2025-06-24 17:15:19 +00:00

Compare commits — 7 commits:

- 5d895d60b6
- b71d45beff
- c4350356de
- 361395187d
- 7fc52fa7ef
- 01e037c6c6
- 95f4fc70ca

.github/workflows/build.yml (vendored) — 122 changed lines
@ -113,125 +113,3 @@ jobs:
      run: |
        make
        ctest -L gh --output-on-failure

  windows:
    runs-on: windows-latest

    strategy:
      matrix:
        build: [Release]
        arch: [Win32, x64]
        sdl2: [ON]
        include:
          - arch: Win32
            s2arc: x86
          - arch: x64
            s2arc: x64
          - sdl2: ON
            s2ver: 2.26.0

    steps:
      - name: Clone
        uses: actions/checkout@v1

      - name: Add msbuild to PATH
        uses: microsoft/setup-msbuild@v1

      - name: Fetch SDL2 and set SDL2_DIR
        if: matrix.sdl2 == 'ON'
        run: |
          C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
          7z x sdl2.zip
          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV

      - name: Configure
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
          -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}

      - name: Build
        run: |
          cd ./build
          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}

      - name: Copy SDL2.dll
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

      - name: Upload binaries
        if: matrix.sdl2 == 'ON'
        uses: actions/upload-artifact@v1
        with:
          name: whisper-bin-${{ matrix.arch }}
          path: build/bin/${{ matrix.build }}

  windows-blas:
    runs-on: windows-latest

    strategy:
      matrix:
        build: [Release]
        arch: [Win32, x64]
        blas: [ON]
        sdl2: [ON]
        include:
          - arch: Win32
            obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
            s2arc: x86
          - arch: x64
            obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
            s2arc: x64
          - sdl2: ON
            s2ver: 2.26.0

    steps:
      - name: Clone
        uses: actions/checkout@v1

      - name: Add msbuild to PATH
        uses: microsoft/setup-msbuild@v1

      - name: Fetch OpenBLAS
        if: matrix.blas == 'ON'
        run: |
          C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
          7z x blas.zip -oblas -y
          copy blas/include/cblas.h .
          copy blas/include/openblas_config.h .
          echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV

      - name: Fetch SDL2 and set SDL2_DIR
        if: matrix.sdl2 == 'ON'
        run: |
          C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
          7z x sdl2.zip
          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV

      - name: Configure
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
          -DWHISPER_SUPPORT_OPENBLAS=${{ matrix.blas }}
          -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
          -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}

      - name: Build
        run: |
          cd ./build
          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}

      - name: Copy libopenblas.dll
        if: matrix.blas == 'ON'
        run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}

      - name: Copy SDL2.dll
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

      - name: Upload binaries
        if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
        uses: actions/upload-artifact@v1
        with:
          name: whisper-blas-bin-${{ matrix.arch }}
          path: build/bin/${{ matrix.build }}
.gitignore (vendored) — 13 changed lines

@ -11,19 +11,12 @@ build-release/
build-sanitize-addr/
build-sanitize-thread/

/main
/stream
/command
/talk
/bench

main
stream
bench
sync.sh
libwhisper.so
compile_commands.json

examples/arm_neon.h
examples/whisper.objc/whisper.objc.xcodeproj/xcshareddata
examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata

extra/bench-gg.txt
.gitmodules (vendored) — 3 changed lines

@ -1,3 +0,0 @@
[submodule "bindings/ios"]
	path = bindings/ios
	url = https://github.com/ggerganov/whisper.spm
CMakeLists.txt

@ -1,6 +1,5 @@
cmake_minimum_required (VERSION 3.0)

project(whisper.cpp VERSION 1.0.4)
project(whisper.cpp VERSION 1.0.0)

set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

@ -10,12 +9,6 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(WHISPER_STANDALONE ON)
    include(cmake/GitVars.cmake)
    include(cmake/BuildTypes.cmake)

    # configure project version
    if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")
        configure_file(${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl ${CMAKE_SOURCE_DIR}/bindings/ios/Makefile @ONLY)
    endif()
    configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/package-tmpl.json ${CMAKE_SOURCE_DIR}/bindings/javascript/package.json @ONLY)
else()
    set(WHISPER_STANDALONE OFF)
endif()

@ -50,8 +43,6 @@ option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF)

if (APPLE)
    option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
    option(WHISPER_NO_AVX        "whisper: disable AVX" OFF)
    option(WHISPER_NO_AVX2       "whisper: disable AVX2" OFF)
else()
    option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
endif()

@ -83,7 +74,7 @@ endif()
# dependencies

set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD 20)

find_package(Threads REQUIRED)

@ -101,9 +92,7 @@ if (APPLE AND NOT WHISPER_NO_ACCELERATE)
endif()

if (WHISPER_SUPPORT_OPENBLAS)
    find_library(OPENBLAS_LIB
        NAMES openblas libopenblas
        )
    find_library(OPENBLAS_LIB openblas)
    if (OPENBLAS_LIB)
        message(STATUS "OpenBLAS found")

@ -149,20 +138,15 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
else()
    message(STATUS "x86 detected")
    if (MSVC)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
        set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /arch:AVX2")
    else()
        if (EMSCRIPTEN)
            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
            # we require support for WASM SIMD 128-bit
            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -msimd128")
            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
        else()
            if(NOT WHISPER_NO_AVX)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
            endif()
            if(NOT WHISPER_NO_AVX2)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
            endif()
            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c")
            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
        endif()
    endif()
endif()

@ -204,10 +188,6 @@ if (BUILD_SHARED_LIBS)
    )
endif()

if (EMSCRIPTEN)
    set_target_properties(${TARGET} PROPERTIES COMPILE_FLAGS "-msimd128")
endif()

target_compile_definitions(${TARGET} PUBLIC
    ${WHISPER_EXTRA_FLAGS}
    )

@ -227,11 +207,13 @@ add_subdirectory(bindings)
# programs, examples and tests
#

if (WHISPER_BUILD_TESTS)
    enable_testing()
    add_subdirectory(tests)
endif ()
if (WHISPER_STANDALONE)
    if (WHISPER_BUILD_TESTS)
        enable_testing()
        add_subdirectory(tests)
    endif ()

    if (WHISPER_BUILD_EXAMPLES)
        add_subdirectory(examples)
    endif()
if (WHISPER_BUILD_EXAMPLES)
    add_subdirectory(examples)
endif()
endif ()
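A side note on the AVX-related switches toggled above (`WHISPER_NO_AVX`, `/arch:AVX2`, `-mavx -mavx2 -mfma -mf16c`): these are compile-time choices, so a binary built with them only runs on CPUs that actually provide those extensions. A minimal sketch, not part of whisper.cpp, showing how the host CPU can be checked at runtime with the GCC/Clang builtin `__builtin_cpu_supports`:

```cpp
// Illustration only (assumes GCC or Clang on x86): report which of the
// instruction-set extensions assumed by the build flags are present.
#include <cstdio>

int main() {
    printf("AVX  : %d\n", __builtin_cpu_supports("avx")  ? 1 : 0);
    printf("AVX2 : %d\n", __builtin_cpu_supports("avx2") ? 1 : 0);
    printf("FMA  : %d\n", __builtin_cpu_supports("fma")  ? 1 : 0);
    printf("F16C : %d\n", __builtin_cpu_supports("f16c") ? 1 : 0);
    return 0;
}
```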
106
Makefile
106
Makefile
@ -27,8 +27,8 @@ endif
|
||||
# Compile flags
|
||||
#
|
||||
|
||||
CFLAGS = -I. -O3 -std=c11 -fPIC
|
||||
CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
|
||||
CFLAGS = -I. -O3 -std=c11
|
||||
CXXFLAGS = -I. -I./examples -O3 -std=c++11
|
||||
LDFLAGS =
|
||||
|
||||
# OS specific
|
||||
@ -45,62 +45,16 @@ ifeq ($(UNAME_S),FreeBSD)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),Haiku)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
|
||||
# Architecture specific
|
||||
# TODO: probably these flags need to be tweaked on some architectures
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
ifeq ($(UNAME_M),x86_64)
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CFLAGS += -mfma -mf16c
|
||||
AVX1_M := $(shell sysctl machdep.cpu.features)
|
||||
ifneq (,$(findstring AVX1.0,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
|
||||
ifneq (,$(findstring AVX2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
else ifeq ($(UNAME_S),Linux)
|
||||
AVX1_M := $(shell grep "avx " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
FMA_M := $(shell grep "fma " /proc/cpuinfo)
|
||||
ifneq (,$(findstring fma,$(FMA_M)))
|
||||
CFLAGS += -mfma
|
||||
endif
|
||||
F16C_M := $(shell grep "f16c " /proc/cpuinfo)
|
||||
ifneq (,$(findstring f16c,$(F16C_M)))
|
||||
CFLAGS += -mf16c
|
||||
endif
|
||||
else ifeq ($(UNAME_S),Haiku)
|
||||
AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
|
||||
ifneq (,$(findstring avx,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ")
|
||||
ifneq (,$(findstring avx2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
FMA_M := $(shell sysinfo -cpu | grep "FMA ")
|
||||
ifneq (,$(findstring fma,$(FMA_M)))
|
||||
CFLAGS += -mfma
|
||||
endif
|
||||
F16C_M := $(shell sysinfo -cpu | grep "F16C ")
|
||||
ifneq (,$(findstring f16c,$(F16C_M)))
|
||||
CFLAGS += -mf16c
|
||||
endif
|
||||
else
|
||||
CFLAGS += -mfma -mf16c -mavx -mavx2
|
||||
endif
|
||||
# AVX 512
|
||||
CFLAGS += -mavx512f -mfma -mf16c
|
||||
|
||||
# AVX 256
|
||||
#CFLAGS += -mavx -mavx2 -mfma -mf16c
|
||||
endif
|
||||
ifeq ($(UNAME_M),amd64)
|
||||
CFLAGS += -mavx -mavx2 -mfma -mf16c
|
||||
@ -112,14 +66,6 @@ ifndef WHISPER_NO_ACCELERATE
|
||||
LDFLAGS += -framework Accelerate
|
||||
endif
|
||||
endif
|
||||
ifdef WHISPER_OPENBLAS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
|
||||
LDFLAGS += -lopenblas
|
||||
endif
|
||||
ifdef WHISPER_GPROF
|
||||
CFLAGS += -pg
|
||||
CXXFLAGS += -pg
|
||||
endif
|
||||
ifneq ($(filter aarch64%,$(UNAME_M)),)
|
||||
endif
|
||||
ifneq ($(filter armv6%,$(UNAME_M)),)
|
||||
@ -135,11 +81,13 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
|
||||
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||
endif
|
||||
|
||||
default: main
|
||||
#
|
||||
# Build library + main
|
||||
#
|
||||
|
||||
#
|
||||
# Build library
|
||||
#
|
||||
main: examples/main/main.cpp ggml.o whisper.o
|
||||
$(CXX) $(CXXFLAGS) examples/main/main.cpp whisper.o ggml.o -o main $(LDFLAGS)
|
||||
./main -h
|
||||
|
||||
ggml.o: ggml.c ggml.h
|
||||
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
|
||||
@ -150,11 +98,8 @@ whisper.o: whisper.cpp whisper.h
|
||||
libwhisper.a: ggml.o whisper.o
|
||||
$(AR) rcs libwhisper.a ggml.o whisper.o
|
||||
|
||||
libwhisper.so: ggml.o whisper.o
|
||||
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
|
||||
|
||||
clean:
|
||||
rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
|
||||
rm -f *.o main stream bench libwhisper.a
|
||||
|
||||
#
|
||||
# Examples
|
||||
@ -162,19 +107,9 @@ clean:
|
||||
|
||||
CC_SDL=`sdl2-config --cflags --libs`
|
||||
|
||||
main: examples/main/main.cpp ggml.o whisper.o
|
||||
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o whisper.o -o main $(LDFLAGS)
|
||||
./main -h
|
||||
|
||||
stream: examples/stream/stream.cpp ggml.o whisper.o
|
||||
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
|
||||
|
||||
command: examples/command/command.cpp ggml.o whisper.o
|
||||
$(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
|
||||
|
||||
talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp ggml.o whisper.o
|
||||
$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
|
||||
|
||||
bench: examples/bench/bench.cpp ggml.o whisper.o
|
||||
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
|
||||
|
||||
@ -213,10 +148,9 @@ samples:
|
||||
.PHONY: small
|
||||
.PHONY: medium.en
|
||||
.PHONY: medium
|
||||
.PHONY: large-v1
|
||||
.PHONY: large
|
||||
|
||||
tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
|
||||
tiny.en tiny base.en base small.en small medium.en medium large: main
|
||||
bash ./models/download-ggml-model.sh $@
|
||||
@echo ""
|
||||
@echo "==============================================="
|
||||
@ -225,17 +159,9 @@ tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
|
||||
@echo ""
|
||||
@for f in samples/*.wav; do \
|
||||
echo "----------------------------------------------" ; \
|
||||
echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
|
||||
echo "[+] Running base.en on $$f ... (run 'ffplay $$f' to listen)" ; \
|
||||
echo "----------------------------------------------" ; \
|
||||
echo "" ; \
|
||||
./main -m models/ggml-$@.bin -f $$f ; \
|
||||
echo "" ; \
|
||||
done
|
||||
|
||||
#
|
||||
# Tests
|
||||
#
|
||||
|
||||
.PHONY: tests
|
||||
tests:
|
||||
bash ./tests/run-tests.sh
|
||||
|
README.md — 142 changed lines

@ -2,9 +2,6 @@
[CI badge](https://github.com/ggerganov/whisper.cpp/actions)
[MIT license badge](https://opensource.org/licenses/MIT)
[npm badge](https://www.npmjs.com/package/whisper.cpp/)

[Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@ -21,11 +18,11 @@ Supported platforms:

- [x] Mac OS (Intel and Arm)
- [x] [iOS](examples/whisper.objc)
- [x] [Android](examples/whisper.android)
- [x] Linux / [FreeBSD](https://github.com/ggerganov/whisper.cpp/issues/56#issuecomment-1350920264)
- [x] Linux
- [x] [WebAssembly](examples/whisper.wasm)
- [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168))
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
- [x] [Windows (MSVC and MinGW)](https://github.com/ggerganov/whisper.cpp/issues/5)
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/issues/7)
- [x] [Android](https://github.com/ggerganov/whisper.cpp/issues/30)

The entire implementation of the model is contained in 2 source files:

@ -33,16 +30,10 @@ The entire implementation of the model is contained in 2 source files:
- Transformer inference: [whisper.h](whisper.h) / [whisper.cpp](whisper.cpp)

Having such a lightweight implementation of the model makes it easy to integrate in different platforms and applications.
As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device: [whisper.objc](examples/whisper.objc)
As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device:

https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4

You can also easily make your own offline voice assistant application: [command](examples/command)

https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4

Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)

## Implementation details

- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))

@ -55,6 +46,21 @@ The tensor operators are optimized heavily for Apple silicon CPUs. Depending on
intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.

## Limitations

- Inference only
- No GPU support
- Very basic greedy sampling scheme - always pick the token with the highest probability.
  This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
  from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
  to run the python code with the following parameters:

  ```
  whisper --best_of None --beam_size None ...
  ```

In the future, `whisper.cpp` will support more sampling strategies.
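The greedy scheme described in the limitations above amounts to an argmax over the token probabilities at every decoding step. A minimal sketch of that selection rule (illustrative only; the actual decoder in whisper.cpp operates on its internal token/probability structures):

```cpp
// Illustrative only - a greedy "sampler" simply picks the most probable token.
#include <cstddef>
#include <vector>

int greedy_sample(const std::vector<float> & probs) {
    int best_id = 0;
    for (size_t i = 1; i < probs.size(); ++i) {
        if (probs[i] > probs[best_id]) {
            best_id = (int) i;
        }
    }
    return best_id; // token id with the highest probability
}
```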

## Quick start

First, download one of the Whisper models converted in [ggml format](models). For example:

@ -88,27 +94,27 @@ c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o gg
usage: ./main [options] file0.wav file1.wav ...

options:
  -h,       --help           [default] show this help message and exit
  -t N,     --threads N      [4      ] number of threads to use during computation
  -p N,     --processors N   [1      ] number of processors to use during computation
  -ot N,    --offset-t N     [0      ] time offset in milliseconds
  -on N,    --offset-n N     [0      ] segment index offset
  -d  N,    --duration N     [0      ] duration of audio to process in milliseconds
  -mc N,    --max-context N  [-1     ] maximum number of text context tokens to store
  -ml N,    --max-len N      [0      ] maximum segment length in characters
  -wt N,    --word-thold N   [0.01   ] word timestamp probability threshold
  -su,      --speed-up       [false  ] speed up audio by x2 (reduced accuracy)
  -tr,      --translate      [false  ] translate from source language to english
  -otxt,    --output-txt     [false  ] output result in a text file
  -ovtt,    --output-vtt     [false  ] output result in a vtt file
  -osrt,    --output-srt     [false  ] output result in a srt file
  -owts,    --output-words   [false  ] output script for generating karaoke video
  -ps,      --print-special  [false  ] print special tokens
  -pc,      --print-colors   [false  ] print colors
  -nt,      --no-timestamps  [true   ] do not print timestamps
  -l LANG,  --language LANG  [en     ] spoken language
  -m FNAME, --model FNAME    [models/ggml-base.en.bin] model path
  -f FNAME, --file FNAME     [       ] input WAV file path
  -h,       --help           show this help message and exit
  -s SEED,  --seed SEED      RNG seed (default: -1)
  -t N,     --threads N      number of threads to use during computation (default: 4)
  -p N,     --processors N   number of processors to use during computation (default: 1)
  -ot N,    --offset-t N     time offset in milliseconds (default: 0)
  -on N,    --offset-n N     segment index offset (default: 0)
  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)
  -ml N,    --max-len N      maximum segment length in characters (default: 0)
  -wt N,    --word-thold N   word timestamp probability threshold (default: 0.010000)
  -v,       --verbose        verbose output
            --translate      translate from source language to english
  -otxt,    --output-txt     output result in a text file
  -ovtt,    --output-vtt     output result in a vtt file
  -osrt,    --output-srt     output result in a srt file
  -owts,    --output-words   output script for generating karaoke video
  -ps,      --print_special  print special tokens
  -pc,      --print_colors   print colors
  -nt,      --no_timestamps  do not print timestamps
  -l LANG,  --language LANG  spoken language (default: en)
  -m FNAME, --model FNAME    model path (default: models/ggml-base.en.bin)
  -f FNAME, --file FNAME     input WAV file path

bash ./models/download-ggml-model.sh base.en
Downloading ggml model base.en ...

@ -140,13 +146,13 @@ whisper_model_load: n_text_layer = 6
whisper_model_load: n_mels        = 80
whisper_model_load: f16           = 1
whisper_model_load: type          = 2
whisper_model_load: mem_required  = 670.00 MB
whisper_model_load: adding 1607 extra tokens
whisper_model_load: mem_required  = 506.00 MB
whisper_model_load: ggml ctx size = 140.60 MB
whisper_model_load: memory size   = 22.83 MB
whisper_model_load: model size    = 140.54 MB
whisper_model_load: ggml ctx size = 140.60 MB
whisper_model_load: memory size   = 22.83 MB
whisper_model_load: model size    = 140.54 MB

system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |

main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...

@ -194,7 +200,6 @@ make small.en
make small
make medium.en
make medium
make large-v1
make large
```

@ -206,22 +211,7 @@ make large
| base   | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| small  | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large  | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |

## Limitations

- Inference only
- No GPU support
- Very basic greedy sampling scheme - always pick the token with the highest probability.
  This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
  from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
  to run the python code with the following parameters:

  ```
  whisper --best_of None --beam_size None ...
  ```

In the future, `whisper.cpp` will support more sampling strategies.
| large  | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |

## Another example

@ -438,44 +428,18 @@ The original models are converted to a custom binary format. This allows to pack
- vocabulary
- weights

You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
or manually from here:
You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script or from here:

- https://huggingface.co/datasets/ggerganov/whisper.cpp
- https://ggml.ggerganov.com
https://ggml.ggerganov.com

For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
in [models](models).
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README in [models](models).

## Bindings

- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs)
- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm)
- [X] Javascript: [bindings/javascript](bindings/javascript)
- [ ] Python: soon
- [ ] Python:
- [ ] Java:

## Examples

There are various examples of using the library for different projects in the [examples](examples) folder.
Some of the examples are even ported to run in the browser using WebAssembly. Check them out!

| Example | Web | Description |
| --- | --- | --- |
| [main](examples/main) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
| [bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine |
| [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
| [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
| [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |

## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)

If you have any kind of feedback about this project feel free to use the Discussions section and open a new topic.
You can use the [Show and tell](https://github.com/ggerganov/whisper.cpp/discussions/categories/show-and-tell) category
to share your own projects that use `whisper.cpp`. If you have a question, make sure to check the
[Frequently asked questions (#126)](https://github.com/ggerganov/whisper.cpp/discussions/126) discussion.
There are various examples of using the library for different projects in the [examples](examples) folder. Check them out!
@ -1,19 +1,3 @@
if (EMSCRIPTEN)
    add_subdirectory(javascript)

    add_custom_command(
        OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/javascript/publish.log
        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/javascript/whisper.js
        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/javascript/libwhisper.worker.js
        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/javascript/package.json
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/javascript
        COMMAND npm publish
        COMMAND touch publish.log
        COMMENT "Publishing npm module v${PROJECT_VERSION}"
        VERBATIM
        )

    add_custom_target(publish-npm
        DEPENDS javascript/publish.log
        )
endif()
Submodule bindings/ios deleted from 1502317fe0
@ -9,33 +9,25 @@ target_link_libraries(${TARGET} PRIVATE
    )

unset(EXTRA_FLAGS)

if (WHISPER_WASM_SINGLE_FILE)
    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
    message(STATUS "Embedding WASM inside whisper.js")

    add_custom_command(
        TARGET ${TARGET} POST_BUILD
        TARGET libwhisper POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy
        ${CMAKE_BINARY_DIR}/bin/libwhisper.js
        ${CMAKE_CURRENT_SOURCE_DIR}/whisper.js
        )

    add_custom_command(
        TARGET ${TARGET} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy
        ${CMAKE_BINARY_DIR}/bin/libwhisper.worker.js
        ${CMAKE_CURRENT_SOURCE_DIR}/libwhisper.worker.js
        )
endif()

set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s MODULARIZE=1 \
    -s EXPORT_NAME=\"'whisper_factory'\" \
    -s FORCE_FILESYSTEM=1 \
    -s USE_PTHREADS=1 \
    -s PTHREAD_POOL_SIZE=8 \
    -s ALLOW_MEMORY_GROWTH=1 \
    -s INITIAL_MEMORY=1610612736 \
    -s TOTAL_MEMORY=1610612736 \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
    ")
@ -1,78 +0,0 @@
# whisper.cpp

Node.js package for Whisper speech recognition

Package: https://www.npmjs.com/package/whisper.cpp

## Details

The performance is comparable to when running `whisper.cpp` in the browser via WASM.

The API is currently very rudimentary: [bindings/javascript/emscripten.cpp](/bindings/javascript/emscripten.cpp)

For sample usage check [tests/test-whisper.js](/tests/test-whisper.js)

## Package building + test

```bash
# load emscripten
source /path/to/emsdk/emsdk_env.sh

# clone repo
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp

# grab base.en model
./models/download-ggml-model.sh base.en

# prepare PCM sample for testing
ffmpeg -i samples/jfk.wav -f f32le -acodec pcm_f32le samples/jfk.pcmf32

# build
mkdir build-em && cd build-em
emcmake cmake .. && make -j

# run test
node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js

# publish npm package
make publish-npm
```

## Sample run

```java
$ node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js

whisper_model_load: loading model from 'whisper.bin'
whisper_model_load: n_vocab       = 51864
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 512
whisper_model_load: n_audio_head  = 8
whisper_model_load: n_audio_layer = 6
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 512
whisper_model_load: n_text_head   = 8
whisper_model_load: n_text_layer  = 6
whisper_model_load: n_mels        = 80
whisper_model_load: f16           = 1
whisper_model_load: type          = 2
whisper_model_load: adding 1607 extra tokens
whisper_model_load: mem_required  = 506.00 MB
whisper_model_load: ggml ctx size = 140.60 MB
whisper_model_load: memory size   = 22.83 MB
whisper_model_load: model size    = 140.54 MB

system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 |

operator(): processing 176000 samples, 11.0 sec, 8 threads, 1 processors, lang = en, task = transcribe ...

[00:00:00.000 --> 00:00:11.000]  And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.

whisper_print_timings:     load time = 162.37 ms
whisper_print_timings:      mel time = 183.70 ms
whisper_print_timings:   sample time = 4.27 ms
whisper_print_timings:   encode time = 8582.63 ms / 1430.44 ms per layer
whisper_print_timings:   decode time = 436.16 ms / 72.69 ms per layer
whisper_print_timings:    total time = 9370.90 ms
```
@ -1,58 +1,59 @@
//
// This is the Javascript API of whisper.cpp
//
// Very crude at the moment.
// Feel free to contribute and make this better!
//
// See the tests/test-whisper.js for sample usage
//

#include "whisper.h"

#include <emscripten.h>
#include <emscripten/bind.h>

#include <thread>
#include <vector>
#include <thread>

struct whisper_context * g_context;
std::vector<struct whisper_context *> g_contexts(4, nullptr);

EMSCRIPTEN_BINDINGS(whisper) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        if (g_context == nullptr) {
            g_context = whisper_init(path_model.c_str());
            if (g_context != nullptr) {
                return true;
            } else {
                return false;
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
                g_contexts[i] = whisper_init(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    return i + 1;
                } else {
                    return (size_t) 0;
                }
            }
        }

        return false;
        return (size_t) 0;
    }));

    emscripten::function("free", emscripten::optional_override([]() {
        if (g_context) {
            whisper_free(g_context);
            g_context = nullptr;
    emscripten::function("free", emscripten::optional_override([](size_t index) {
        --index;

        if (index < g_contexts.size()) {
            whisper_free(g_contexts[index]);
            g_contexts[index] = nullptr;
        }
    }));

    emscripten::function("full_default", emscripten::optional_override([](const emscripten::val & audio, const std::string & lang, bool translate) {
        if (g_context == nullptr) {
    emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
        --index;

        if (index >= g_contexts.size()) {
            return -1;
        }

        if (g_contexts[index] == nullptr) {
            return -2;
        }

        struct whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);

        params.print_realtime = true;
        params.print_progress = false;
        params.print_timestamps = true;
        params.print_special = false;
        params.translate = translate;
        params.language = whisper_is_multilingual(g_context) ? lang.c_str() : "en";
        params.n_threads = std::min(8, (int) std::thread::hardware_concurrency());
        params.offset_ms = 0;
        params.print_realtime = true;
        params.print_progress = false;
        params.print_timestamps = true;
        params.print_special_tokens = false;
        params.translate = translate;
        params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
        params.n_threads = std::min(8, (int) std::thread::hardware_concurrency());
        params.offset_ms = 0;

        std::vector<float> pcmf32;
        const int n = audio["length"].as<int>();

@ -67,11 +68,9 @@ EMSCRIPTEN_BINDINGS(whisper) {

        // print system information
        {
            printf("\n");
            printf("system_info: n_threads = %d / %d | %s\n",
                params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());

            printf("\n");
            printf("%s: processing %d samples, %.1f sec, %d threads, %d processors, lang = %s, task = %s ...\n",
                __func__, int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
                params.n_threads, 1,

@ -81,13 +80,10 @@ EMSCRIPTEN_BINDINGS(whisper) {
            printf("\n");
        }

        // run whisper
        {
            whisper_reset_timings(g_context);
            whisper_full(g_context, params, pcmf32.data(), pcmf32.size());
            whisper_print_timings(g_context);
        }
        int ret = whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size());

        return 0;
        whisper_print_timings(g_contexts[index]);

        return ret;
    }));
}
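The change above replaces the single global `g_context` with a fixed pool `g_contexts` and makes `init` hand out 1-based integer handles to JavaScript (0 signals failure), which `free` and `full_default` translate back to a pool index. A minimal sketch of the same handle-pool pattern in plain C++, independent of Emscripten; the `Resource`/`pool_acquire`/`pool_release` names are made up for illustration:

```cpp
// Illustrative handle pool - mirrors the pattern used in emscripten.cpp above:
// slot index + 1 is handed out as the handle, 0 means "failed / invalid".
#include <array>
#include <cstddef>

struct Resource { /* ... */ };

static std::array<Resource *, 4> g_pool{}; // all slots start out as nullptr

size_t pool_acquire() {
    for (size_t i = 0; i < g_pool.size(); ++i) {
        if (g_pool[i] == nullptr) {
            g_pool[i] = new Resource();
            return i + 1;          // 1-based handle
        }
    }
    return 0;                      // pool exhausted
}

void pool_release(size_t handle) {
    --handle;                      // back to a 0-based index
    if (handle < g_pool.size() && g_pool[handle] != nullptr) {
        delete g_pool[handle];
        g_pool[handle] = nullptr;
    }
}
```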
@ -1 +0,0 @@
"use strict";var Module={};var ENVIRONMENT_IS_NODE=typeof process=="object"&&typeof process.versions=="object"&&typeof process.versions.node=="string";if(ENVIRONMENT_IS_NODE){var nodeWorkerThreads=require("worker_threads");var parentPort=nodeWorkerThreads.parentPort;parentPort.on("message",data=>onmessage({data:data}));var fs=require("fs");Object.assign(global,{self:global,require:require,Module:Module,location:{href:__filename},Worker:nodeWorkerThreads.Worker,importScripts:function(f){(0,eval)(fs.readFileSync(f,"utf8")+"//# sourceURL="+f)},postMessage:function(msg){parentPort.postMessage(msg)},performance:global.performance||{now:function(){return Date.now()}}})}var initializedJS=false;var pendingNotifiedProxyingQueues=[];function threadPrintErr(){var text=Array.prototype.slice.call(arguments).join(" ");if(ENVIRONMENT_IS_NODE){fs.writeSync(2,text+"\n");return}console.error(text)}function threadAlert(){var text=Array.prototype.slice.call(arguments).join(" ");postMessage({cmd:"alert",text:text,threadId:Module["_pthread_self"]()})}var err=threadPrintErr;self.alert=threadAlert;Module["instantiateWasm"]=(info,receiveInstance)=>{var instance=new WebAssembly.Instance(Module["wasmModule"],info);receiveInstance(instance);Module["wasmModule"]=null;return instance.exports};self.onunhandledrejection=e=>{throw e.reason??e};self.onmessage=e=>{try{if(e.data.cmd==="load"){Module["wasmModule"]=e.data.wasmModule;for(const handler of e.data.handlers){Module[handler]=function(){postMessage({cmd:"callHandler",handler:handler,args:[...arguments]})}}Module["wasmMemory"]=e.data.wasmMemory;Module["buffer"]=Module["wasmMemory"].buffer;Module["ENVIRONMENT_IS_PTHREAD"]=true;if(typeof e.data.urlOrBlob=="string"){importScripts(e.data.urlOrBlob)}else{var objectUrl=URL.createObjectURL(e.data.urlOrBlob);importScripts(objectUrl);URL.revokeObjectURL(objectUrl)}whisper_factory(Module).then(function(instance){Module=instance})}else if(e.data.cmd==="run"){Module["__performance_now_clock_drift"]=performance.now()-e.data.time;Module["__emscripten_thread_init"](e.data.pthread_ptr,0,0,1);Module["establishStackSpace"]();Module["PThread"].receiveObjectTransfer(e.data);Module["PThread"].threadInitTLS();if(!initializedJS){Module["__embind_initialize_bindings"]();pendingNotifiedProxyingQueues.forEach(queue=>{Module["executeNotifiedProxyingQueue"](queue)});pendingNotifiedProxyingQueues=[];initializedJS=true}try{Module["invokeEntryPoint"](e.data.start_routine,e.data.arg)}catch(ex){if(ex!="unwind"){if(ex instanceof Module["ExitStatus"]){if(Module["keepRuntimeAlive"]()){}else{Module["__emscripten_thread_exit"](ex.status)}}else{throw ex}}}}else if(e.data.cmd==="cancel"){if(Module["_pthread_self"]()){Module["__emscripten_thread_exit"](-1)}}else if(e.data.target==="setimmediate"){}else if(e.data.cmd==="processProxyingQueue"){if(initializedJS){Module["executeNotifiedProxyingQueue"](e.data.queue)}else{pendingNotifiedProxyingQueues.push(e.data.queue)}}else if(e.data.cmd){err("worker.js received unknown command "+e.data.cmd);err(e.data)}}catch(ex){if(Module["__emscripten_thread_crashed"]){Module["__emscripten_thread_crashed"]()}throw ex}};
@ -1,26 +0,0 @@
{
    "name": "whisper.cpp",
    "version": "@PROJECT_VERSION@",
    "description": "Whisper speech recognition",
    "main": "whisper.js",
    "scripts": {
        "test": "echo \"todo: add tests\" && exit 0"
    },
    "repository": {
        "type": "git",
        "url": "git+https://github.com/ggerganov/whisper.cpp"
    },
    "keywords": [
        "openai",
        "whisper",
        "speech-to-text",
        "speech-recognition",
        "transformer"
    ],
    "author": "Georgi Gerganov",
    "license": "MIT",
    "bugs": {
        "url": "https://github.com/ggerganov/whisper.cpp/issues"
    },
    "homepage": "https://github.com/ggerganov/whisper.cpp#readme"
}
@ -1,26 +0,0 @@
{
    "name": "whisper.cpp",
    "version": "1.0.4",
    "description": "Whisper speech recognition",
    "main": "whisper.js",
    "scripts": {
        "test": "echo \"todo: add tests\" && exit 0"
    },
    "repository": {
        "type": "git",
        "url": "git+https://github.com/ggerganov/whisper.cpp"
    },
    "keywords": [
        "openai",
        "whisper",
        "speech-to-text",
        "speech-recognition",
        "transformer"
    ],
    "author": "Georgi Gerganov",
    "license": "MIT",
    "bugs": {
        "url": "https://github.com/ggerganov/whisper.cpp/issues"
    },
    "homepage": "https://github.com/ggerganov/whisper.cpp#readme"
}
File diff suppressed because one or more lines are too long
@ -20,14 +20,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

if (EMSCRIPTEN)
    add_subdirectory(whisper.wasm)
    add_subdirectory(stream.wasm)
    add_subdirectory(command.wasm)
    add_subdirectory(talk.wasm)
    add_subdirectory(bench.wasm)
else()
    add_subdirectory(main)
    add_subdirectory(stream)
    add_subdirectory(command)
    add_subdirectory(bench)
    add_subdirectory(talk)
endif()
@ -1,47 +0,0 @@
|
||||
#
|
||||
# libbench
|
||||
#
|
||||
|
||||
set(TARGET libbench)
|
||||
|
||||
add_executable(${TARGET}
|
||||
emscripten.cpp
|
||||
)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
whisper
|
||||
)
|
||||
|
||||
unset(EXTRA_FLAGS)
|
||||
|
||||
if (WHISPER_WASM_SINGLE_FILE)
|
||||
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
|
||||
message(STATUS "Embedding WASM inside bench.js")
|
||||
|
||||
add_custom_command(
|
||||
TARGET ${TARGET} POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
${CMAKE_BINARY_DIR}/bin/libbench.js
|
||||
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/bench.wasm/bench.js
|
||||
)
|
||||
endif()
|
||||
|
||||
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
||||
--bind \
|
||||
-s USE_PTHREADS=1 \
|
||||
-s PTHREAD_POOL_SIZE=8 \
|
||||
-s INITIAL_MEMORY=1024MB \
|
||||
-s TOTAL_MEMORY=1024MB \
|
||||
-s FORCE_FILESYSTEM=1 \
|
||||
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
||||
${EXTRA_FLAGS} \
|
||||
")
|
||||
|
||||
#
|
||||
# bench.wasm
|
||||
#
|
||||
|
||||
set(TARGET bench.wasm)
|
||||
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
|
@ -1,22 +0,0 @@
# bench.wasm

Benchmark the performance of whisper.cpp in the browser using WebAssembly

Link: https://whisper.ggerganov.com/bench/

Terminal version: [examples/bench](/examples/bench)

## Build instructions

```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j

# copy the produced page to your HTTP path
cp bin/bench.wasm/*          /path/to/html/
cp bin/libbench.worker.js    /path/to/html/
```
@ -1,80 +0,0 @@
|
||||
#include "whisper.h"
|
||||
|
||||
#include <emscripten.h>
|
||||
#include <emscripten/bind.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
constexpr int N_THREAD = 8;
|
||||
|
||||
// TODO: get rid of this vector of contexts - bad idea in the first place
|
||||
std::vector<struct whisper_context *> g_contexts(4, nullptr);
|
||||
|
||||
std::thread g_worker;
|
||||
|
||||
void bench_main(size_t index) {
|
||||
const int n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
|
||||
|
||||
// whisper context
|
||||
auto & ctx = g_contexts[index];
|
||||
|
||||
fprintf(stderr, "%s: running benchmark with %d threads - please wait...\n", __func__, n_threads);
|
||||
|
||||
if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
|
||||
fprintf(stderr, "error: failed to set mel: %d\n", ret);
|
||||
return;
|
||||
}
|
||||
|
||||
if (int ret = whisper_encode(ctx, 0, n_threads) != 0) {
|
||||
fprintf(stderr, "error: failed to encode model: %d\n", ret);
|
||||
return;
|
||||
}
|
||||
|
||||
whisper_print_timings(ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "If you wish, you can submit these results here:\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " https://github.com/ggerganov/whisper.cpp/issues/89\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Please include the following information:\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " - CPU model\n");
|
||||
fprintf(stderr, " - Operating system\n");
|
||||
fprintf(stderr, " - Browser\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
EMSCRIPTEN_BINDINGS(bench) {
|
||||
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
||||
for (size_t i = 0; i < g_contexts.size(); ++i) {
|
||||
if (g_contexts[i] == nullptr) {
|
||||
g_contexts[i] = whisper_init(path_model.c_str());
|
||||
if (g_contexts[i] != nullptr) {
|
||||
if (g_worker.joinable()) {
|
||||
g_worker.join();
|
||||
}
|
||||
g_worker = std::thread([i]() {
|
||||
bench_main(i);
|
||||
});
|
||||
|
||||
return i + 1;
|
||||
} else {
|
||||
return (size_t) 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (size_t) 0;
|
||||
}));
|
||||
|
||||
emscripten::function("free", emscripten::optional_override([](size_t index) {
|
||||
if (index < g_contexts.size()) {
|
||||
whisper_free(g_contexts[index]);
|
||||
g_contexts[index] = nullptr;
|
||||
}
|
||||
}));
|
||||
}
|
@ -1,227 +0,0 @@
|
||||
<!doctype html>
|
||||
<html lang="en-us">
|
||||
<head>
|
||||
<title>bench : Benchmark whisper.cpp performance in the browser</title>
|
||||
|
||||
<style>
|
||||
#output {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
margin: 0 auto;
|
||||
margin-top: 10px;
|
||||
border-left: 0px;
|
||||
border-right: 0px;
|
||||
padding-left: 0px;
|
||||
padding-right: 0px;
|
||||
display: block;
|
||||
background-color: black;
|
||||
color: white;
|
||||
font-size: 10px;
|
||||
font-family: 'Lucida Console', Monaco, monospace;
|
||||
outline: none;
|
||||
white-space: pre;
|
||||
overflow-wrap: normal;
|
||||
overflow-x: scroll;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="main-container">
|
||||
<b>bench : Benchmark whisper.cpp performance in the browser</b>
|
||||
|
||||
<br><br>
|
||||
|
||||
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/bench.wasm">GitHub</a>.
|
||||
|
||||
<br><br>
|
||||
|
||||
<hr>
|
||||
|
||||
Select the model you would like to use and click the "Bench" button.<br>
|
||||
The results will be displayed in the textarea below.
|
||||
|
||||
<br><br>
|
||||
|
||||
<div id="model-whisper">
|
||||
Whisper model: <span id="model-whisper-status"></span>
|
||||
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
||||
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
||||
<span id="fetch-whisper-progress"></span>
|
||||
|
||||
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
||||
</div>
|
||||
|
||||
<br>
|
||||
|
||||
<div id="input">
|
||||
<button id="bench" onclick="onBench()" disabled>Bench</button>
|
||||
<button id="clear" onclick="clearCache()">Clear Cache</button>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
|
||||
Debug output:
|
||||
<textarea id="output" rows="20"></textarea>
|
||||
|
||||
<br>
|
||||
|
||||
<b>Troubleshooting</b>
|
||||
|
||||
<br><br>
|
||||
|
||||
The page does some heavy computations, so make sure:
|
||||
|
||||
<ul>
|
||||
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
|
||||
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
|
||||
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
|
||||
</ul>
|
||||
|
||||
<div class="cell-version">
|
||||
<span>
|
||||
|
|
||||
Build time: <span class="nav-link">@GIT_DATE@</span> |
|
||||
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
|
||||
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
|
||||
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/bench.wasm">Source Code</a> |
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script type="text/javascript" src="helpers.js"></script>
|
||||
<script type='text/javascript'>
|
||||
// the bench instance
|
||||
var instance = null;
|
||||
|
||||
// model name
|
||||
var model_whisper = null;
|
||||
|
||||
var Module = {
|
||||
print: printTextarea,
|
||||
printErr: printTextarea,
|
||||
setStatus: function(text) {
|
||||
printTextarea('js: ' + text);
|
||||
},
|
||||
monitorRunDependencies: function(left) {
|
||||
},
|
||||
preRun: function() {
|
||||
printTextarea('js: Preparing ...');
|
||||
},
|
||||
postRun: function() {
|
||||
printTextarea('js: Initialized successfully!');
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// fetch models
|
||||
//
|
||||
|
||||
let dbVersion = 1
|
||||
let dbName = 'whisper.ggerganov.com';
|
||||
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
|
||||
|
||||
function storeFS(fname, buf) {
|
||||
// write to WASM file using FS_createDataFile
|
||||
// if the file exists, delete it
|
||||
try {
|
||||
Module.FS_unlink(fname);
|
||||
} catch (e) {
|
||||
// ignore
|
||||
}
|
||||
|
||||
Module.FS_createDataFile("/", fname, buf, true, true);
|
||||
|
||||
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
|
||||
|
||||
model_whisper = fname;
|
||||
|
||||
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
|
||||
|
||||
if (model_whisper != null) {
|
||||
document.getElementById('bench').disabled = false;
|
||||
}
|
||||
}
|
||||
|
||||
function loadFile(event, fname) {
|
||||
var file = event.target.files[0] || null;
|
||||
if (file == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
printTextarea("loadFile: loading model: " + file.name + ", size: " + file.size + " bytes");
|
||||
printTextarea('loadFile: please wait ...');
|
||||
|
||||
var reader = new FileReader();
|
||||
reader.onload = function(event) {
|
||||
var buf = new Uint8Array(reader.result);
|
||||
storeFS(fname, buf);
|
||||
}
|
||||
reader.readAsArrayBuffer(file);
|
||||
|
||||
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
||||
document.getElementById('whisper-file' ).style.display = 'none';
|
||||
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
|
||||
}
|
||||
|
||||
function loadWhisper(model) {
|
||||
let urls = {
|
||||
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
|
||||
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
|
||||
};
|
||||
|
||||
let sizes = {
|
||||
'tiny.en': 75,
|
||||
'base.en': 142,
|
||||
};
|
||||
|
||||
let url = urls[model];
|
||||
let dst = 'whisper.bin';
|
||||
let size_mb = sizes[model];
|
||||
|
||||
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
||||
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
|
||||
|
||||
cbProgress = function(p) {
|
||||
let el = document.getElementById('fetch-whisper-progress');
|
||||
el.innerHTML = Math.round(100*p) + '%';
|
||||
};
|
||||
|
||||
cbCancel = function() {
|
||||
var el;
|
||||
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
|
||||
};
|
||||
|
||||
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
|
||||
}
|
||||
|
||||
//
|
||||
// main
|
||||
//
|
||||
|
||||
function onBench() {
|
||||
if (instance) {
|
||||
Module.free(instance);
|
||||
}
|
||||
|
||||
instance = Module.init('whisper.bin');
|
||||
|
||||
if (instance) {
|
||||
printTextarea("js: whisper initialized, instance: " + instance);
|
||||
}
|
||||
|
||||
document.getElementById('bench').disabled = true;
|
||||
|
||||
if (!instance) {
|
||||
printTextarea("js: failed to initialize whisper");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
</script>
|
||||
<script type="text/javascript" src="bench.js"></script>
|
||||
</body>
|
||||
</html>
|
@ -1,8 +1,6 @@
# bench

A very basic tool for benchmarking the inference performance on your device. The tool simply runs the Encoder part of
the transformer on some random audio data and records the execution time. This way we can have an objective comparison
of the performance of the model for various setups.
A very basic tool for benchmarking the inference performance on your device. The tool simply runs the Encoder part of the transformer on some random audio data and records the execution time. This way we can have an objective comparison of the performance of the model for various setups.

Benchmark results are tracked in the following Github issue: https://github.com/ggerganov/whisper.cpp/issues/89
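As the bench README above describes, the measurement itself boils down to: load a model, feed an empty mel spectrogram, time the Encoder. A minimal sketch of that flow using only `whisper.h` calls that appear elsewhere in this diff (`whisper_init`, `whisper_set_mel`, `whisper_encode`, `whisper_print_timings`); the model path and thread count are placeholders:

```cpp
// Minimal sketch of the benchmark idea - not the actual bench.cpp source.
#include "whisper.h"

#include <cstdio>

int main() {
    struct whisper_context * ctx = whisper_init("models/ggml-base.en.bin");
    if (ctx == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // an empty mel spectrogram is enough - only the Encoder's speed is measured
    if (whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL) != 0) {
        fprintf(stderr, "failed to set mel\n");
        return 1;
    }

    if (whisper_encode(ctx, 0, 4) != 0) { // offset 0, 4 threads
        fprintf(stderr, "failed to encode\n");
        return 1;
    }

    whisper_print_timings(ctx);
    whisper_free(ctx);
    return 0;
}
```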
@ -6,9 +6,9 @@
|
||||
|
||||
// command-line parameters
|
||||
struct whisper_params {
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
};
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
||||
@ -17,13 +17,14 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
if (arg == "-t" || arg == "--threads") {
|
||||
params.n_threads = std::stoi(argv[++i]);
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
params.model = argv[++i];
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else {
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
@ -38,9 +39,9 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
|
@ -1,47 +0,0 @@
#
# libcommand
#

set(TARGET libcommand)

add_executable(${TARGET}
    emscripten.cpp
    )

target_link_libraries(${TARGET} PRIVATE
    whisper
    )

unset(EXTRA_FLAGS)

if (WHISPER_WASM_SINGLE_FILE)
    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
    message(STATUS "Embedding WASM inside command.js")

    add_custom_command(
        TARGET ${TARGET} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy
        ${CMAKE_BINARY_DIR}/bin/libcommand.js
        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/command.wasm/command.js
        )
endif()

set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
    -s PTHREAD_POOL_SIZE=8 \
    -s INITIAL_MEMORY=1024MB \
    -s TOTAL_MEMORY=1024MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
    ")

#
# command.wasm
#

set(TARGET command.wasm)

configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
@ -1,23 +0,0 @@
# command.wasm

This is a basic Voice Assistant example that accepts voice commands from the microphone.
It runs fully in the browser via WebAssembly.

Online demo: https://whisper.ggerganov.com/command/

Terminal version: [examples/command](/examples/command)

## Build instructions

```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j

# copy the produced page to your HTTP path
cp bin/command.wasm/* /path/to/html/
cp bin/libcommand.worker.js /path/to/html/
```
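To try the copied page locally, any static HTTP server will do — for example (a sketch, assuming Python 3 is installed; depending on the browser, the multi-threaded WASM build may additionally require the page to be served with cross-origin isolation headers):

```bash
cd /path/to/html
python3 -m http.server 8000
# then open http://localhost:8000 in the browser
```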
@ -1,408 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#include "whisper.h"
|
||||
|
||||
#include <emscripten.h>
|
||||
#include <emscripten/bind.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cmath>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <regex>
|
||||
|
||||
constexpr int N_THREAD = 8;
|
||||
|
||||
std::vector<struct whisper_context *> g_contexts(4, nullptr);
|
||||
|
||||
std::mutex g_mutex;
|
||||
std::thread g_worker;
|
||||
|
||||
std::atomic<bool> g_running(false);
|
||||
|
||||
std::string g_status = "";
|
||||
std::string g_status_forced = "";
|
||||
std::string g_transcribed = "";
|
||||
|
||||
std::vector<float> g_pcmf32;
|
||||
|
||||
static std::string trim(const std::string & s) {
|
||||
std::regex e("^\\s+|\\s+$");
|
||||
return std::regex_replace(s, e, "");
|
||||
}
|
||||
|
||||
static void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
|
||||
const float rc = 1.0f / (2.0f * M_PI * cutoff);
|
||||
const float dt = 1.0f / sample_rate;
|
||||
const float alpha = dt / (rc + dt);
|
||||
|
||||
float y = data[0];
|
||||
|
||||
for (size_t i = 1; i < data.size(); i++) {
|
||||
y = alpha * (y + data[i] - data[i - 1]);
|
||||
data[i] = y;
|
||||
}
|
||||
}
|
||||
|
||||
// compute similarity between two strings using Levenshtein distance
|
||||
static float similarity(const std::string & s0, const std::string & s1) {
|
||||
const size_t len0 = s0.size() + 1;
|
||||
const size_t len1 = s1.size() + 1;
|
||||
|
||||
std::vector<int> col(len1, 0);
|
||||
std::vector<int> prevCol(len1, 0);
|
||||
|
||||
for (size_t i = 0; i < len1; i++) {
|
||||
prevCol[i] = i;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < len0; i++) {
|
||||
col[0] = i;
|
||||
for (size_t j = 1; j < len1; j++) {
|
||||
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
|
||||
}
|
||||
col.swap(prevCol);
|
||||
}
|
||||
|
||||
const float dist = prevCol[len1 - 1];
|
||||
|
||||
return 1.0f - (dist / std::max(s0.size(), s1.size()));
|
||||
}
|
||||
|
||||
void command_set_status(const std::string & status) {
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
g_status = status;
|
||||
}
|
||||
|
||||
bool command_vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
|
||||
const int n_samples = pcmf32.size();
|
||||
const int n_samples_last = (sample_rate * last_ms) / 1000;
|
||||
|
||||
if (n_samples_last >= n_samples) {
|
||||
// not enough samples - assume no speech
|
||||
return false;
|
||||
}
|
||||
|
||||
if (freq_thold > 0.0f) {
|
||||
high_pass_filter(pcmf32, freq_thold, sample_rate);
|
||||
}
|
||||
|
||||
float energy_all = 0.0f;
|
||||
float energy_last = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < n_samples; i++) {
|
||||
energy_all += fabsf(pcmf32[i]);
|
||||
|
||||
if (i >= n_samples - n_samples_last) {
|
||||
energy_last += fabsf(pcmf32[i]);
|
||||
}
|
||||
}
|
||||
|
||||
energy_all /= n_samples;
|
||||
energy_last /= n_samples_last;
|
||||
|
||||
if (verbose) {
|
||||
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
|
||||
}
|
||||
|
||||
if (energy_last > vad_thold*energy_all) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
prob = 0.0f;
|
||||
t_ms = 0;
|
||||
|
||||
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
int prob_n = 0;
|
||||
std::string result;
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
result += text;
|
||||
|
||||
const int n_tokens = whisper_full_n_tokens(ctx, i);
|
||||
for (int j = 0; j < n_tokens; ++j) {
|
||||
const auto token = whisper_full_get_token_data(ctx, i, j);
|
||||
|
||||
prob += token.p;
|
||||
++prob_n;
|
||||
}
|
||||
}
|
||||
|
||||
if (prob_n > 0) {
|
||||
prob /= prob_n;
|
||||
}
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void command_get_audio(int ms, int sample_rate, std::vector<float> & audio) {
|
||||
const int64_t n_samples = (ms * sample_rate) / 1000;
|
||||
|
||||
int64_t n_take = 0;
|
||||
if (g_pcmf32.size() < n_samples) {
|
||||
n_take = g_pcmf32.size();
|
||||
} else {
|
||||
n_take = n_samples;
|
||||
}
|
||||
|
||||
audio.resize(n_take);
|
||||
std::copy(g_pcmf32.end() - n_take, g_pcmf32.end(), audio.begin());
|
||||
}
|
||||
|
||||
void command_main(size_t index) {
|
||||
command_set_status("loading data ...");
|
||||
|
||||
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
|
||||
wparams.offset_ms = 0;
|
||||
wparams.translate = false;
|
||||
wparams.no_context = true;
|
||||
wparams.single_segment = true;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_progress = false;
|
||||
wparams.print_timestamps = true;
|
||||
wparams.print_special = false;
|
||||
|
||||
wparams.max_tokens = 32;
|
||||
wparams.audio_ctx = 768; // partial encoder context for better performance
|
||||
|
||||
wparams.language = "en";
|
||||
|
||||
printf("command: using %d threads\n", wparams.n_threads);
|
||||
|
||||
bool is_running = true;
|
||||
bool have_prompt = false;
|
||||
bool ask_prompt = true;
|
||||
bool print_energy = false;
|
||||
|
||||
float prob0 = 0.0f;
|
||||
float prob = 0.0f;
|
||||
|
||||
std::vector<float> pcmf32_cur;
|
||||
std::vector<float> pcmf32_prompt;
|
||||
|
||||
const std::string k_prompt = "Ok Whisper, start listening for commands.";
|
||||
|
||||
// whisper context
|
||||
auto & ctx = g_contexts[index];
|
||||
|
||||
const int32_t vad_ms = 2000;
|
||||
const int32_t prompt_ms = 5000;
|
||||
const int32_t command_ms = 4000;
|
||||
|
||||
const float vad_thold = 0.1f;
|
||||
const float freq_thold = -1.0f;
|
||||
|
||||
while (g_running) {
|
||||
// delay
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
|
||||
if (ask_prompt) {
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
|
||||
fprintf(stdout, "\n");
|
||||
|
||||
{
|
||||
char txt[1024];
|
||||
snprintf(txt, sizeof(txt), "Say the following phrase: '%s'", k_prompt.c_str());
|
||||
command_set_status(txt);
|
||||
}
|
||||
|
||||
ask_prompt = false;
|
||||
}
|
||||
|
||||
int64_t t_ms = 0;
|
||||
|
||||
{
|
||||
command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
|
||||
|
||||
if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
|
||||
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
|
||||
command_set_status("Speech detected! Processing ...");
|
||||
|
||||
if (!have_prompt) {
|
||||
command_get_audio(prompt_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
|
||||
|
||||
const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob0, t_ms));
|
||||
|
||||
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
|
||||
|
||||
const float sim = similarity(txt, k_prompt);
|
||||
|
||||
if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
|
||||
fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
|
||||
ask_prompt = true;
|
||||
} else {
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
|
||||
fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
|
||||
fprintf(stdout, "\n");
|
||||
|
||||
{
|
||||
char txt[1024];
|
||||
snprintf(txt, sizeof(txt), "Success! Waiting for voice commands ...");
|
||||
command_set_status(txt);
|
||||
}
|
||||
|
||||
// save the audio for the prompt
|
||||
pcmf32_prompt = pcmf32_cur;
|
||||
have_prompt = true;
|
||||
}
|
||||
} else {
|
||||
command_get_audio(command_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
|
||||
|
||||
// prepend the prompt audio
|
||||
pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
|
||||
|
||||
const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob, t_ms));
|
||||
|
||||
prob = 100.0f*(prob - prob0);
|
||||
|
||||
fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
|
||||
|
||||
// find the prompt in the text
|
||||
float best_sim = 0.0f;
|
||||
size_t best_len = 0;
|
||||
for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
|
||||
const auto prompt = txt.substr(0, n);
|
||||
|
||||
const float sim = similarity(prompt, k_prompt);
|
||||
|
||||
//fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
|
||||
|
||||
if (sim > best_sim) {
|
||||
best_sim = sim;
|
||||
best_len = n;
|
||||
}
|
||||
}
|
||||
|
||||
const std::string command = ::trim(txt.substr(best_len));
|
||||
|
||||
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
|
||||
fprintf(stdout, "\n");
|
||||
|
||||
{
|
||||
char txt[1024];
|
||||
snprintf(txt, sizeof(txt), "Command '%s', (t = %d ms)", command.c_str(), (int) t_ms);
|
||||
command_set_status(txt);
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
g_transcribed = command;
|
||||
}
|
||||
}
|
||||
|
||||
g_pcmf32.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (index < g_contexts.size()) {
|
||||
whisper_free(g_contexts[index]);
|
||||
g_contexts[index] = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
EMSCRIPTEN_BINDINGS(command) {
|
||||
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
||||
for (size_t i = 0; i < g_contexts.size(); ++i) {
|
||||
if (g_contexts[i] == nullptr) {
|
||||
g_contexts[i] = whisper_init(path_model.c_str());
|
||||
if (g_contexts[i] != nullptr) {
|
||||
g_running = true;
|
||||
if (g_worker.joinable()) {
|
||||
g_worker.join();
|
||||
}
|
||||
g_worker = std::thread([i]() {
|
||||
command_main(i);
|
||||
});
|
||||
|
||||
return i + 1;
|
||||
} else {
|
||||
return (size_t) 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (size_t) 0;
|
||||
}));
|
||||
|
||||
emscripten::function("free", emscripten::optional_override([](size_t index) {
|
||||
if (g_running) {
|
||||
g_running = false;
|
||||
}
|
||||
}));
|
||||
|
||||
emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
|
||||
--index;
|
||||
|
||||
if (index >= g_contexts.size()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (g_contexts[index] == nullptr) {
|
||||
return -2;
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
const int n = audio["length"].as<int>();
|
||||
|
||||
emscripten::val heap = emscripten::val::module_property("HEAPU8");
|
||||
emscripten::val memory = heap["buffer"];
|
||||
|
||||
g_pcmf32.resize(n);
|
||||
|
||||
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
|
||||
memoryView.call<void>("set", audio);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}));
|
||||
|
||||
emscripten::function("get_transcribed", emscripten::optional_override([]() {
|
||||
std::string transcribed;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
transcribed = std::move(g_transcribed);
|
||||
}
|
||||
|
||||
return transcribed;
|
||||
}));
|
||||
|
||||
emscripten::function("get_status", emscripten::optional_override([]() {
|
||||
std::string status;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
status = g_status_forced.empty() ? g_status : g_status_forced;
|
||||
}
|
||||
|
||||
return status;
|
||||
}));
|
||||
|
||||
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
g_status_forced = status;
|
||||
}
|
||||
}));
|
||||
}
|
@ -1,386 +0,0 @@
|
||||
<!doctype html>
|
||||
<html lang="en-us">
|
||||
<head>
|
||||
<title>command : Voice assistant example using Whisper + WebAssembly</title>
|
||||
|
||||
<style>
|
||||
#output {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
margin: 0 auto;
|
||||
margin-top: 10px;
|
||||
border-left: 0px;
|
||||
border-right: 0px;
|
||||
padding-left: 0px;
|
||||
padding-right: 0px;
|
||||
display: block;
|
||||
background-color: black;
|
||||
color: white;
|
||||
font-size: 10px;
|
||||
font-family: 'Lucida Console', Monaco, monospace;
|
||||
outline: none;
|
||||
white-space: pre;
|
||||
overflow-wrap: normal;
|
||||
overflow-x: scroll;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="main-container">
|
||||
<b>command : Voice assistant example using Whisper + WebAssembly</b>
|
||||
|
||||
<br><br>
|
||||
|
||||
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm">GitHub</a>.
|
||||
|
||||
<br><br>
|
||||
|
||||
<hr>
|
||||
|
||||
Select the model you would like to use, click the "Start" button and follow the instructions.
|
||||
|
||||
<br><br>
|
||||
|
||||
<div id="model-whisper">
|
||||
Whisper model: <span id="model-whisper-status"></span>
|
||||
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
||||
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
||||
<span id="fetch-whisper-progress"></span>
|
||||
|
||||
<!--
|
||||
<input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
||||
-->
|
||||
</div>
|
||||
|
||||
<br>
|
||||
|
||||
<div id="input">
|
||||
<button id="start" onclick="onStart()" disabled>Start</button>
|
||||
<button id="stop" onclick="onStop()" disabled>Stop</button>
|
||||
<button id="clear" onclick="clearCache()">Clear Cache</button>
|
||||
</div>
|
||||
|
||||
<br>
|
||||
|
||||
<div id="state">
|
||||
Status: <b><span id="state-status">not started</span></b>
|
||||
|
||||
<pre id="state-transcribed">[The recognized voice commands will be displayed here]</pre>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
|
||||
Debug output:
|
||||
<textarea id="output" rows="20"></textarea>
|
||||
|
||||
<br>
|
||||
|
||||
<b>Troubleshooting</b>
|
||||
|
||||
<br><br>
|
||||
|
||||
The page does some heavy computations, so make sure:
|
||||
|
||||
<ul>
|
||||
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
|
||||
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
|
||||
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
|
||||
</ul>
|
||||
|
||||
<div class="cell-version">
|
||||
<span>
|
||||
|
|
||||
Build time: <span class="nav-link">@GIT_DATE@</span> |
|
||||
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
|
||||
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
|
||||
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm">Source Code</a> |
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script type="text/javascript" src="helpers.js"></script>
|
||||
<script type='text/javascript'>
|
||||
// web audio context
|
||||
var context = null;
|
||||
|
||||
// audio data
|
||||
var audio = null;
|
||||
var audio0 = null;
|
||||
|
||||
// the command instance
|
||||
var instance = null;
|
||||
|
||||
// model name
|
||||
var model_whisper = null;
|
||||
|
||||
var Module = {
|
||||
print: printTextarea,
|
||||
printErr: printTextarea,
|
||||
setStatus: function(text) {
|
||||
printTextarea('js: ' + text);
|
||||
},
|
||||
monitorRunDependencies: function(left) {
|
||||
},
|
||||
preRun: function() {
|
||||
printTextarea('js: Preparing ...');
|
||||
},
|
||||
postRun: function() {
|
||||
printTextarea('js: Initialized successfully!');
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// fetch models
|
||||
//
|
||||
|
||||
let dbVersion = 1
|
||||
let dbName = 'whisper.ggerganov.com';
|
||||
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
|
||||
|
||||
function storeFS(fname, buf) {
|
||||
// write to WASM file using FS_createDataFile
|
||||
// if the file exists, delete it
|
||||
try {
|
||||
Module.FS_unlink(fname);
|
||||
} catch (e) {
|
||||
// ignore
|
||||
}
|
||||
|
||||
Module.FS_createDataFile("/", fname, buf, true, true);
|
||||
|
||||
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
|
||||
|
||||
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
|
||||
|
||||
if (model_whisper != null) {
|
||||
document.getElementById('start').disabled = false;
|
||||
document.getElementById('stop' ).disabled = true;
|
||||
}
|
||||
}
|
||||
|
||||
function loadWhisper(model) {
|
||||
let urls = {
|
||||
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
|
||||
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
|
||||
};
|
||||
|
||||
let sizes = {
|
||||
'tiny.en': 75,
|
||||
'base.en': 142,
|
||||
};
|
||||
|
||||
let url = urls[model];
|
||||
let dst = 'whisper.bin';
|
||||
let size_mb = sizes[model];
|
||||
|
||||
model_whisper = model;
|
||||
|
||||
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
||||
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
|
||||
|
||||
cbProgress = function(p) {
|
||||
let el = document.getElementById('fetch-whisper-progress');
|
||||
el.innerHTML = Math.round(100*p) + '%';
|
||||
};
|
||||
|
||||
cbCancel = function() {
|
||||
var el;
|
||||
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
|
||||
};
|
||||
|
||||
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
|
||||
}
|
||||
|
||||
//
|
||||
// microphone
|
||||
//
|
||||
|
||||
const kSampleRate = 16000;
|
||||
const kRestartRecording_s = 120;
|
||||
const kIntervalAudio_ms = 250; // pass the recorded audio to the C++ instance at this rate
|
||||
|
||||
var mediaRecorder = null;
|
||||
var doRecording = false;
|
||||
var startTime = 0;
|
||||
|
||||
window.AudioContext = window.AudioContext || window.webkitAudioContext;
|
||||
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
|
||||
|
||||
function stopRecording() {
|
||||
Module.set_status("paused");
|
||||
doRecording = false;
|
||||
audio0 = null;
|
||||
audio = null;
|
||||
context = null;
|
||||
}
|
||||
|
||||
function startRecording() {
|
||||
if (!context) {
|
||||
context = new AudioContext({
|
||||
sampleRate: kSampleRate,
|
||||
channelCount: 1,
|
||||
echoCancellation: false,
|
||||
autoGainControl: true,
|
||||
noiseSuppression: true,
|
||||
});
|
||||
}
|
||||
|
||||
Module.set_status("");
|
||||
|
||||
document.getElementById('start').disabled = true;
|
||||
document.getElementById('stop').disabled = false;
|
||||
|
||||
doRecording = true;
|
||||
startTime = Date.now();
|
||||
|
||||
var chunks = [];
|
||||
var stream = null;
|
||||
|
||||
navigator.mediaDevices.getUserMedia({audio: true, video: false})
|
||||
.then(function(s) {
|
||||
stream = s;
|
||||
mediaRecorder = new MediaRecorder(stream);
|
||||
mediaRecorder.ondataavailable = function(e) {
|
||||
chunks.push(e.data);
|
||||
|
||||
var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
|
||||
var reader = new FileReader();
|
||||
|
||||
reader.onload = function(event) {
|
||||
var buf = new Uint8Array(reader.result);
|
||||
|
||||
if (!context) {
|
||||
return;
|
||||
}
|
||||
context.decodeAudioData(buf.buffer, function(audioBuffer) {
|
||||
var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
|
||||
var source = offlineContext.createBufferSource();
|
||||
source.buffer = audioBuffer;
|
||||
source.connect(offlineContext.destination);
|
||||
source.start(0);
|
||||
|
||||
offlineContext.startRendering().then(function(renderedBuffer) {
|
||||
audio = renderedBuffer.getChannelData(0);
|
||||
|
||||
//printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
|
||||
|
||||
var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
|
||||
if (audio0 != null) {
|
||||
audioAll.set(audio0, 0);
|
||||
}
|
||||
audioAll.set(audio, audio0 == null ? 0 : audio0.length);
|
||||
|
||||
if (instance) {
|
||||
Module.set_audio(instance, audioAll);
|
||||
}
|
||||
});
|
||||
}, function(e) {
|
||||
audio = null;
|
||||
});
|
||||
}
|
||||
|
||||
reader.readAsArrayBuffer(blob);
|
||||
};
|
||||
|
||||
mediaRecorder.onstop = function(e) {
|
||||
if (doRecording) {
|
||||
setTimeout(function() {
|
||||
startRecording();
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
mediaRecorder.start(kIntervalAudio_ms);
|
||||
})
|
||||
.catch(function(err) {
|
||||
printTextarea('js: error getting audio stream: ' + err);
|
||||
});
|
||||
|
||||
var interval = setInterval(function() {
|
||||
if (!doRecording) {
|
||||
clearInterval(interval);
|
||||
mediaRecorder.stop();
|
||||
stream.getTracks().forEach(function(track) {
|
||||
track.stop();
|
||||
});
|
||||
|
||||
document.getElementById('start').disabled = false;
|
||||
document.getElementById('stop').disabled = true;
|
||||
|
||||
mediaRecorder = null;
|
||||
}
|
||||
|
||||
// if audio length is more than kRestartRecording_s seconds, restart recording
|
||||
if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
|
||||
if (doRecording) {
|
||||
//printTextarea('js: restarting recording');
|
||||
|
||||
clearInterval(interval);
|
||||
audio0 = audio;
|
||||
audio = null;
|
||||
mediaRecorder.stop();
|
||||
stream.getTracks().forEach(function(track) {
|
||||
track.stop();
|
||||
});
|
||||
}
|
||||
}
|
||||
}, 100);
|
||||
}
|
||||
|
||||
//
|
||||
// main
|
||||
//
|
||||
|
||||
var nLines = 0;
|
||||
var intervalUpdate = null;
|
||||
var transcribedAll = '';
|
||||
|
||||
function onStart() {
|
||||
if (!instance) {
|
||||
instance = Module.init('whisper.bin');
|
||||
|
||||
if (instance) {
|
||||
printTextarea("js: whisper initialized, instance: " + instance);
|
||||
}
|
||||
}
|
||||
|
||||
if (!instance) {
|
||||
printTextarea("js: failed to initialize whisper");
|
||||
return;
|
||||
}
|
||||
|
||||
startRecording();
|
||||
|
||||
intervalUpdate = setInterval(function() {
|
||||
var transcribed = Module.get_transcribed();
|
||||
|
||||
if (transcribed != null && transcribed.length > 1) {
|
||||
transcribedAll += transcribed + '<br>';
|
||||
nLines++;
|
||||
|
||||
// if more than 10 lines, remove the first line
|
||||
if (nLines > 10) {
|
||||
var i = transcribedAll.indexOf('<br>');
|
||||
if (i > 0) {
|
||||
transcribedAll = transcribedAll.substring(i + 4);
|
||||
nLines--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
document.getElementById('state-status').innerHTML = Module.get_status();
|
||||
document.getElementById('state-transcribed').innerHTML = transcribedAll;
|
||||
}, 100);
|
||||
}
|
||||
|
||||
function onStop() {
|
||||
stopRecording();
|
||||
}
|
||||
|
||||
</script>
|
||||
<script type="text/javascript" src="command.js"></script>
|
||||
</body>
|
||||
</html>
|
@ -1,7 +0,0 @@
if (WHISPER_SUPPORT_SDL2)
    # command
    set(TARGET command)
    add_executable(${TARGET} command.cpp)
    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
endif ()
@ -1,36 +0,0 @@
# command

This is a basic Voice Assistant example that accepts voice commands from the microphone.
More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/issues/171).

```bash
# Run with default arguments and small model
./command -m ./models/ggml-small.en.bin -t 8

# On Raspberry Pi, use tiny or base models + "-ac 768" for better performance
./command -m ./models/ggml-tiny.en.bin -ac 768 -t 3 -c 0

# Run in guided mode, the list of allowed commands is in commands.txt
./command -m ./models/ggml-base.en.bin -cmd ./examples/command/commands.txt

# On Raspberry Pi, in guided mode you can use "-ac 128" for extra performance
./command -m ./models/ggml-tiny.en.bin -cmd ./examples/command/commands.txt -ac 128 -t 3 -c 0
```

https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4

Web version: [examples/command.wasm](/examples/command.wasm)

## Building

The `command` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:

```bash
# Install SDL2 on Linux
sudo apt-get install libsdl2-dev

# Install SDL2 on Mac OS
brew install sdl2

make command
```
@ -1,866 +0,0 @@
|
||||
// Voice assistant example
|
||||
//
|
||||
// Speak short text commands to the microphone.
|
||||
// This program will detect your voice commands and convert them to text.
|
||||
//
|
||||
// ref: https://github.com/ggerganov/whisper.cpp/issues/171
|
||||
//
|
||||
|
||||
#include "whisper.h"
|
||||
|
||||
#include <SDL.h>
|
||||
#include <SDL_audio.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
// command-line parameters
|
||||
struct whisper_params {
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t prompt_ms = 5000;
|
||||
int32_t command_ms = 4000;
|
||||
int32_t capture_id = -1;
|
||||
int32_t max_tokens = 32;
|
||||
int32_t audio_ctx = 0;
|
||||
|
||||
float vad_thold = 0.6f;
|
||||
float freq_thold = 100.0f;
|
||||
|
||||
bool speed_up = false;
|
||||
bool translate = false;
|
||||
bool print_special = false;
|
||||
bool print_energy = false;
|
||||
bool no_timestamps = true;
|
||||
|
||||
std::string language = "en";
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
std::string fname_out = "";
|
||||
std::string commands = "";
|
||||
};
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
||||
|
||||
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||
else if (arg == "-pms" || arg == "--prompt-ms") { params.prompt_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-cms" || arg == "--command-ms") { params.command_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
|
||||
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
||||
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
||||
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
||||
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
|
||||
else if (arg == "-cmd" || arg == "--commands") { params.commands = argv[++i]; }
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -pms N, --prompt-ms N [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
|
||||
fprintf(stderr, " -cms N, --command-ms N [%-7d] command duration in milliseconds\n", params.command_ms);
|
||||
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
|
||||
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
|
||||
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
||||
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
||||
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
||||
fprintf(stderr, " -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n", params.commands.c_str());
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
//
|
||||
// SDL Audio capture
|
||||
//
|
||||
|
||||
class audio_async {
|
||||
public:
|
||||
audio_async(int len_ms);
|
||||
~audio_async();
|
||||
|
||||
bool init(int capture_id, int sample_rate);
|
||||
|
||||
// start capturing audio via the provided SDL callback
|
||||
// keep last len_ms seconds of audio in a circular buffer
|
||||
bool resume();
|
||||
bool pause();
|
||||
bool clear();
|
||||
|
||||
// callback to be called by SDL
|
||||
void callback(uint8_t * stream, int len);
|
||||
|
||||
// get audio data from the circular buffer
|
||||
void get(int ms, std::vector<float> & audio);
|
||||
|
||||
private:
|
||||
SDL_AudioDeviceID m_dev_id_in = 0;
|
||||
|
||||
int m_len_ms = 0;
|
||||
int m_sample_rate = 0;
|
||||
|
||||
bool m_running = false;
|
||||
std::mutex m_mutex;
|
||||
|
||||
std::vector<float> m_audio;
|
||||
std::vector<float> m_audio_new;
|
||||
size_t m_audio_pos = 0;
|
||||
size_t m_audio_len = 0;
|
||||
};
|
||||
|
||||
audio_async::audio_async(int len_ms) {
|
||||
m_len_ms = len_ms;
|
||||
}
|
||||
|
||||
audio_async::~audio_async() {
|
||||
if (m_dev_id_in) {
|
||||
SDL_CloseAudioDevice(m_dev_id_in);
|
||||
}
|
||||
}
|
||||
|
||||
bool audio_async::init(int capture_id, int sample_rate) {
|
||||
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
|
||||
|
||||
if (SDL_Init(SDL_INIT_AUDIO) < 0) {
|
||||
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
|
||||
|
||||
{
|
||||
int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
|
||||
fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
|
||||
for (int i = 0; i < nDevices; i++) {
|
||||
fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
|
||||
}
|
||||
}
|
||||
|
||||
SDL_AudioSpec capture_spec_requested;
|
||||
SDL_AudioSpec capture_spec_obtained;
|
||||
|
||||
SDL_zero(capture_spec_requested);
|
||||
SDL_zero(capture_spec_obtained);
|
||||
|
||||
capture_spec_requested.freq = sample_rate;
|
||||
capture_spec_requested.format = AUDIO_F32;
|
||||
capture_spec_requested.channels = 1;
|
||||
capture_spec_requested.samples = 1024;
|
||||
capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
|
||||
audio_async * audio = (audio_async *) userdata;
|
||||
audio->callback(stream, len);
|
||||
};
|
||||
capture_spec_requested.userdata = this;
|
||||
|
||||
if (capture_id >= 0) {
|
||||
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
|
||||
m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
} else {
|
||||
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
|
||||
m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
}
|
||||
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
|
||||
m_dev_id_in = 0;
|
||||
|
||||
return false;
|
||||
} else {
|
||||
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
|
||||
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
|
||||
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format,
|
||||
capture_spec_requested.format);
|
||||
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels,
|
||||
capture_spec_requested.channels);
|
||||
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
|
||||
}
|
||||
|
||||
m_sample_rate = capture_spec_obtained.freq;
|
||||
|
||||
m_audio.resize((m_sample_rate*m_len_ms)/1000);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool audio_async::resume() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to resume!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (m_running) {
|
||||
fprintf(stderr, "%s: already running!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_PauseAudioDevice(m_dev_id_in, 0);
|
||||
|
||||
m_running = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool audio_async::pause() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to pause!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: already paused!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_PauseAudioDevice(m_dev_id_in, 1);
|
||||
|
||||
m_running = false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool audio_async::clear() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to clear!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: not running!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
m_audio_pos = 0;
|
||||
m_audio_len = 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// callback to be called by SDL
|
||||
void audio_async::callback(uint8_t * stream, int len) {
|
||||
if (!m_running) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t n_samples = len / sizeof(float);
|
||||
|
||||
m_audio_new.resize(n_samples);
|
||||
memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
|
||||
|
||||
//fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
if (m_audio_pos + n_samples > m_audio.size()) {
|
||||
const size_t n0 = m_audio.size() - m_audio_pos;
|
||||
|
||||
memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
|
||||
memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
|
||||
|
||||
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
|
||||
m_audio_len = m_audio.size();
|
||||
} else {
|
||||
memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
|
||||
|
||||
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
|
||||
m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void audio_async::get(int ms, std::vector<float> & result) {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: not running!\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
result.clear();
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
if (ms <= 0) {
|
||||
ms = m_len_ms;
|
||||
}
|
||||
|
||||
size_t n_samples = (m_sample_rate * ms) / 1000;
|
||||
if (n_samples > m_audio_len) {
|
||||
n_samples = m_audio_len;
|
||||
}
|
||||
|
||||
result.resize(n_samples);
|
||||
|
||||
int s0 = m_audio_pos - n_samples;
|
||||
if (s0 < 0) {
|
||||
s0 += m_audio.size();
|
||||
}
|
||||
|
||||
if (s0 + n_samples > m_audio.size()) {
|
||||
const size_t n0 = m_audio.size() - s0;
|
||||
|
||||
memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
|
||||
memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
|
||||
} else {
|
||||
memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////
|
||||
|
||||
std::string trim(const std::string & s) {
|
||||
std::regex e("^\\s+|\\s+$");
|
||||
return std::regex_replace(s, e, "");
|
||||
}
|
||||
|
||||
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
|
||||
const float rc = 1.0f / (2.0f * M_PI * cutoff);
|
||||
const float dt = 1.0f / sample_rate;
|
||||
const float alpha = dt / (rc + dt);
|
||||
|
||||
float y = data[0];
|
||||
|
||||
for (size_t i = 1; i < data.size(); i++) {
|
||||
y = alpha * (y + data[i] - data[i - 1]);
|
||||
data[i] = y;
|
||||
}
|
||||
}
|
||||
|
||||
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
|
||||
const int n_samples = pcmf32.size();
|
||||
const int n_samples_last = (sample_rate * last_ms) / 1000;
|
||||
|
||||
if (n_samples_last >= n_samples) {
|
||||
// not enough samples - assume no speech
|
||||
return false;
|
||||
}
|
||||
|
||||
if (freq_thold > 0.0f) {
|
||||
high_pass_filter(pcmf32, freq_thold, sample_rate);
|
||||
}
|
||||
|
||||
float energy_all = 0.0f;
|
||||
float energy_last = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < n_samples; i++) {
|
||||
energy_all += fabsf(pcmf32[i]);
|
||||
|
||||
if (i >= n_samples - n_samples_last) {
|
||||
energy_last += fabsf(pcmf32[i]);
|
||||
}
|
||||
}
|
||||
|
||||
energy_all /= n_samples;
|
||||
energy_last /= n_samples_last;
|
||||
|
||||
if (verbose) {
|
||||
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
|
||||
}
|
||||
|
||||
if (energy_last > vad_thold*energy_all) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
prob = 0.0f;
|
||||
t_ms = 0;
|
||||
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.print_progress = false;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.translate = params.translate;
|
||||
wparams.no_context = true;
|
||||
wparams.single_segment = true;
|
||||
wparams.max_tokens = params.max_tokens;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
wparams.speed_up = params.speed_up;
|
||||
|
||||
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
int prob_n = 0;
|
||||
std::string result;
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
result += text;
|
||||
|
||||
const int n_tokens = whisper_full_n_tokens(ctx, i);
|
||||
for (int j = 0; j < n_tokens; ++j) {
|
||||
const auto token = whisper_full_get_token_data(ctx, i, j);
|
||||
|
||||
prob += token.p;
|
||||
++prob_n;
|
||||
}
|
||||
}
|
||||
|
||||
if (prob_n > 0) {
|
||||
prob /= prob_n;
|
||||
}
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// compute similarity between two strings using Levenshtein distance
|
||||
float similarity(const std::string & s0, const std::string & s1) {
|
||||
const size_t len0 = s0.size() + 1;
|
||||
const size_t len1 = s1.size() + 1;
|
||||
|
||||
std::vector<int> col(len1, 0);
|
||||
std::vector<int> prevCol(len1, 0);
|
||||
|
||||
for (size_t i = 0; i < len1; i++) {
|
||||
prevCol[i] = i;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < len0; i++) {
|
||||
col[0] = i;
|
||||
for (size_t j = 1; j < len1; j++) {
|
||||
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
|
||||
}
|
||||
col.swap(prevCol);
|
||||
}
|
||||
|
||||
const float dist = prevCol[len1 - 1];
|
||||
|
||||
return 1.0f - (dist / std::max(s0.size(), s1.size()));
|
||||
}
|
||||
|
||||
std::vector<std::string> read_allowed_commands(const std::string & fname) {
|
||||
std::vector<std::string> allowed_commands;
|
||||
|
||||
std::ifstream ifs(fname);
|
||||
if (!ifs.is_open()) {
|
||||
return allowed_commands;
|
||||
}
|
||||
|
||||
std::string line;
|
||||
while (std::getline(ifs, line)) {
|
||||
line = trim(line);
|
||||
if (line.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::transform(line.begin(), line.end(),line.begin(), ::tolower);
|
||||
allowed_commands.push_back(std::move(line));
|
||||
}
|
||||
|
||||
return allowed_commands;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
whisper_params params;
|
||||
|
||||
if (whisper_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// whisper init
|
||||
|
||||
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
||||
|
||||
// print some info about the processing
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
if (!whisper_is_multilingual(ctx)) {
|
||||
if (params.language != "en" || params.translate) {
|
||||
params.language = "en";
|
||||
params.translate = false;
|
||||
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
|
||||
__func__,
|
||||
params.n_threads,
|
||||
params.language.c_str(),
|
||||
params.translate ? "translate" : "transcribe",
|
||||
params.no_timestamps ? 0 : 1);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// init audio
|
||||
|
||||
audio_async audio(30*1000);
|
||||
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
|
||||
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
audio.resume();
|
||||
|
||||
// wait for 1 second to avoid any buffered noise
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
|
||||
audio.clear();
|
||||
|
||||
int max_len = 0;
|
||||
|
||||
bool is_running = true;
|
||||
bool have_prompt = false;
|
||||
bool ask_prompt = true;
|
||||
|
||||
float prob0 = 0.0f;
|
||||
float prob = 0.0f;
|
||||
|
||||
std::vector<float> pcmf32_cur;
|
||||
std::vector<float> pcmf32_prompt;
|
||||
|
||||
std::vector<std::string> allowed_commands;
|
||||
std::vector<std::vector<whisper_token>> allowed_tokens;
|
||||
|
||||
std::string k_prompt = "";
|
||||
std::vector<whisper_token> k_tokens;
|
||||
|
||||
if (params.commands != "") {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: guided mode\n", __func__);
|
||||
|
||||
allowed_commands = read_allowed_commands(params.commands);
|
||||
|
||||
if (allowed_commands.empty()) {
|
||||
fprintf(stderr, "%s: error: failed to read allowed commands from '%s'\n", __func__, params.commands.c_str());
|
||||
return 2;
|
||||
}
|
||||
|
||||
for (const auto & cmd : allowed_commands) {
|
||||
whisper_token tokens[1024];
|
||||
allowed_tokens.emplace_back();
|
||||
|
||||
for (int l = 0; l < cmd.size(); ++l) {
|
||||
// NOTE: very important to add the whitespace !
|
||||
// the reason is that the first decoded token starts with a whitespace too!
|
||||
std::string ss = std::string(" ") + cmd.substr(0, l + 1);
|
||||
|
||||
const int n = whisper_tokenize(ctx, ss.c_str(), tokens, 1024);
|
||||
if (n < 0) {
|
||||
fprintf(stderr, "%s: error: failed to tokenize command '%s'\n", __func__, cmd.c_str());
|
||||
return 3;
|
||||
}
|
||||
|
||||
if (n == 1) {
|
||||
allowed_tokens.back().push_back(tokens[0]);
|
||||
}
|
||||
}
|
||||
|
||||
max_len = std::max(max_len, (int) cmd.size());
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: allowed commands [ tokens ]:\n", __func__);
|
||||
fprintf(stderr, "\n");
|
||||
for (int i = 0; i < (int) allowed_commands.size(); ++i) {
|
||||
fprintf(stderr, " - \033[1m%-*s\033[0m = [", max_len, allowed_commands[i].c_str());
|
||||
for (const auto & token : allowed_tokens[i]) {
|
||||
fprintf(stderr, " %5d", token);
|
||||
}
|
||||
fprintf(stderr, " ]\n");
|
||||
}
|
||||
|
||||
k_prompt = "select one from the available words: ";
|
||||
for (int i = 0; i < (int) allowed_commands.size(); ++i) {
|
||||
if (i > 0) {
|
||||
k_prompt += ", ";
|
||||
}
|
||||
k_prompt += allowed_commands[i];
|
||||
}
|
||||
k_prompt += ". selected word: ";
|
||||
|
||||
// tokenize prompt
|
||||
{
|
||||
k_tokens.resize(1024);
|
||||
const int n = whisper_tokenize(ctx, k_prompt.c_str(), k_tokens.data(), 1024);
|
||||
if (n < 0) {
|
||||
fprintf(stderr, "%s: error: failed to tokenize prompt '%s'\n", __func__, k_prompt.c_str());
|
||||
return 4;
|
||||
}
|
||||
k_tokens.resize(n);
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, k_prompt.c_str());
|
||||
fprintf(stderr, "%s: tokens: [", __func__);
|
||||
for (const auto & token : k_tokens) {
|
||||
fprintf(stderr, " %d", token);
|
||||
}
|
||||
fprintf(stderr, " ]\n");
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: listening for a command ...\n", __func__);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
} else {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: general-purpose mode\n", __func__);
|
||||
|
||||
k_prompt = "Ok Whisper, start listening for commands.";
|
||||
}
|
||||
|
||||
// main loop
|
||||
while (is_running) {
|
||||
// handle Ctrl + C
|
||||
{
|
||||
SDL_Event event;
|
||||
while (SDL_PollEvent(&event)) {
|
||||
switch (event.type) {
|
||||
case SDL_QUIT:
|
||||
{
|
||||
is_running = false;
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_running) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// delay
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
|
||||
if (allowed_commands.empty()) {
|
||||
// general-purpose mode
|
||||
// freely transcribe the voice into text
|
||||
|
||||
if (ask_prompt) {
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
|
||||
fprintf(stdout, "\n");
|
||||
|
||||
ask_prompt = false;
|
||||
}
|
||||
|
||||
{
|
||||
int64_t t_ms = 0;
|
||||
|
||||
audio.get(2000, pcmf32_cur);
|
||||
|
||||
if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
|
||||
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
|
||||
|
||||
if (!have_prompt) {
|
||||
// wait for activation phrase
|
||||
audio.get(params.prompt_ms, pcmf32_cur);
|
||||
|
||||
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob0, t_ms));
|
||||
|
||||
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
|
||||
|
||||
const float sim = similarity(txt, k_prompt);
|
||||
|
||||
if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
|
||||
fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
|
||||
ask_prompt = true;
|
||||
} else {
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
|
||||
fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
|
||||
fprintf(stdout, "\n");
|
||||
|
||||
// save the audio for the prompt
|
||||
pcmf32_prompt = pcmf32_cur;
|
||||
have_prompt = true;
|
||||
}
|
||||
} else {
|
||||
// we have heard the activation phrase, now detect the commands
|
||||
audio.get(params.command_ms, pcmf32_cur);
|
||||
|
||||
// prepend the prompt audio
|
||||
pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
|
||||
|
||||
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
|
||||
|
||||
prob = 100.0f*(prob - prob0);
|
||||
|
||||
//fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
|
||||
|
||||
// find the prompt in the text
|
||||
float best_sim = 0.0f;
|
||||
size_t best_len = 0;
|
||||
for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
|
||||
const auto prompt = txt.substr(0, n);
|
||||
|
||||
const float sim = similarity(prompt, k_prompt);
|
||||
|
||||
//fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
|
||||
|
||||
if (sim > best_sim) {
|
||||
best_sim = sim;
|
||||
best_len = n;
|
||||
}
|
||||
}
|
||||
|
||||
const std::string command = ::trim(txt.substr(best_len));
|
||||
|
||||
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
audio.clear();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// command-list mode
|
||||
// guide the transcription to match the most likely command from a provided list
|
||||
|
||||
audio.get(2000, pcmf32_cur);
|
||||
|
||||
if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
|
||||
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
|
||||
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.print_progress = false;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.translate = params.translate;
|
||||
wparams.no_context = true;
|
||||
wparams.single_segment = true;
|
||||
wparams.max_tokens = 1;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
wparams.speed_up = params.speed_up;
|
||||
|
||||
wparams.prompt_tokens = k_tokens.data();
|
||||
wparams.prompt_n_tokens = k_tokens.size();
|
||||
|
||||
// run the transformer and a single decoding pass
|
||||
if (whisper_full(ctx, wparams, pcmf32_cur.data(), pcmf32_cur.size()) != 0) {
|
||||
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
|
||||
break;
|
||||
}
|
||||
|
||||
const auto * probs = whisper_get_probs(ctx);
|
||||
std::vector<std::pair<float, int>> probs_id;
|
||||
|
||||
double psum = 0.0;
|
||||
for (int i = 0; i < (int) allowed_commands.size(); ++i) {
|
||||
probs_id.push_back(std::make_pair(probs[allowed_tokens[i][0]], i));
|
||||
for (int j = 1; j < (int) allowed_tokens[i].size(); ++j) {
|
||||
probs_id.back().first += probs[allowed_tokens[i][j]];
|
||||
}
|
||||
probs_id.back().first /= allowed_tokens[i].size();
|
||||
psum += probs_id.back().first;
|
||||
}
|
||||
|
||||
// normalize
|
||||
for (auto & p : probs_id) {
|
||||
p.first /= psum;
|
||||
}
|
||||
|
||||
// sort descending
|
||||
{
|
||||
using pair_type = decltype(probs_id)::value_type;
|
||||
std::sort(probs_id.begin(), probs_id.end(), [](const pair_type & a, const pair_type & b) {
|
||||
return a.first > b.first;
|
||||
});
|
||||
}
|
||||
|
||||
// print the commands and the respective probabilities
|
||||
{
|
||||
fprintf(stdout, "\n");
|
||||
for (const auto & cmd : probs_id) {
|
||||
fprintf(stdout, "%s: %s%-*s%s = %f | ", __func__, "\033[1m", max_len, allowed_commands[cmd.second].c_str(), "\033[0m", cmd.first);
|
||||
for (int i = 0; i < (int) allowed_tokens[cmd.second].size(); ++i) {
|
||||
fprintf(stdout, "'%4s' %f ", whisper_token_to_str(ctx, allowed_tokens[cmd.second][i]), probs[allowed_tokens[cmd.second][i]]);
|
||||
}
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
// best command
|
||||
{
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
|
||||
"\033[1m", allowed_commands[probs_id[0].second].c_str(), "\033[0m", probs_id[0].first,
|
||||
(int) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t_start).count());
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
audio.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
audio.pause();
|
||||
|
||||
whisper_print_timings(ctx);
|
||||
whisper_free(ctx);
|
||||
|
||||
return 0;
|
||||
}
|
@ -1,9 +0,0 @@
enable
disable
cat
dog
apple
red
blue
green
lightblue
@ -1,60 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Simple tool to record audio from the microphone and generate a karaoke video
|
||||
# Usage:
|
||||
#
|
||||
# cd whisper.cpp
|
||||
# make
|
||||
#
|
||||
# ./examples/generate-karaoke.sh [model] [step_ms]
|
||||
#
|
||||
# Press Ctrl+C to stop recording
|
||||
#
|
||||
|
||||
executable="./main"
|
||||
model="base.en"
|
||||
model_path="models/ggml-$model.bin"
|
||||
|
||||
# require sox and ffmpeg to be installed
|
||||
if ! command -v sox &> /dev/null
|
||||
then
|
||||
echo "sox could not be found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! command -v ffmpeg &> /dev/null
|
||||
then
|
||||
echo "ffmpeg could not be found"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
if [ ! -f "$executable" ]; then
|
||||
echo "'$executable' does not exist. Please build it first."
|
||||
exit 3
|
||||
fi
|
||||
|
||||
if [ ! -f "$model_path" ]; then
|
||||
echo "'$model_path' does not exist. Please download it first."
|
||||
exit 4
|
||||
fi
|
||||
|
||||
# record some raw audio
|
||||
sox -d rec.wav
|
||||
|
||||
# resample to 16kHz
|
||||
ffmpeg -y -i ./rec.wav -ar 16000 -ac 1 -c:a pcm_s16le ./rec16.wav > /dev/null 2>&1
|
||||
|
||||
# run Whisper
|
||||
echo "Processing ..."
|
||||
./main -m models/ggml-base.en.bin rec16.wav -owts > /dev/null 2>&1
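# note: -owts writes rec16.wav.wts, a helper script that uses ffmpeg to render the karaoke video when sourced below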
|
||||
|
||||
# generate Karaoke video
|
||||
echo "Generating video ..."
|
||||
source rec16.wav.wts > /dev/null 2>&1
|
||||
|
||||
# play the video
|
||||
echo "Playing ./rec16.wav.mp4 ..."
|
||||
ffplay -loglevel 0 -autoexit ./rec16.wav.mp4
|
||||
|
||||
echo "Done"
|
||||
exit 0
|
@ -1,182 +0,0 @@
|
||||
// Common Javascript functions used by the examples
|
||||
|
||||
function convertTypedArray(src, type) {
|
||||
var buffer = new ArrayBuffer(src.byteLength);
|
||||
var baseView = new src.constructor(buffer).set(src);
|
||||
return new type(buffer);
|
||||
}
|
||||
|
||||
var printTextarea = (function() {
|
||||
var element = document.getElementById('output');
|
||||
if (element) element.value = ''; // clear browser cache
|
||||
return function(text) {
|
||||
if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
|
||||
console.log(text);
|
||||
if (element) {
|
||||
element.value += text + "\n";
|
||||
element.scrollTop = element.scrollHeight; // focus on bottom
|
||||
}
|
||||
};
|
||||
})();
|
||||
|
||||
async function clearCache() {
|
||||
if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) {
|
||||
indexedDB.deleteDatabase(dbName);
|
||||
}
|
||||
}
|
||||
|
||||
// fetch a remote file from remote URL using the Fetch API
|
||||
async function fetchRemote(url, cbProgress, cbPrint) {
|
||||
cbPrint('fetchRemote: downloading with fetch()...');
|
||||
|
||||
const response = await fetch(
|
||||
url,
|
||||
{
|
||||
method: 'GET',
|
||||
headers: {
|
||||
'Content-Type': 'application/octet-stream',
|
||||
},
|
||||
}
|
||||
);
|
||||
|
||||
if (!response.ok) {
|
||||
cbPrint('fetchRemote: failed to fetch ' + url);
|
||||
return;
|
||||
}
|
||||
|
||||
const contentLength = response.headers.get('content-length');
|
||||
const total = parseInt(contentLength, 10);
|
||||
const reader = response.body.getReader();
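// read the body in chunks so that download progress can be reported via cbProgress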
|
||||
|
||||
var chunks = [];
|
||||
var receivedLength = 0;
|
||||
var progressLast = -1;
|
||||
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
|
||||
chunks.push(value);
|
||||
receivedLength += value.length;
|
||||
|
||||
if (contentLength) {
|
||||
cbProgress(receivedLength/total);
|
||||
|
||||
var progressCur = Math.round((receivedLength / total) * 10);
|
||||
if (progressCur != progressLast) {
|
||||
cbPrint('fetchRemote: fetching ' + 10*progressCur + '% ...');
|
||||
progressLast = progressCur;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var position = 0;
|
||||
var chunksAll = new Uint8Array(receivedLength);
|
||||
|
||||
for (var chunk of chunks) {
|
||||
chunksAll.set(chunk, position);
|
||||
position += chunk.length;
|
||||
}
|
||||
|
||||
return chunksAll;
|
||||
}
|
||||
|
||||
// load remote data
|
||||
// - check if the data is already in the IndexedDB
|
||||
// - if not, fetch it from the remote URL and store it in the IndexedDB
|
||||
function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
|
||||
// query the storage quota and print it
|
||||
navigator.storage.estimate().then(function (estimate) {
|
||||
cbPrint('loadRemote: storage quota: ' + estimate.quota + ' bytes');
|
||||
cbPrint('loadRemote: storage usage: ' + estimate.usage + ' bytes');
|
||||
});
|
||||
|
||||
// check if the data is already in the IndexedDB
|
||||
var rq = indexedDB.open(dbName, dbVersion);
|
||||
|
||||
rq.onupgradeneeded = function (event) {
|
||||
var db = event.target.result;
|
||||
if (db.version == 1) {
|
||||
var os = db.createObjectStore('models', { autoIncrement: false });
|
||||
cbPrint('loadRemote: created IndexedDB ' + db.name + ' version ' + db.version);
|
||||
} else {
|
||||
// clear the database
|
||||
var os = event.currentTarget.transaction.objectStore('models');
|
||||
os.clear();
|
||||
cbPrint('loadRemote: cleared IndexedDB ' + db.name + ' version ' + db.version);
|
||||
}
|
||||
};
|
||||
|
||||
rq.onsuccess = function (event) {
|
||||
var db = event.target.result;
|
||||
var tx = db.transaction(['models'], 'readonly');
|
||||
var os = tx.objectStore('models');
|
||||
var rq = os.get(url);
|
||||
|
||||
rq.onsuccess = function (event) {
|
||||
if (rq.result) {
|
||||
cbPrint('loadRemote: "' + url + '" is already in the IndexedDB');
|
||||
cbReady(dst, rq.result);
|
||||
} else {
|
||||
// data is not in the IndexedDB
|
||||
cbPrint('loadRemote: "' + url + '" is not in the IndexedDB');
|
||||
|
||||
// alert and ask the user to confirm
|
||||
if (!confirm(
|
||||
'You are about to download ' + size_mb + ' MB of data.\n' +
|
||||
'The model data will be cached in the browser for future use.\n\n' +
|
||||
'Press OK to continue.')) {
|
||||
cbCancel();
|
||||
return;
|
||||
}
|
||||
|
||||
fetchRemote(url, cbProgress, cbPrint).then(function (data) {
|
||||
if (data) {
|
||||
// store the data in the IndexedDB
|
||||
var rq = indexedDB.open(dbName, dbVersion);
|
||||
rq.onsuccess = function (event) {
|
||||
var db = event.target.result;
|
||||
var tx = db.transaction(['models'], 'readwrite');
|
||||
var os = tx.objectStore('models');
|
||||
var rq = os.put(data, url);
|
||||
|
||||
rq.onsuccess = function (event) {
|
||||
cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
|
||||
cbReady(dst, data);
|
||||
};
|
||||
|
||||
rq.onerror = function (event) {
|
||||
cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB');
|
||||
cbCancel();
|
||||
};
|
||||
};
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
rq.onerror = function (event) {
|
||||
cbPrint('loadRemote: failed to get data from the IndexedDB');
|
||||
cbCancel();
|
||||
};
|
||||
};
|
||||
|
||||
rq.onerror = function (event) {
|
||||
cbPrint('loadRemote: failed to open IndexedDB');
|
||||
cbCancel();
|
||||
};
|
||||
|
||||
rq.onblocked = function (event) {
|
||||
cbPrint('loadRemote: failed to open IndexedDB: blocked');
|
||||
cbCancel();
|
||||
};
|
||||
|
||||
rq.onabort = function (event) {
|
||||
cbPrint('loadRemote: failed to open IndexedDB: abort');
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -1,112 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Transcribe audio livestream by feeding ffmpeg output to whisper.cpp at regular intervals
|
||||
# Idea by @semiformal-net
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/185
|
||||
#
|
||||
|
||||
set -eo pipefail
|
||||
|
||||
url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8"
|
||||
fmt=aac # the audio format extension of the stream (TODO: auto detect)
|
||||
step_s=30
|
||||
model="base.en"
|
||||
|
||||
check_requirements()
|
||||
{
|
||||
if ! command -v ./main &>/dev/null; then
|
||||
echo "whisper.cpp main executable is required (make)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! command -v ffmpeg &>/dev/null; then
|
||||
echo "ffmpeg is required (https://ffmpeg.org)"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
check_requirements
|
||||
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
echo "Usage: $0 stream_url [step_s] [model]"
|
||||
echo ""
|
||||
echo " Example:"
|
||||
echo " $0 $url $step_s $model"
|
||||
echo ""
|
||||
echo "No url specified, using default: $url"
|
||||
else
|
||||
url="$1"
|
||||
fi
|
||||
|
||||
if [ -n "$2" ]; then
|
||||
step_s="$2"
|
||||
fi
|
||||
|
||||
if [ -n "$3" ]; then
|
||||
model="$3"
|
||||
fi
|
||||
|
||||
# Whisper models
|
||||
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
|
||||
|
||||
# list available models
|
||||
function list_models {
|
||||
printf "\n"
|
||||
printf " Available models:"
|
||||
for model in "${models[@]}"; do
|
||||
printf " $model"
|
||||
done
|
||||
printf "\n\n"
|
||||
}
|
||||
|
||||
if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
|
||||
printf "Invalid model: $model\n"
|
||||
list_models
|
||||
|
||||
exit 1
|
||||
fi
|
||||
|
||||
running=1
|
||||
|
||||
trap "running=0" SIGINT SIGTERM
|
||||
|
||||
printf "[+] Transcribing stream with model '$model', step_s $step_s (press Ctrl+C to stop):\n\n"
|
||||
|
||||
# continuous stream in native fmt (this file will grow forever!)
|
||||
ffmpeg -loglevel quiet -y -re -probesize 32 -i $url -c copy /tmp/whisper-live0.${fmt} &
|
||||
if [ $? -ne 0 ]; then
|
||||
printf "Error: ffmpeg failed to capture audio stream\n"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
printf "Buffering audio. Please wait...\n\n"
|
||||
sleep $(($step_s))
|
||||
|
||||
# do not stop script on error
|
||||
set +e
|
||||
|
||||
i=0
|
||||
SECONDS=0
|
||||
while [ $running -eq 1 ]; do
|
||||
# extract the next piece from the main file above and transcode to wav. -ss sets start time and nudges it by -0.5s to catch missing words (??)
|
||||
err=1
|
||||
while [ $err -ne 0 ]; do
|
||||
if [ $i -gt 0 ]; then
|
||||
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.${fmt} -y -ar 16000 -ac 1 -c:a pcm_s16le -ss $(($i*$step_s-1)).5 -t $step_s /tmp/whisper-live.wav 2> /tmp/whisper-live.err
|
||||
else
|
||||
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.${fmt} -y -ar 16000 -ac 1 -c:a pcm_s16le -ss $(($i*$step_s)) -t $step_s /tmp/whisper-live.wav 2> /tmp/whisper-live.err
|
||||
fi
|
||||
err=$(cat /tmp/whisper-live.err | wc -l)
|
||||
done
|
||||
|
||||
./main -t 8 -m ./models/ggml-base.en.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
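# pace the loop: wait until the next step_s seconds of the stream have been captured before transcribing them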
|
||||
|
||||
while [ $SECONDS -lt $((($i+1)*$step_s)) ]; do
|
||||
sleep 1
|
||||
done
|
||||
((i=i+1))
|
||||
done
|
||||
|
||||
killall -v ffmpeg
|
||||
killall -v main
|
@ -6,28 +6,29 @@ It can be used as a reference for using the `whisper.cpp` library in other proje
|
||||
```
|
||||
./main -h
|
||||
|
||||
usage: ./main [options] file0.wav file1.wav ...
|
||||
usage: ./bin/main [options] file0.wav file1.wav ...
|
||||
|
||||
-h, --help show this help message and exit
|
||||
-s SEED, --seed SEED RNG seed (default: -1)
|
||||
-t N, --threads N number of threads to use during computation (default: 4)
|
||||
-p N, --processors N number of processors to use during computation (default: 1)
|
||||
-ot N, --offset-t N time offset in milliseconds (default: 0)
|
||||
-on N, --offset-n N segment index offset (default: 0)
|
||||
-mc N, --max-context N maximum number of text context tokens to store (default: max)
|
||||
-ml N, --max-len N maximum segment length in characters (default: 0)
|
||||
-wt N, --word-thold N word timestamp probability threshold (default: 0.010000)
|
||||
-v, --verbose verbose output
|
||||
--translate translate from source language to english
|
||||
-otxt, --output-txt output result in a text file
|
||||
-ovtt, --output-vtt output result in a vtt file
|
||||
-osrt, --output-srt output result in a srt file
|
||||
-owts, --output-words output script for generating karaoke video
|
||||
-ps, --print_special print special tokens
|
||||
-pc, --print_colors print colors
|
||||
-nt, --no_timestamps do not print timestamps
|
||||
-l LANG, --language LANG spoken language (default: en)
|
||||
-m FNAME, --model FNAME model path (default: models/ggml-base.en.bin)
|
||||
-f FNAME, --file FNAME input WAV file path
|
||||
-h, --help show this help message and exit
|
||||
|
||||
options:
|
||||
-h, --help [default] show this help message and exit
|
||||
-t N, --threads N [4 ] number of threads to use during computation
|
||||
-p N, --processors N [1 ] number of processors to use during computation
|
||||
-ot N, --offset-t N [0 ] time offset in milliseconds
|
||||
-on N, --offset-n N [0 ] segment index offset
|
||||
-d N, --duration N [0 ] duration of audio to process in milliseconds
|
||||
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
|
||||
-ml N, --max-len N [0 ] maximum segment length in characters
|
||||
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
|
||||
-su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
|
||||
-tr, --translate [false ] translate from source language to english
|
||||
-otxt, --output-txt [false ] output result in a text file
|
||||
-ovtt, --output-vtt [false ] output result in a vtt file
|
||||
-osrt, --output-srt [false ] output result in a srt file
|
||||
-owts, --output-words [false ] output script for generating karaoke video
|
||||
-ps, --print-special [false ] print special tokens
|
||||
-pc, --print-colors [false ] print colors
|
||||
-nt, --no-timestamps [true ] do not print timestamps
|
||||
-l LANG, --language LANG [en ] spoken language
|
||||
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
|
||||
-f FNAME, --file FNAME [ ] input WAV file path
|
||||
```
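
For reference, a typical invocation might look like the following (the model path and sample file are examples and assume both exist locally):

```bash
# transcribe a 16 kHz WAV file and also save the transcript to a .txt file
./main -m models/ggml-base.en.bin -f samples/jfk.wav -otxt
```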
|
||||
|
@ -36,10 +36,6 @@ std::string to_timestamp(int64_t t, bool comma = false) {
|
||||
return std::string(buf);
|
||||
}
|
||||
|
||||
int timestamp_to_sample(int64_t t, int n_samples) {
|
||||
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
|
||||
}
|
||||
|
||||
// helper function to replace substrings
|
||||
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
for (size_t pos = 0; ; pos += replace.length()) {
|
||||
@ -52,31 +48,28 @@ void replace_all(std::string & s, const std::string & search, const std::string
|
||||
|
||||
// command-line parameters
|
||||
struct whisper_params {
|
||||
int32_t seed = -1; // RNG seed, not used currently
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t n_processors = 1;
|
||||
int32_t offset_t_ms = 0;
|
||||
int32_t offset_n = 0;
|
||||
int32_t duration_ms = 0;
|
||||
int32_t max_context = -1;
|
||||
int32_t max_len = 0;
|
||||
|
||||
float word_thold = 0.01f;
|
||||
|
||||
bool speed_up = false;
|
||||
bool translate = false;
|
||||
bool diarize = false;
|
||||
bool output_txt = false;
|
||||
bool output_vtt = false;
|
||||
bool output_srt = false;
|
||||
bool output_wts = false;
|
||||
bool print_special = false;
|
||||
bool print_colors = false;
|
||||
bool print_progress = false;
|
||||
bool no_timestamps = false;
|
||||
bool verbose = false;
|
||||
bool translate = false;
|
||||
bool output_txt = false;
|
||||
bool output_vtt = false;
|
||||
bool output_srt = false;
|
||||
bool output_wts = false;
|
||||
bool print_special_tokens = false;
|
||||
bool print_colors = false;
|
||||
bool no_timestamps = false;
|
||||
|
||||
std::string language = "en";
|
||||
std::string prompt = "";
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
std::string language = "en";
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
|
||||
std::vector<std::string> fname_inp = {};
|
||||
};
|
||||
@ -92,34 +85,55 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
if (arg == "-s" || arg == "--seed") {
|
||||
params.seed = std::stoi(argv[++i]);
|
||||
} else if (arg == "-t" || arg == "--threads") {
|
||||
params.n_threads = std::stoi(argv[++i]);
|
||||
} else if (arg == "-p" || arg == "--processors") {
|
||||
params.n_processors = std::stoi(argv[++i]);
|
||||
} else if (arg == "-ot" || arg == "--offset-t") {
|
||||
params.offset_t_ms = std::stoi(argv[++i]);
|
||||
} else if (arg == "-on" || arg == "--offset-n") {
|
||||
params.offset_n = std::stoi(argv[++i]);
|
||||
} else if (arg == "-mc" || arg == "--max-context") {
|
||||
params.max_context = std::stoi(argv[++i]);
|
||||
} else if (arg == "-ml" || arg == "--max-len") {
|
||||
params.max_len = std::stoi(argv[++i]);
|
||||
} else if (arg == "-wt" || arg == "--word-thold") {
|
||||
params.word_thold = std::stof(argv[++i]);
|
||||
} else if (arg == "-v" || arg == "--verbose") {
|
||||
params.verbose = true;
|
||||
} else if (arg == "--translate") {
|
||||
params.translate = true;
|
||||
} else if (arg == "-l" || arg == "--language") {
|
||||
params.language = argv[++i];
|
||||
if (whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
} else if (arg == "-otxt" || arg == "--output-txt") {
|
||||
params.output_txt = true;
|
||||
} else if (arg == "-ovtt" || arg == "--output-vtt") {
|
||||
params.output_vtt = true;
|
||||
} else if (arg == "-osrt" || arg == "--output-srt") {
|
||||
params.output_srt = true;
|
||||
} else if (arg == "-owts" || arg == "--output-words") {
|
||||
params.output_wts = true;
|
||||
} else if (arg == "-ps" || arg == "--print_special") {
|
||||
params.print_special_tokens = true;
|
||||
} else if (arg == "-pc" || arg == "--print_colors") {
|
||||
params.print_colors = true;
|
||||
} else if (arg == "-nt" || arg == "--no_timestamps") {
|
||||
params.no_timestamps = true;
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
params.model = argv[++i];
|
||||
} else if (arg == "-f" || arg == "--file") {
|
||||
params.fname_inp.push_back(argv[++i]);
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||
else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(argv[++i]); }
|
||||
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
||||
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
||||
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
||||
else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
|
||||
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
|
||||
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
||||
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
||||
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
|
||||
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
|
||||
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
||||
else if ( arg == "--prompt") { params.prompt = argv[++i]; }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_inp.push_back(argv[++i]); }
|
||||
else {
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
@ -134,42 +148,32 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
||||
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
|
||||
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
|
||||
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
|
||||
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
||||
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
||||
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
||||
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
||||
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
||||
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
||||
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
||||
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
||||
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
||||
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
||||
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stderr, " -p N, --processors N number of processors to use during computation (default: %d)\n", params.n_processors);
|
||||
fprintf(stderr, " -ot N, --offset-t N time offset in milliseconds (default: %d)\n", params.offset_t_ms);
|
||||
fprintf(stderr, " -on N, --offset-n N segment index offset (default: %d)\n", params.offset_n);
|
||||
fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
|
||||
fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
|
||||
fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
|
||||
fprintf(stderr, " -v, --verbose verbose output\n");
|
||||
fprintf(stderr, " --translate translate from source language to english\n");
|
||||
fprintf(stderr, " -otxt, --output-txt output result in a text file\n");
|
||||
fprintf(stderr, " -ovtt, --output-vtt output result in a vtt file\n");
|
||||
fprintf(stderr, " -osrt, --output-srt output result in a srt file\n");
|
||||
fprintf(stderr, " -owts, --output-words output script for generating karaoke video\n");
|
||||
fprintf(stderr, " -ps, --print_special print special tokens\n");
|
||||
fprintf(stderr, " -pc, --print_colors print colors\n");
|
||||
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
|
||||
fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME input WAV file path\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
struct whisper_print_user_data {
|
||||
const whisper_params * params;
|
||||
|
||||
const std::vector<std::vector<float>> * pcmf32s;
|
||||
};
|
||||
|
||||
void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
|
||||
const auto & params = *((whisper_print_user_data *) user_data)->params;
|
||||
const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
|
||||
const whisper_params & params = *(whisper_params *) user_data;
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
|
||||
@ -183,7 +187,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
|
||||
if (params.no_timestamps) {
|
||||
if (params.print_colors) {
|
||||
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
||||
if (params.print_special == false) {
|
||||
if (params.print_special_tokens == false) {
|
||||
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
||||
if (id >= whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
@ -206,37 +210,10 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
std::string speaker = "";
|
||||
|
||||
if (params.diarize && pcmf32s.size() == 2) {
|
||||
const int64_t n_samples = pcmf32s[0].size();
|
||||
|
||||
const int64_t is0 = timestamp_to_sample(t0, n_samples);
|
||||
const int64_t is1 = timestamp_to_sample(t1, n_samples);
|
||||
|
||||
double energy0 = 0.0f;
|
||||
double energy1 = 0.0f;
|
||||
|
||||
for (int64_t j = is0; j < is1; j++) {
|
||||
energy0 += fabs(pcmf32s[0][j]);
|
||||
energy1 += fabs(pcmf32s[1][j]);
|
||||
}
|
||||
|
||||
if (energy0 > 1.1*energy1) {
|
||||
speaker = "(speaker 0)";
|
||||
} else if (energy1 > 1.1*energy0) {
|
||||
speaker = "(speaker 1)";
|
||||
} else {
|
||||
speaker = "(speaker ?)";
|
||||
}
|
||||
|
||||
//printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
|
||||
}
|
||||
|
||||
if (params.print_colors) {
|
||||
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
|
||||
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
||||
if (params.print_special == false) {
|
||||
if (params.print_special_tokens == false) {
|
||||
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
||||
if (id >= whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
@ -248,13 +225,13 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
|
||||
|
||||
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
|
||||
|
||||
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
|
||||
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
|
||||
}
|
||||
printf("\n");
|
||||
} else {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
printf("[%s --> %s] %s%s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), speaker.c_str(), text);
|
||||
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -272,7 +249,7 @@ bool output_txt(struct whisper_context * ctx, const char * fname) {
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
fout << text << "\n";
|
||||
fout << text;
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -282,7 +259,7 @@ bool output_vtt(struct whisper_context * ctx, const char * fname) {
|
||||
std::ofstream fout(fname);
|
||||
if (!fout.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
|
||||
return false;
|
||||
return 9;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
@ -405,9 +382,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
ncnt += txt.size();
|
||||
}
|
||||
|
||||
::replace_all(txt_bg, "'", "\u2019");
|
||||
::replace_all(txt_bg, "'", "’");
|
||||
::replace_all(txt_bg, "\"", "\\\"");
|
||||
::replace_all(txt_fg, "'", "\u2019");
|
||||
::replace_all(txt_fg, "'", "’");
|
||||
::replace_all(txt_fg, "\"", "\\\"");
|
||||
}
|
||||
|
||||
@ -447,18 +424,16 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.seed < 0) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
if (params.fname_inp.empty()) {
|
||||
fprintf(stderr, "error: no input files specified\n");
|
||||
whisper_print_usage(argc, argv, params);
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// whisper init
|
||||
|
||||
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
||||
@ -468,79 +443,35 @@ int main(int argc, char ** argv) {
|
||||
return 3;
|
||||
}
|
||||
|
||||
// initial prompt
|
||||
std::vector<whisper_token> prompt_tokens;
|
||||
|
||||
if (params.prompt.size() > 0) {
|
||||
prompt_tokens.resize(1024);
|
||||
prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
|
||||
fprintf(stderr, "initial tokens: [ ");
|
||||
for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
|
||||
fprintf(stderr, "%d ", prompt_tokens[i]);
|
||||
}
|
||||
fprintf(stderr, "]\n");
|
||||
}
|
||||
|
||||
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
||||
const auto fname_inp = params.fname_inp[f];
|
||||
|
||||
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
||||
|
||||
// WAV input
|
||||
std::vector<float> pcmf32;
|
||||
{
|
||||
drwav wav;
|
||||
std::vector<uint8_t> wav_data; // used for pipe input from stdin
|
||||
|
||||
if (fname_inp == "-") {
|
||||
{
|
||||
uint8_t buf[1024];
|
||||
while (true)
|
||||
{
|
||||
const size_t n = fread(buf, 1, sizeof(buf), stdin);
|
||||
if (n == 0) {
|
||||
break;
|
||||
}
|
||||
wav_data.insert(wav_data.end(), buf, buf + n);
|
||||
}
|
||||
}
|
||||
|
||||
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), NULL) == false) {
|
||||
fprintf(stderr, "error: failed to open WAV file from stdin\n");
|
||||
return 4;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
|
||||
}
|
||||
else if (drwav_init_file(&wav, fname_inp.c_str(), NULL) == false) {
|
||||
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
|
||||
return 5;
|
||||
if (!drwav_init_file(&wav, fname_inp.c_str(), NULL)) {
|
||||
fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], fname_inp.c_str());
|
||||
whisper_print_usage(argc, argv, {});
|
||||
return 4;
|
||||
}
|
||||
|
||||
if (wav.channels != 1 && wav.channels != 2) {
|
||||
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
|
||||
return 6;
|
||||
}
|
||||
|
||||
if (params.diarize && wav.channels != 2 && params.no_timestamps == false) {
|
||||
fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", argv[0], fname_inp.c_str());
|
||||
return 6;
|
||||
return 5;
|
||||
}
|
||||
|
||||
if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
|
||||
fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
|
||||
return 8;
|
||||
return 6;
|
||||
}
|
||||
|
||||
if (wav.bitsPerSample != 16) {
|
||||
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
|
||||
return 9;
|
||||
return 7;
|
||||
}
|
||||
|
||||
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
|
||||
int n = wav.totalPCMFrameCount;
|
||||
|
||||
std::vector<int16_t> pcm16;
|
||||
pcm16.resize(n*wav.channels);
|
||||
@ -558,18 +489,6 @@ int main(int argc, char ** argv) {
|
||||
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
|
||||
}
|
||||
}
|
||||
|
||||
if (params.diarize) {
|
||||
// convert to stereo, float
|
||||
pcmf32s.resize(2);
|
||||
|
||||
pcmf32s[0].resize(n);
|
||||
pcmf32s[1].resize(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
|
||||
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// print system information
|
||||
@ -599,54 +518,34 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
|
||||
// run the inference
|
||||
{
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_progress = params.print_progress;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.translate = params.translate;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
||||
wparams.offset_ms = params.offset_t_ms;
|
||||
wparams.duration_ms = params.duration_ms;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_progress = false;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.print_special_tokens = params.print_special_tokens;
|
||||
wparams.translate = params.translate;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
||||
wparams.offset_ms = params.offset_t_ms;
|
||||
|
||||
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
||||
wparams.thold_pt = params.word_thold;
|
||||
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
||||
|
||||
wparams.speed_up = params.speed_up;
|
||||
|
||||
wparams.prompt_tokens = prompt_tokens.size() == 0 ? nullptr : prompt_tokens.data();
|
||||
wparams.prompt_n_tokens = prompt_tokens.size() == 0 ? 0 : prompt_tokens.size();
|
||||
|
||||
whisper_print_user_data user_data = { ¶ms, &pcmf32s };
|
||||
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
||||
wparams.thold_pt = params.word_thold;
|
||||
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
||||
|
||||
// this callback is called on each new segment
|
||||
if (!wparams.print_realtime) {
|
||||
wparams.new_segment_callback = whisper_print_segment_callback;
|
||||
wparams.new_segment_callback_user_data = &user_data;
|
||||
}
|
||||
|
||||
// example for abort mechanism
|
||||
// in this example, we do not abort the processing, but we could if the flag is set to true
|
||||
// the callback is called before every encoder run - if it returns false, the processing is aborted
|
||||
{
|
||||
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
||||
|
||||
wparams.encoder_begin_callback = [](struct whisper_context * ctx, void * user_data) {
|
||||
bool is_aborted = *(bool*)user_data;
|
||||
return !is_aborted;
|
||||
};
|
||||
wparams.encoder_begin_callback_user_data = &is_aborted;
|
||||
wparams.new_segment_callback_user_data = ¶ms;
|
||||
}
|
||||
|
||||
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
|
||||
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
||||
return 10;
|
||||
return 8;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,47 +0,0 @@
|
||||
#
|
||||
# libstream
|
||||
#
|
||||
|
||||
set(TARGET libstream)
|
||||
|
||||
add_executable(${TARGET}
|
||||
emscripten.cpp
|
||||
)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
whisper
|
||||
)
|
||||
|
||||
unset(EXTRA_FLAGS)
|
||||
|
||||
if (WHISPER_WASM_SINGLE_FILE)
|
||||
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
|
||||
message(STATUS "Embedding WASM inside stream.js")
|
||||
|
||||
add_custom_command(
|
||||
TARGET ${TARGET} POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
${CMAKE_BINARY_DIR}/bin/libstream.js
|
||||
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/stream.wasm/stream.js
|
||||
)
|
||||
endif()
|
||||
|
||||
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
||||
--bind \
|
||||
-s USE_PTHREADS=1 \
|
||||
-s PTHREAD_POOL_SIZE=8 \
|
||||
-s INITIAL_MEMORY=1024MB \
|
||||
-s TOTAL_MEMORY=1024MB \
|
||||
-s FORCE_FILESYSTEM=1 \
|
||||
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
||||
${EXTRA_FLAGS} \
|
||||
")
|
||||
|
||||
#
|
||||
# stream.wasm
|
||||
#
|
||||
|
||||
set(TARGET stream.wasm)
|
||||
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
|
@ -1,20 +0,0 @@
# stream.wasm

Real-time transcription in the browser using WebAssembly

Online demo: https://whisper.ggerganov.com/stream/

## Build instructions

```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j

# copy the produced page to your HTTP path
cp bin/stream.wasm/* /path/to/html/
cp bin/libstream.worker.js /path/to/html/
```
@ -1,213 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#include "whisper.h"
|
||||
|
||||
#include <emscripten.h>
|
||||
#include <emscripten/bind.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cmath>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
constexpr int N_THREAD = 8;
|
||||
|
||||
std::vector<struct whisper_context *> g_contexts(4, nullptr);
|
||||
|
||||
std::mutex g_mutex;
|
||||
std::thread g_worker;
|
||||
|
||||
std::atomic<bool> g_running(false);
|
||||
|
||||
std::string g_status = "";
|
||||
std::string g_status_forced = "";
|
||||
std::string g_transcribed = "";
|
||||
|
||||
std::vector<float> g_pcmf32;
|
||||
|
||||
void stream_set_status(const std::string & status) {
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
g_status = status;
|
||||
}
|
||||
|
||||
void stream_main(size_t index) {
|
||||
stream_set_status("loading data ...");
|
||||
|
||||
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
|
||||
wparams.offset_ms = 0;
|
||||
wparams.translate = false;
|
||||
wparams.no_context = true;
|
||||
wparams.single_segment = true;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_progress = false;
|
||||
wparams.print_timestamps = true;
|
||||
wparams.print_special = false;
|
||||
|
||||
wparams.max_tokens = 32;
|
||||
wparams.audio_ctx = 768; // partial encoder context for better performance
|
||||
|
||||
wparams.language = "en";
|
||||
|
||||
printf("stream: using %d threads\n", wparams.n_threads);
|
||||
|
||||
std::vector<float> pcmf32;
|
||||
|
||||
// whisper context
|
||||
auto & ctx = g_contexts[index];
|
||||
|
||||
// 5 seconds interval
|
||||
const int64_t window_samples = 5*WHISPER_SAMPLE_RATE;
|
||||
|
||||
while (g_running) {
|
||||
stream_set_status("waiting for audio ...");
|
||||
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(g_mutex);
|
||||
|
||||
if (g_pcmf32.size() < 1024) {
|
||||
lock.unlock();
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
||||
|
||||
continue;
|
||||
}
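// keep only the most recent window_samples (~5 seconds) of captured audio and clear the shared buffer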
|
||||
|
||||
pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
|
||||
g_pcmf32.clear();
|
||||
}
|
||||
|
||||
{
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
stream_set_status("running whisper ...");
|
||||
|
||||
int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
|
||||
if (ret != 0) {
|
||||
printf("whisper_full() failed: %d\n", ret);
|
||||
break;
|
||||
}
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
printf("stream: whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
|
||||
}
|
||||
|
||||
{
|
||||
std::string text_heard;
|
||||
|
||||
{
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = n_segments - 1; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
printf("transcribed: %s\n", text);
|
||||
|
||||
text_heard += text;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
g_transcribed = text_heard;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (index < g_contexts.size()) {
|
||||
whisper_free(g_contexts[index]);
|
||||
g_contexts[index] = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
EMSCRIPTEN_BINDINGS(stream) {
|
||||
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
||||
for (size_t i = 0; i < g_contexts.size(); ++i) {
|
||||
if (g_contexts[i] == nullptr) {
|
||||
g_contexts[i] = whisper_init(path_model.c_str());
|
||||
if (g_contexts[i] != nullptr) {
|
||||
g_running = true;
|
||||
if (g_worker.joinable()) {
|
||||
g_worker.join();
|
||||
}
|
||||
g_worker = std::thread([i]() {
|
||||
stream_main(i);
|
||||
});
|
||||
|
||||
return i + 1;
|
||||
} else {
|
||||
return (size_t) 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (size_t) 0;
|
||||
}));
|
||||
|
||||
emscripten::function("free", emscripten::optional_override([](size_t index) {
|
||||
if (g_running) {
|
||||
g_running = false;
|
||||
}
|
||||
}));
|
||||
|
||||
emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
|
||||
--index;
|
||||
|
||||
if (index >= g_contexts.size()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (g_contexts[index] == nullptr) {
|
||||
return -2;
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
const int n = audio["length"].as<int>();
|
||||
|
||||
emscripten::val heap = emscripten::val::module_property("HEAPU8");
|
||||
emscripten::val memory = heap["buffer"];
|
||||
|
||||
g_pcmf32.resize(n);
|
||||
|
||||
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
|
||||
memoryView.call<void>("set", audio);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}));
|
||||
|
||||
emscripten::function("get_transcribed", emscripten::optional_override([]() {
|
||||
std::string transcribed;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
transcribed = std::move(g_transcribed);
|
||||
}
|
||||
|
||||
return transcribed;
|
||||
}));
|
||||
|
||||
emscripten::function("get_status", emscripten::optional_override([]() {
|
||||
std::string status;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
status = g_status_forced.empty() ? g_status : g_status_forced;
|
||||
}
|
||||
|
||||
return status;
|
||||
}));
|
||||
|
||||
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
g_status_forced = status;
|
||||
}
|
||||
}));
|
||||
}
|
@ -1,386 +0,0 @@
|
||||
<!doctype html>
|
||||
<html lang="en-us">
|
||||
<head>
|
||||
<title>stream : Real-time Whisper transcription in WebAssembly</title>
|
||||
|
||||
<style>
|
||||
#output {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
margin: 0 auto;
|
||||
margin-top: 10px;
|
||||
border-left: 0px;
|
||||
border-right: 0px;
|
||||
padding-left: 0px;
|
||||
padding-right: 0px;
|
||||
display: block;
|
||||
background-color: black;
|
||||
color: white;
|
||||
font-size: 10px;
|
||||
font-family: 'Lucida Console', Monaco, monospace;
|
||||
outline: none;
|
||||
white-space: pre;
|
||||
overflow-wrap: normal;
|
||||
overflow-x: scroll;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="main-container">
|
||||
<b>stream : Real-time Whisper transcription in WebAssembly</b>
|
||||
|
||||
<br><br>
|
||||
|
||||
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">GitHub</a>.
|
||||
|
||||
<br><br>
|
||||
|
||||
<hr>
|
||||
|
||||
Select the model you would like to use, click the "Start" button and start speaking
|
||||
|
||||
<br><br>
|
||||
|
||||
<div id="model-whisper">
|
||||
Whisper model: <span id="model-whisper-status"></span>
|
||||
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
||||
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
||||
<span id="fetch-whisper-progress"></span>
|
||||
|
||||
<!--
|
||||
<input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
||||
-->
|
||||
</div>
|
||||
|
||||
<br>
|
||||
|
||||
<div id="input">
|
||||
<button id="start" onclick="onStart()" disabled>Start</button>
|
||||
<button id="stop" onclick="onStop()" disabled>Stop</button>
|
||||
<button id="clear" onclick="clearCache()">Clear Cache</button>
|
||||
</div>
|
||||
|
||||
<br>
|
||||
|
||||
<div id="state">
|
||||
Status: <b><span id="state-status">not started</span></b>
|
||||
|
||||
<pre id="state-transcribed">[The transcribed text will be displayed here]</pre>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
|
||||
Debug output:
|
||||
<textarea id="output" rows="20"></textarea>
|
||||
|
||||
<br>
|
||||
|
||||
<b>Troubleshooting</b>
|
||||
|
||||
<br><br>
|
||||
|
||||
The page does some heavy computations, so make sure:
|
||||
|
||||
<ul>
|
||||
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
|
||||
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
|
||||
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
|
||||
</ul>
|
||||
|
||||
<div class="cell-version">
|
||||
<span>
|
||||
|
|
||||
Build time: <span class="nav-link">@GIT_DATE@</span> |
|
||||
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
|
||||
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
|
||||
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">Source Code</a> |
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script type="text/javascript" src="helpers.js"></script>
|
||||
<script type='text/javascript'>
|
||||
// web audio context
|
||||
var context = null;
|
||||
|
||||
// audio data
|
||||
var audio = null;
|
||||
var audio0 = null;
|
||||
|
||||
// the stream instance
|
||||
var instance = null;
|
||||
|
||||
// model name
|
||||
var model_whisper = null;
|
||||
|
||||
var Module = {
|
||||
print: printTextarea,
|
||||
printErr: printTextarea,
|
||||
setStatus: function(text) {
|
||||
printTextarea('js: ' + text);
|
||||
},
|
||||
monitorRunDependencies: function(left) {
|
||||
},
|
||||
preRun: function() {
|
||||
printTextarea('js: Preparing ...');
|
||||
},
|
||||
postRun: function() {
|
||||
printTextarea('js: Initialized successfully!');
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// fetch models
|
||||
//
|
||||
|
||||
let dbVersion = 1
|
||||
let dbName = 'whisper.ggerganov.com';
|
||||
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
|
||||
|
||||
function storeFS(fname, buf) {
|
||||
// write to WASM file using FS_createDataFile
|
||||
// if the file exists, delete it
|
||||
try {
|
||||
Module.FS_unlink(fname);
|
||||
} catch (e) {
|
||||
// ignore
|
||||
}
|
||||
|
||||
Module.FS_createDataFile("/", fname, buf, true, true);
|
||||
|
||||
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
|
||||
|
||||
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
|
||||
|
||||
if (model_whisper != null) {
|
||||
document.getElementById('start').disabled = false;
|
||||
document.getElementById('stop' ).disabled = true;
|
||||
}
|
||||
}
|
||||
|
||||
function loadWhisper(model) {
|
||||
let urls = {
|
||||
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
|
||||
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
|
||||
};
|
||||
|
||||
let sizes = {
|
||||
'tiny.en': 75,
|
||||
'base.en': 142,
|
||||
};
|
||||
|
||||
let url = urls[model];
|
||||
let dst = 'whisper.bin';
|
||||
let size_mb = sizes[model];
|
||||
|
||||
model_whisper = model;
|
||||
|
||||
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
||||
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
|
||||
|
||||
cbProgress = function(p) {
|
||||
let el = document.getElementById('fetch-whisper-progress');
|
||||
el.innerHTML = Math.round(100*p) + '%';
|
||||
};
|
||||
|
||||
cbCancel = function() {
|
||||
var el;
|
||||
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
|
||||
};
|
||||
|
||||
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
|
||||
}
|
||||
|
||||
//
|
||||
// microphone
|
||||
//
|
||||
|
||||
const kSampleRate = 16000;
|
||||
const kRestartRecording_s = 120;
|
||||
const kIntervalAudio_ms = 5000; // pass the recorded audio to the C++ instance at this rate
|
||||
|
||||
var mediaRecorder = null;
|
||||
var doRecording = false;
|
||||
var startTime = 0;
|
||||
|
||||
window.AudioContext = window.AudioContext || window.webkitAudioContext;
|
||||
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
|
||||
|
||||
function stopRecording() {
|
||||
Module.set_status("paused");
|
||||
doRecording = false;
|
||||
audio0 = null;
|
||||
audio = null;
|
||||
context = null;
|
||||
}
|
||||
|
||||
function startRecording() {
|
||||
if (!context) {
|
||||
context = new AudioContext({
|
||||
sampleRate: kSampleRate,
|
||||
channelCount: 1,
|
||||
echoCancellation: false,
|
||||
autoGainControl: true,
|
||||
noiseSuppression: true,
|
||||
});
|
||||
}
|
||||
|
||||
Module.set_status("");
|
||||
|
||||
document.getElementById('start').disabled = true;
|
||||
document.getElementById('stop').disabled = false;
|
||||
|
||||
doRecording = true;
|
||||
startTime = Date.now();
|
||||
|
||||
var chunks = [];
|
||||
var stream = null;
|
||||
|
||||
navigator.mediaDevices.getUserMedia({audio: true, video: false})
|
||||
.then(function(s) {
|
||||
stream = s;
|
||||
mediaRecorder = new MediaRecorder(stream);
|
||||
mediaRecorder.ondataavailable = function(e) {
|
||||
chunks.push(e.data);
|
||||
|
||||
var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
|
||||
var reader = new FileReader();
|
||||
|
||||
reader.onload = function(event) {
|
||||
var buf = new Uint8Array(reader.result);
|
||||
|
||||
if (!context) {
|
||||
return;
|
||||
}
|
||||
context.decodeAudioData(buf.buffer, function(audioBuffer) {
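// render the decoded audio through an OfflineAudioContext to obtain raw Float32 PCM samples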
|
||||
var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
|
||||
var source = offlineContext.createBufferSource();
|
||||
source.buffer = audioBuffer;
|
||||
source.connect(offlineContext.destination);
|
||||
source.start(0);
|
||||
|
||||
offlineContext.startRendering().then(function(renderedBuffer) {
|
||||
audio = renderedBuffer.getChannelData(0);
|
||||
|
||||
//printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
|
||||
|
||||
var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
|
||||
if (audio0 != null) {
|
||||
audioAll.set(audio0, 0);
|
||||
}
|
||||
audioAll.set(audio, audio0 == null ? 0 : audio0.length);
|
||||
|
||||
if (instance) {
|
||||
Module.set_audio(instance, audioAll);
|
||||
}
|
||||
});
|
||||
}, function(e) {
|
||||
audio = null;
|
||||
});
|
||||
}
|
||||
|
||||
reader.readAsArrayBuffer(blob);
|
||||
};
|
||||
|
||||
mediaRecorder.onstop = function(e) {
|
||||
if (doRecording) {
|
||||
setTimeout(function() {
|
||||
startRecording();
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
mediaRecorder.start(kIntervalAudio_ms);
|
||||
})
|
||||
.catch(function(err) {
|
||||
printTextarea('js: error getting audio stream: ' + err);
|
||||
});
|
||||
|
||||
var interval = setInterval(function() {
|
||||
if (!doRecording) {
|
||||
clearInterval(interval);
|
||||
mediaRecorder.stop();
|
||||
stream.getTracks().forEach(function(track) {
|
||||
track.stop();
|
||||
});
|
||||
|
||||
document.getElementById('start').disabled = false;
|
||||
document.getElementById('stop').disabled = true;
|
||||
|
||||
mediaRecorder = null;
|
||||
}
|
||||
|
||||
// if audio length is more than kRestartRecording_s seconds, restart recording
|
||||
if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
|
||||
if (doRecording) {
|
||||
//printTextarea('js: restarting recording');
|
||||
|
||||
clearInterval(interval);
|
||||
audio0 = audio;
|
||||
audio = null;
|
||||
mediaRecorder.stop();
|
||||
stream.getTracks().forEach(function(track) {
|
||||
track.stop();
|
||||
});
|
||||
}
|
||||
}
|
||||
}, 100);
|
||||
}
|
||||
|
||||
//
|
||||
// main
|
||||
//
|
||||
|
||||
var nLines = 0;
|
||||
var intervalUpdate = null;
|
||||
var transcribedAll = '';
|
||||
|
||||
function onStart() {
|
||||
if (!instance) {
|
||||
instance = Module.init('whisper.bin');
|
||||
|
||||
if (instance) {
|
||||
printTextarea("js: whisper initialized, instance: " + instance);
|
||||
}
|
||||
}
|
||||
|
||||
if (!instance) {
|
||||
printTextarea("js: failed to initialize whisper");
|
||||
return;
|
||||
}
|
||||
|
||||
startRecording();
|
||||
|
||||
intervalUpdate = setInterval(function() {
|
||||
var transcribed = Module.get_transcribed();
|
||||
|
||||
if (transcribed != null && transcribed.length > 1) {
|
||||
transcribedAll += transcribed + '<br>';
|
||||
nLines++;
|
||||
|
||||
// if more than 10 lines, remove the first line
|
||||
if (nLines > 10) {
|
||||
var i = transcribedAll.indexOf('<br>');
|
||||
if (i > 0) {
|
||||
transcribedAll = transcribedAll.substring(i + 4);
|
||||
nLines--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
document.getElementById('state-status').innerHTML = Module.get_status();
|
||||
document.getElementById('state-transcribed').innerHTML = transcribedAll;
|
||||
}, 100);
|
||||
}
|
||||
|
||||
function onStop() {
|
||||
stopRecording();
|
||||
}
|
||||
|
||||
</script>
|
||||
<script type="text/javascript" src="stream.js"></script>
|
||||
</body>
|
||||
</html>
|
@ -10,23 +10,6 @@ More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/i

https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4

## Sliding window mode with VAD

Setting the `--step` argument to `0` enables the sliding window mode:

```bash
./stream -m ./models/ggml-small.en.bin -t 6 --step 0 --length 30000 -vth 0.6
```

In this mode, the tool transcribes only after some speech activity is detected. A very basic VAD detector is used,
but in theory a more sophisticated approach can be added. The `-vth` argument determines the VAD threshold - higher
values make it detect silence more often. It's best to tune it to the specific use case, but a value around `0.6`
should be OK in general. When silence is detected, the tool transcribes the last `--length` milliseconds of audio
and outputs a transcription block that is suitable for parsing.
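
For reference, here is a minimal sketch of the energy-based decision that this mode relies on. It mirrors the logic
of the tool's C++ `vad_simple()` helper (shown further down in this diff), expressed in JavaScript purely for
illustration - the function and argument names are made up, and the high-pass pre-filter is omitted:

```javascript
// Returns true when the last `lastMs` of audio are quiet relative to the whole buffer,
// i.e. a pause after speech - the moment at which a transcription is triggered.
// `vadThold` plays the role of the -vth argument.
function vadEnergySketch(pcmf32, sampleRate, lastMs, vadThold) {
    const nSamples     = pcmf32.length;
    const nSamplesLast = Math.floor(sampleRate * lastMs / 1000);

    if (nSamplesLast >= nSamples) {
        return false; // not enough audio yet
    }

    let energyAll  = 0.0;
    let energyLast = 0.0;

    for (let i = 0; i < nSamples; i++) {
        energyAll += Math.abs(pcmf32[i]);
        if (i >= nSamples - nSamplesLast) {
            energyLast += Math.abs(pcmf32[i]);
        }
    }

    energyAll  /= nSamples;
    energyLast /= nSamplesLast;

    // a higher threshold makes the recent window more likely to be classified as silence
    return energyLast <= vadThold * energyAll;
}
```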

## Building

The `stream` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:

```bash
@ -38,7 +21,3 @@ brew install sdl2

make stream
```

## Web version

This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)

@ -1,10 +1,14 @@
// Real-time speech recognition of input from a microphone
//
// A very quick-n-dirty implementation serving mainly as a proof of concept.
//

#include "whisper.h"

// third-party utilities
// use your favorite implementations
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"

#include <SDL.h>
#include <SDL_audio.h>

@ -14,7 +18,6 @@
#include <thread>
#include <vector>
#include <fstream>
#include <mutex>

// 500 -> 00:05.000
// 6000 -> 01:00.000

@ -32,22 +35,17 @@ std::string to_timestamp(int64_t t) {
|
||||
|
||||
// command-line parameters
|
||||
struct whisper_params {
|
||||
int32_t seed = -1; // RNG seed, not used currently
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t step_ms = 3000;
|
||||
int32_t length_ms = 10000;
|
||||
int32_t keep_ms = 200;
|
||||
int32_t capture_id = -1;
|
||||
int32_t max_tokens = 32;
|
||||
int32_t audio_ctx = 0;
|
||||
|
||||
float vad_thold = 0.6f;
|
||||
float freq_thold = 100.0f;
|
||||
|
||||
bool speed_up = false;
|
||||
bool translate = false;
|
||||
bool print_special = false;
|
||||
bool no_context = true;
|
||||
bool no_timestamps = false;
|
||||
bool verbose = false;
|
||||
bool translate = false;
|
||||
bool no_context = true;
|
||||
bool print_special_tokens = false;
|
||||
bool no_timestamps = true;
|
||||
|
||||
std::string language = "en";
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
@ -60,27 +58,41 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
if (arg == "-s" || arg == "--seed") {
|
||||
params.seed = std::stoi(argv[++i]);
|
||||
} else if (arg == "-t" || arg == "--threads") {
|
||||
params.n_threads = std::stoi(argv[++i]);
|
||||
} else if (arg == "--step") {
|
||||
params.step_ms = std::stoi(argv[++i]);
|
||||
} else if (arg == "--length") {
|
||||
params.length_ms = std::stoi(argv[++i]);
|
||||
} else if (arg == "-c" || arg == "--capture") {
|
||||
params.capture_id = std::stoi(argv[++i]);
|
||||
} else if (arg == "-v" || arg == "--verbose") {
|
||||
params.verbose = true;
|
||||
} else if (arg == "--translate") {
|
||||
params.translate = true;
|
||||
} else if (arg == "-kc" || arg == "--keep-context") {
|
||||
params.no_context = false;
|
||||
} else if (arg == "-l" || arg == "--language") {
|
||||
params.language = argv[++i];
|
||||
if (whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
} else if (arg == "-ps" || arg == "--print_special") {
|
||||
params.print_special_tokens = true;
|
||||
} else if (arg == "-nt" || arg == "--no_timestamps") {
|
||||
params.no_timestamps = true;
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
params.model = argv[++i];
|
||||
} else if (arg == "-f" || arg == "--file") {
|
||||
params.fname_out = argv[++i];
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||
else if ( arg == "--step") { params.step_ms = std::stoi(argv[++i]); }
|
||||
else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); }
|
||||
else if ( arg == "--keep") { params.keep_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
|
||||
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
||||
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
||||
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
|
||||
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
|
||||
else {
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
@ -95,23 +107,20 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " --step N [%-7d] audio step size in milliseconds\n", params.step_ms);
|
||||
fprintf(stderr, " --length N [%-7d] audio length in milliseconds\n", params.length_ms);
|
||||
fprintf(stderr, " --keep N [%-7d] audio to keep from previous step in ms\n", params.keep_ms);
|
||||
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
|
||||
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
|
||||
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
||||
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
||||
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms);
|
||||
fprintf(stderr, " --length N audio length in milliseconds (default: %d)\n", params.length_ms);
|
||||
fprintf(stderr, " -c ID, --capture ID capture device ID (default: -1)\n");
|
||||
fprintf(stderr, " -v, --verbose verbose output\n");
|
||||
fprintf(stderr, " --translate translate from source language to english\n");
|
||||
fprintf(stderr, " -kc, --keep-context keep text context from earlier audio (default: false)\n");
|
||||
fprintf(stderr, " -ps, --print_special print special tokens\n");
|
||||
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
|
||||
fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME text output file name (default: no output to file)\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
@ -119,300 +128,70 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
||||
// SDL Audio capture
|
||||
//
|
||||
|
||||
class audio_async {
|
||||
public:
|
||||
audio_async(int len_ms);
|
||||
~audio_async();
|
||||
SDL_AudioDeviceID g_dev_id_in = 0;
|
||||
|
||||
bool init(int capture_id, int sample_rate);
|
||||
|
||||
// start capturing audio via the provided SDL callback
|
||||
// keep last len_ms seconds of audio in a circular buffer
|
||||
bool resume();
|
||||
bool pause();
|
||||
bool clear();
|
||||
|
||||
// callback to be called by SDL
|
||||
void callback(uint8_t * stream, int len);
|
||||
|
||||
// get audio data from the circular buffer
|
||||
void get(int ms, std::vector<float> & audio);
|
||||
|
||||
private:
|
||||
SDL_AudioDeviceID m_dev_id_in = 0;
|
||||
|
||||
int m_len_ms = 0;
|
||||
int m_sample_rate = 0;
|
||||
|
||||
bool m_running = false;
|
||||
std::mutex m_mutex;
|
||||
|
||||
std::vector<float> m_audio;
|
||||
std::vector<float> m_audio_new;
|
||||
size_t m_audio_pos = 0;
|
||||
size_t m_audio_len = 0;
|
||||
};
|
||||
|
||||
audio_async::audio_async(int len_ms) {
|
||||
m_len_ms = len_ms;
|
||||
}
|
||||
|
||||
audio_async::~audio_async() {
|
||||
if (m_dev_id_in) {
|
||||
SDL_CloseAudioDevice(m_dev_id_in);
|
||||
}
|
||||
}
|
||||
|
||||
bool audio_async::init(int capture_id, int sample_rate) {
|
||||
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
|
||||
|
||||
if (SDL_Init(SDL_INIT_AUDIO) < 0) {
|
||||
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
|
||||
bool audio_sdl_init(const int capture_id) {
|
||||
if (g_dev_id_in) {
|
||||
fprintf(stderr, "%s: already initialized\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
|
||||
if (g_dev_id_in == 0) {
|
||||
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
|
||||
|
||||
{
|
||||
int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
|
||||
fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
|
||||
for (int i = 0; i < nDevices; i++) {
|
||||
fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
|
||||
if (SDL_Init(SDL_INIT_AUDIO) < 0) {
|
||||
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
|
||||
return (1);
|
||||
}
|
||||
|
||||
SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
|
||||
|
||||
{
|
||||
int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
|
||||
fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
|
||||
for (int i = 0; i < nDevices; i++) {
|
||||
fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SDL_AudioSpec capture_spec_requested;
|
||||
SDL_AudioSpec capture_spec_obtained;
|
||||
if (g_dev_id_in == 0) {
|
||||
SDL_AudioSpec capture_spec_requested;
|
||||
SDL_AudioSpec capture_spec_obtained;
|
||||
|
||||
SDL_zero(capture_spec_requested);
|
||||
SDL_zero(capture_spec_obtained);
|
||||
SDL_zero(capture_spec_requested);
|
||||
SDL_zero(capture_spec_obtained);
|
||||
|
||||
capture_spec_requested.freq = sample_rate;
|
||||
capture_spec_requested.format = AUDIO_F32;
|
||||
capture_spec_requested.channels = 1;
|
||||
capture_spec_requested.samples = 1024;
|
||||
capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
|
||||
audio_async * audio = (audio_async *) userdata;
|
||||
audio->callback(stream, len);
|
||||
};
|
||||
capture_spec_requested.userdata = this;
|
||||
capture_spec_requested.freq = WHISPER_SAMPLE_RATE;
|
||||
capture_spec_requested.format = AUDIO_F32;
|
||||
capture_spec_requested.channels = 1;
|
||||
capture_spec_requested.samples = 1024;
|
||||
|
||||
if (capture_id >= 0) {
|
||||
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
|
||||
m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
} else {
|
||||
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
|
||||
m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
}
|
||||
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
|
||||
m_dev_id_in = 0;
|
||||
|
||||
return false;
|
||||
} else {
|
||||
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
|
||||
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
|
||||
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format,
|
||||
capture_spec_requested.format);
|
||||
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels,
|
||||
capture_spec_requested.channels);
|
||||
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
|
||||
}
|
||||
|
||||
m_sample_rate = capture_spec_obtained.freq;
|
||||
|
||||
m_audio.resize((m_sample_rate*m_len_ms)/1000);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool audio_async::resume() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to resume!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (m_running) {
|
||||
fprintf(stderr, "%s: already running!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_PauseAudioDevice(m_dev_id_in, 0);
|
||||
|
||||
m_running = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool audio_async::pause() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to pause!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: already paused!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_PauseAudioDevice(m_dev_id_in, 1);
|
||||
|
||||
m_running = false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool audio_async::clear() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to clear!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: not running!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
m_audio_pos = 0;
|
||||
m_audio_len = 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// callback to be called by SDL
|
||||
void audio_async::callback(uint8_t * stream, int len) {
|
||||
if (!m_running) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t n_samples = len / sizeof(float);
|
||||
|
||||
m_audio_new.resize(n_samples);
|
||||
memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
|
||||
|
||||
//fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
if (m_audio_pos + n_samples > m_audio.size()) {
|
||||
const size_t n0 = m_audio.size() - m_audio_pos;
|
||||
|
||||
memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
|
||||
memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
|
||||
|
||||
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
|
||||
m_audio_len = m_audio.size();
|
||||
if (capture_id >= 0) {
|
||||
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
|
||||
g_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
} else {
|
||||
memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
|
||||
|
||||
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
|
||||
m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
|
||||
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
|
||||
g_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void audio_async::get(int ms, std::vector<float> & result) {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: not running!\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
result.clear();
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
if (ms <= 0) {
|
||||
ms = m_len_ms;
|
||||
}
|
||||
|
||||
size_t n_samples = (m_sample_rate * ms) / 1000;
|
||||
if (n_samples > m_audio_len) {
|
||||
n_samples = m_audio_len;
|
||||
}
|
||||
|
||||
result.resize(n_samples);
|
||||
|
||||
int s0 = m_audio_pos - n_samples;
|
||||
if (s0 < 0) {
|
||||
s0 += m_audio.size();
|
||||
}
|
||||
|
||||
if (s0 + n_samples > m_audio.size()) {
|
||||
const size_t n0 = m_audio.size() - s0;
|
||||
|
||||
memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
|
||||
memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
|
||||
if (!g_dev_id_in) {
|
||||
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
|
||||
g_dev_id_in = 0;
|
||||
} else {
|
||||
memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
|
||||
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, g_dev_id_in);
|
||||
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
|
||||
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format, capture_spec_requested.format);
|
||||
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels, capture_spec_requested.channels);
|
||||
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
///////////////////////////
|
||||
|
||||
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
|
||||
const float rc = 1.0f / (2.0f * M_PI * cutoff);
|
||||
const float dt = 1.0f / sample_rate;
|
||||
const float alpha = dt / (rc + dt);
|
||||
|
||||
float y = data[0];
|
||||
|
||||
for (size_t i = 1; i < data.size(); i++) {
|
||||
y = alpha * (y + data[i] - data[i - 1]);
|
||||
data[i] = y;
|
||||
}
|
||||
}
|
||||
|
||||
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
|
||||
const int n_samples = pcmf32.size();
|
||||
const int n_samples_last = (sample_rate * last_ms) / 1000;
|
||||
|
||||
if (n_samples_last >= n_samples) {
|
||||
// not enough samples - assume no speech
|
||||
return false;
|
||||
}
|
||||
|
||||
if (freq_thold > 0.0f) {
|
||||
high_pass_filter(pcmf32, freq_thold, sample_rate);
|
||||
}
|
||||
|
||||
float energy_all = 0.0f;
|
||||
float energy_last = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < n_samples; i++) {
|
||||
energy_all += fabsf(pcmf32[i]);
|
||||
|
||||
if (i >= n_samples - n_samples_last) {
|
||||
energy_last += fabsf(pcmf32[i]);
|
||||
}
|
||||
}
|
||||
|
||||
energy_all /= n_samples;
|
||||
energy_last /= n_samples_last;
|
||||
|
||||
if (verbose) {
|
||||
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
|
||||
}
|
||||
|
||||
if (energy_last > vad_thold*energy_all) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
whisper_params params;
|
||||
|
||||
@ -420,46 +199,29 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
params.keep_ms = std::min(params.keep_ms, params.step_ms); // cannot be more than step_ms
|
||||
|
||||
const int n_samples_step = (params.step_ms *1e-3)*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_len = (params.length_ms*1e-3)*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_keep = (params.keep_ms *1e-3)*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_30s = (30000 *1e-3)*WHISPER_SAMPLE_RATE;
|
||||
|
||||
const int n_new_line = params.length_ms / params.step_ms - 1; // number of steps to print new line
|
||||
|
||||
const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
|
||||
|
||||
params.no_timestamps = !use_vad;
|
||||
params.no_context = use_vad;
|
||||
params.max_tokens = 0;
|
||||
if (params.seed < 0) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
// init audio
|
||||
|
||||
audio_async audio(params.length_ms);
|
||||
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
|
||||
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
|
||||
if (!audio_sdl_init(params.capture_id)) {
|
||||
fprintf(stderr, "%s: audio_sdl_init() failed!\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
audio.resume();
|
||||
|
||||
// whisper init
|
||||
|
||||
if (whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
||||
|
||||
std::vector<float> pcmf32 (n_samples_30s, 0.0f);
|
||||
std::vector<float> pcmf32_old(n_samples_30s, 0.0f);
|
||||
std::vector<float> pcmf32_new(n_samples_30s, 0.0f);
|
||||
const int n_samples = (params.step_ms/1000.0)*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_len = (params.length_ms/1000.0)*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_30s = 30*WHISPER_SAMPLE_RATE;
|
||||
|
||||
std::vector<whisper_token> prompt_tokens;
|
||||
std::vector<float> pcmf32(n_samples_30s, 0.0f);
|
||||
std::vector<float> pcmf32_old;
|
||||
|
||||
const int n_new_line = params.length_ms / params.step_ms - 1;
|
||||
|
||||
// print some info about the processing
|
||||
{
|
||||
@ -471,28 +233,23 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "%s: processing %d samples (step = %.1f sec / len = %.1f sec / keep = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
|
||||
fprintf(stderr, "%s: processing %d samples (step = %.1f sec / len = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
|
||||
__func__,
|
||||
n_samples_step,
|
||||
float(n_samples_step)/WHISPER_SAMPLE_RATE,
|
||||
float(n_samples_len )/WHISPER_SAMPLE_RATE,
|
||||
float(n_samples_keep)/WHISPER_SAMPLE_RATE,
|
||||
n_samples,
|
||||
float(n_samples)/WHISPER_SAMPLE_RATE,
|
||||
float(n_samples_len)/WHISPER_SAMPLE_RATE,
|
||||
params.n_threads,
|
||||
params.language.c_str(),
|
||||
params.translate ? "translate" : "transcribe",
|
||||
params.no_timestamps ? 0 : 1);
|
||||
|
||||
if (!use_vad) {
|
||||
fprintf(stderr, "%s: n_new_line = %d\n", __func__, n_new_line);
|
||||
} else {
|
||||
fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: n_new_line = %d\n", __func__, n_new_line);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
int n_iter = 0;
|
||||
SDL_PauseAudioDevice(g_dev_id_in, 0);
|
||||
|
||||
int n_iter = 0;
|
||||
bool is_running = true;
|
||||
|
||||
std::ofstream fout;
|
||||
@ -507,27 +264,18 @@ int main(int argc, char ** argv) {
|
||||
printf("[Start speaking]");
|
||||
fflush(stdout);
|
||||
|
||||
auto t_last = std::chrono::high_resolution_clock::now();
|
||||
const auto t_start = t_last;
|
||||
|
||||
// main audio loop
|
||||
while (is_running) {
|
||||
// handle Ctrl + C
|
||||
{
|
||||
SDL_Event event;
|
||||
while (SDL_PollEvent(&event)) {
|
||||
switch (event.type) {
|
||||
case SDL_QUIT:
|
||||
{
|
||||
is_running = false;
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_running) {
|
||||
break;
|
||||
// process SDL events:
|
||||
SDL_Event event;
|
||||
while (SDL_PollEvent(&event)) {
|
||||
switch (event.type) {
|
||||
case SDL_QUIT:
|
||||
{
|
||||
is_running = false;
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -536,84 +284,47 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// process new audio
|
||||
|
||||
if (!use_vad) {
|
||||
while (true) {
|
||||
audio.get(params.step_ms, pcmf32_new);
|
||||
|
||||
if ((int) pcmf32_new.size() > 2*n_samples_step) {
|
||||
fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
|
||||
audio.clear();
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((int) pcmf32_new.size() >= n_samples_step) {
|
||||
audio.clear();
|
||||
break;
|
||||
}
|
||||
|
||||
SDL_Delay(1);
|
||||
}
|
||||
|
||||
const int n_samples_new = pcmf32_new.size();
|
||||
|
||||
// take up to params.length_ms audio from previous iteration
|
||||
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
|
||||
|
||||
//printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
|
||||
|
||||
pcmf32.resize(n_samples_new + n_samples_take);
|
||||
|
||||
for (int i = 0; i < n_samples_take; i++) {
|
||||
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
|
||||
}
|
||||
|
||||
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
|
||||
|
||||
pcmf32_old = pcmf32;
|
||||
} else {
|
||||
const auto t_now = std::chrono::high_resolution_clock::now();
|
||||
const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
|
||||
|
||||
if (t_diff < 2000) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
audio.get(2000, pcmf32_new);
|
||||
|
||||
if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
|
||||
audio.get(params.length_ms, pcmf32);
|
||||
} else {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
t_last = t_now;
|
||||
if (n_iter > 0 && SDL_GetQueuedAudioSize(g_dev_id_in) > 2*n_samples*sizeof(float)) {
|
||||
fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
|
||||
SDL_ClearQueuedAudio(g_dev_id_in);
|
||||
}
|
||||
|
||||
while (SDL_GetQueuedAudioSize(g_dev_id_in) < n_samples*sizeof(float)) {
|
||||
SDL_Delay(1);
|
||||
}
|
||||
|
||||
const int n_samples_new = SDL_GetQueuedAudioSize(g_dev_id_in)/sizeof(float);
|
||||
|
||||
// take one second from previous iteration
|
||||
//const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_30s/30 - n_samples_new));
|
||||
|
||||
// take up to params.length_ms audio from previous iteration
|
||||
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_len - n_samples_new));
|
||||
|
||||
//printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
|
||||
|
||||
pcmf32.resize(n_samples_new + n_samples_take);
|
||||
|
||||
for (int i = 0; i < n_samples_take; i++) {
|
||||
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
|
||||
}
|
||||
|
||||
SDL_DequeueAudio(g_dev_id_in, pcmf32.data() + n_samples_take, n_samples_new*sizeof(float));
|
||||
|
||||
pcmf32_old = pcmf32;
|
||||
|
||||
// run the inference
|
||||
{
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.print_progress = false;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.translate = params.translate;
|
||||
wparams.no_context = true;
|
||||
wparams.single_segment = !use_vad;
|
||||
wparams.max_tokens = params.max_tokens;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
wparams.speed_up = params.speed_up;
|
||||
|
||||
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
|
||||
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
|
||||
wparams.print_progress = false;
|
||||
wparams.print_special_tokens = params.print_special_tokens;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.translate = params.translate;
|
||||
wparams.no_context = params.no_context;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
|
||||
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
||||
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
||||
@ -622,21 +333,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// print result;
|
||||
{
|
||||
if (!use_vad) {
|
||||
printf("\33[2K\r");
|
||||
printf("\33[2K\r");
|
||||
|
||||
// print long empty line to clear the previous line
|
||||
printf("%s", std::string(100, ' ').c_str());
|
||||
// print long empty line to clear the previous line
|
||||
printf("%s", std::string(100, ' ').c_str());
|
||||
|
||||
printf("\33[2K\r");
|
||||
} else {
|
||||
const int64_t t1 = (t_last - t_start).count()/1000000;
|
||||
const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
|
||||
|
||||
printf("\n");
|
||||
printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
|
||||
printf("\n");
|
||||
}
|
||||
printf("\33[2K\r");
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
@ -664,39 +366,18 @@ int main(int argc, char ** argv) {
|
||||
if (params.fname_out.length() > 0) {
|
||||
fout << std::endl;
|
||||
}
|
||||
|
||||
if (use_vad){
|
||||
printf("\n");
|
||||
printf("### Transcription %d END\n", n_iter);
|
||||
}
|
||||
}
|
||||
|
||||
++n_iter;
|
||||
|
||||
if (!use_vad && (n_iter % n_new_line) == 0) {
|
||||
if ((n_iter % n_new_line) == 0) {
|
||||
printf("\n");
|
||||
|
||||
// keep part of the audio for next iteration to try to mitigate word boundary issues
|
||||
pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
|
||||
|
||||
// Add tokens of the last full length segment as the prompt
|
||||
if (!params.no_context) {
|
||||
prompt_tokens.clear();
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const int token_count = whisper_full_n_tokens(ctx, i);
|
||||
for (int j = 0; j < token_count; ++j) {
|
||||
prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
pcmf32_old.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
audio.pause();
|
||||
|
||||
whisper_print_timings(ctx);
|
||||
whisper_free(ctx);
|
||||
|
||||
|
@ -1,48 +0,0 @@
#
# libtalk
#

set(TARGET libtalk)

add_executable(${TARGET}
    emscripten.cpp
    gpt-2.cpp
    )

target_link_libraries(${TARGET} PRIVATE
    whisper
    )

unset(EXTRA_FLAGS)

if (WHISPER_WASM_SINGLE_FILE)
    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
    message(STATUS "Embedding WASM inside talk.js")

    add_custom_command(
        TARGET ${TARGET} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy
        ${CMAKE_BINARY_DIR}/bin/libtalk.js
        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/talk.wasm/talk.js
        )
endif()

set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
    -s PTHREAD_POOL_SIZE=8 \
    -s INITIAL_MEMORY=1600MB \
    -s TOTAL_MEMORY=1600MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
    ")

#
# talk.wasm
#

set(TARGET talk.wasm)

configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js   ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js  @ONLY)

@ -1,74 +0,0 @@
# talk.wasm

Talk with an Artificial Intelligence in your browser:

[https://user-images.githubusercontent.com/1991296/203411580-fedb4839-05e4-4474-8364-aaf1e9a9b615.mp4](https://user-images.githubusercontent.com/1991296/203845553-f7b44e13-9a15-4fc8-b518-ae8f4c6770fe.mp4)

Online demo: https://whisper.ggerganov.com/talk/

Terminal version: [examples/talk](/examples/talk)

## How it works

This demo leverages two modern neural network models to create a high-quality voice chat directly in your browser:

- [OpenAI's Whisper](https://github.com/openai/whisper) speech recognition model is used to process your voice and understand what you are saying
- Upon receiving some voice input, the AI generates a text response using [OpenAI's GPT-2](https://github.com/openai/gpt-2) language model
- The AI then vocalizes the response using the browser's [Web Speech API](https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API) (see the sketch below)
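
For illustration, the vocalization step can be as simple as the sketch below - the function name is hypothetical,
and the demo wires this up through its own JS glue code:

```javascript
// Speak a generated text response out loud using the browser's built-in speech synthesis.
function speakResponse(text) {
    const utterance = new SpeechSynthesisUtterance(text);
    utterance.lang = 'en-US'; // the demo currently works with English only
    window.speechSynthesis.speak(utterance);
}
```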

The web page does all of the processing locally on your machine. Running these heavy neural network models in the
browser is made possible by implementing them efficiently in C/C++ and using the browser's WebAssembly SIMD
capabilities for extra performance:

- The Whisper C++ implementation is here: [whisper.h](/whisper.h) / [whisper.cpp](/whisper.cpp)
- The GPT-2 C++ implementation is here: [gpt-2.h](gpt-2.h) / [gpt-2.cpp](gpt-2.cpp)
- Both models use a custom tensor library implemented in C: [ggml.h](/ggml.h) / [ggml.c](/ggml.c)
- The HTML/JS layer is here: [index-tmpl.html](index-tmpl.html)
- The Emscripten bridge between C/C++ and JS is here: [emscripten.cpp](emscripten.cpp)

In order to run the models, the web page first needs to download the model data, which is about 350 MB. The model data
is then stored in the browser's cache and can be reused on future visits without downloading it again.
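
As an illustration of this caching pattern, here is a sketch using the standard Cache API. The cache name and the
function are placeholders - the demo's actual helper code (`helpers.js`) may handle this differently:

```javascript
// Download the model data once and serve it from the browser cache on subsequent visits.
async function fetchModelCached(url) {
    const cache = await caches.open('talk-wasm-models');

    let response = await cache.match(url);
    if (!response) {
        response = await fetch(url);
        await cache.put(url, response.clone());
    }

    return new Uint8Array(await response.arrayBuffer());
}
```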

## Requirements

In order to run this demo efficiently, you need to have the following:

- Latest Chrome or Firefox browser (Safari is not supported)
- Run this on a desktop or laptop with a modern CPU (a mobile phone will likely not be good enough)
- Speak phrases that are no longer than 10 seconds - this is the audio context of the AI
- The web page uses about 1.6 GB of RAM

Note that this demo uses the smallest GPT-2 model, so the generated text responses are not always very good.
Also, the prompting strategy can likely be improved to achieve better results.

The demo is quite computationally heavy, so you need a fast CPU. It is unusual to run these transformer models in a
browser - typically, they run on powerful GPUs.

Currently, mobile browsers do not support the fixed-width SIMD WebAssembly capability, so you cannot run this demo
on a phone or a tablet. Hopefully, this will be supported in the near future.

## Todo

- Better UI (contributions are welcome)
- Better GPT-2 prompting

## Build instructions

```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j

# copy the produced page to your HTTP path
cp bin/talk.wasm/* /path/to/html/
cp bin/libtalk.worker.js /path/to/html/
```

## Feedback

If you have any comments or ideas for improvement, please drop a comment in the following discussion:

https://github.com/ggerganov/whisper.cpp/discussions/167

@ -1,380 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#include "gpt-2.h"
|
||||
#include "whisper.h"
|
||||
|
||||
#include <emscripten.h>
|
||||
#include <emscripten/bind.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cmath>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <regex>
|
||||
|
||||
constexpr int N_THREAD = 8;
|
||||
|
||||
struct gpt2_context * g_gpt2;
|
||||
std::vector<struct whisper_context *> g_contexts(4, nullptr);
|
||||
|
||||
std::mutex g_mutex;
|
||||
std::thread g_worker;
|
||||
std::atomic<bool> g_running(false);
|
||||
|
||||
bool g_force_speak = false;
|
||||
std::string g_text_to_speak = "";
|
||||
std::string g_status = "";
|
||||
std::string g_status_forced = "";
|
||||
|
||||
std::vector<float> g_pcmf32;
|
||||
|
||||
std::string to_timestamp(int64_t t) {
|
||||
int64_t sec = t/100;
|
||||
int64_t msec = t - sec*100;
|
||||
int64_t min = sec/60;
|
||||
sec = sec - min*60;
|
||||
|
||||
char buf[32];
|
||||
snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
|
||||
|
||||
return std::string(buf);
|
||||
}
|
||||
|
||||
void talk_set_status(const std::string & status) {
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
g_status = status;
|
||||
}
|
||||
|
||||
void talk_main(size_t index) {
|
||||
talk_set_status("loading data ...");
|
||||
|
||||
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
|
||||
wparams.offset_ms = 0;
|
||||
wparams.translate = false;
|
||||
wparams.no_context = true;
|
||||
wparams.single_segment = true;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_progress = false;
|
||||
wparams.print_timestamps = true;
|
||||
wparams.print_special = false;
|
||||
|
||||
wparams.max_tokens = 32;
|
||||
wparams.audio_ctx = 768; // partial encoder context for better performance
|
||||
|
||||
wparams.language = "en";
|
||||
|
||||
g_gpt2 = gpt2_init("gpt-2.bin");
|
||||
|
||||
printf("talk: using %d threads\n", wparams.n_threads);
|
||||
|
||||
std::vector<float> pcmf32;
|
||||
|
||||
// whisper context
|
||||
auto & ctx = g_contexts[index];
|
||||
|
||||
const int64_t step_samples = 2*WHISPER_SAMPLE_RATE;
|
||||
const int64_t window_samples = 9*WHISPER_SAMPLE_RATE;
|
||||
const int64_t step_ms = (step_samples*1000)/WHISPER_SAMPLE_RATE;
|
||||
|
||||
auto t_last = std::chrono::high_resolution_clock::now();
|
||||
|
||||
talk_set_status("listening ...");
|
||||
|
||||
while (g_running) {
|
||||
|
||||
const auto t_now = std::chrono::high_resolution_clock::now();
|
||||
if (std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count() < step_ms) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
g_pcmf32.clear();
|
||||
}
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
||||
continue;
|
||||
}
|
||||
|
||||
talk_set_status("listening ...");
|
||||
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(g_mutex);
|
||||
|
||||
if (g_pcmf32.size() < step_samples) {
|
||||
lock.unlock();
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
|
||||
}
|
||||
|
||||
// VAD: if energy in during last second is above threshold, then skip
|
||||
{
|
||||
float energy_all = 0.0f;
|
||||
float energy_1s = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < pcmf32.size(); i++) {
|
||||
energy_all += fabsf(pcmf32[i]);
|
||||
|
||||
if (i >= pcmf32.size() - WHISPER_SAMPLE_RATE) {
|
||||
energy_1s += fabsf(pcmf32[i]);
|
||||
}
|
||||
}
|
||||
|
||||
energy_all /= pcmf32.size();
|
||||
energy_1s /= WHISPER_SAMPLE_RATE;
|
||||
|
||||
if (energy_1s > 0.1f*energy_all && !g_force_speak) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
talk_set_status("processing audio (whisper)...");
|
||||
|
||||
t_last = t_now;
|
||||
|
||||
if (!g_force_speak) {
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
|
||||
if (ret != 0) {
|
||||
printf("whisper_full() failed: %d\n", ret);
|
||||
break;
|
||||
}
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
printf("whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
|
||||
}
|
||||
|
||||
{
|
||||
std::string text_heard;
|
||||
|
||||
if (!g_force_speak) {
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = n_segments - 1; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
|
||||
|
||||
text_heard += text;
|
||||
}
|
||||
}
|
||||
|
||||
g_force_speak = false;
|
||||
|
||||
// remove text between brackets using regex
|
||||
{
|
||||
std::regex re("\\[.*?\\]");
|
||||
text_heard = std::regex_replace(text_heard, re, "");
|
||||
}
|
||||
|
||||
// remove text between brackets using regex
|
||||
{
|
||||
std::regex re("\\(.*?\\)");
|
||||
text_heard = std::regex_replace(text_heard, re, "");
|
||||
}
|
||||
|
||||
// remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
|
||||
text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
|
||||
|
||||
// take first line
|
||||
text_heard = text_heard.substr(0, text_heard.find_first_of("\n"));
|
||||
|
||||
// remove leading and trailing whitespace
|
||||
text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
|
||||
text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
|
||||
|
||||
talk_set_status("'" + text_heard + "' - thinking how to respond (gpt-2) ...");
|
||||
|
||||
const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(g_gpt2, text_heard.c_str());
|
||||
|
||||
printf("whisper: number of tokens: %d, '%s'\n", (int) tokens.size(), text_heard.c_str());
|
||||
|
||||
std::string text_to_speak;
|
||||
std::string prompt_base;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
prompt_base = gpt2_get_prompt(g_gpt2);
|
||||
}
|
||||
|
||||
if (tokens.size() > 0) {
|
||||
text_to_speak = gpt2_gen_text(g_gpt2, (prompt_base + text_heard + "\n").c_str(), 32);
|
||||
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
|
||||
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
|
||||
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
|
||||
// remove first 2 lines of base prompt
|
||||
{
|
||||
const size_t pos = prompt_base.find_first_of("\n");
|
||||
if (pos != std::string::npos) {
|
||||
prompt_base = prompt_base.substr(pos + 1);
|
||||
}
|
||||
}
|
||||
{
|
||||
const size_t pos = prompt_base.find_first_of("\n");
|
||||
if (pos != std::string::npos) {
|
||||
prompt_base = prompt_base.substr(pos + 1);
|
||||
}
|
||||
}
|
||||
prompt_base += text_heard + "\n" + text_to_speak + "\n";
|
||||
} else {
|
||||
text_to_speak = gpt2_gen_text(g_gpt2, prompt_base.c_str(), 32);
|
||||
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
|
||||
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
|
||||
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
|
||||
const size_t pos = prompt_base.find_first_of("\n");
|
||||
if (pos != std::string::npos) {
|
||||
prompt_base = prompt_base.substr(pos + 1);
|
||||
}
|
||||
prompt_base += text_to_speak + "\n";
|
||||
}
|
||||
|
||||
printf("gpt-2: %s\n", text_to_speak.c_str());
|
||||
|
||||
//printf("========================\n");
|
||||
//printf("gpt-2: prompt_base:\n'%s'\n", prompt_base.c_str());
|
||||
//printf("========================\n");
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
t_last = std::chrono::high_resolution_clock::now();
|
||||
g_text_to_speak = text_to_speak;
|
||||
g_pcmf32.clear();
|
||||
gpt2_set_prompt(g_gpt2, prompt_base.c_str());
|
||||
}
|
||||
|
||||
talk_set_status("speaking ...");
|
||||
}
|
||||
}
|
||||
|
||||
gpt2_free(g_gpt2);
|
||||
|
||||
if (index < g_contexts.size()) {
|
||||
whisper_free(g_contexts[index]);
|
||||
g_contexts[index] = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
EMSCRIPTEN_BINDINGS(talk) {
|
||||
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
||||
for (size_t i = 0; i < g_contexts.size(); ++i) {
|
||||
if (g_contexts[i] == nullptr) {
|
||||
g_contexts[i] = whisper_init(path_model.c_str());
|
||||
if (g_contexts[i] != nullptr) {
|
||||
g_running = true;
|
||||
if (g_worker.joinable()) {
|
||||
g_worker.join();
|
||||
}
|
||||
g_worker = std::thread([i]() {
|
||||
talk_main(i);
|
||||
});
|
||||
|
||||
return i + 1;
|
||||
} else {
|
||||
return (size_t) 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (size_t) 0;
|
||||
}));
|
||||
|
||||
emscripten::function("free", emscripten::optional_override([](size_t index) {
|
||||
if (g_running) {
|
||||
g_running = false;
|
||||
}
|
||||
}));
|
||||
|
||||
emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
|
||||
--index;
|
||||
|
||||
if (index >= g_contexts.size()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (g_contexts[index] == nullptr) {
|
||||
return -2;
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
const int n = audio["length"].as<int>();
|
||||
|
||||
emscripten::val heap = emscripten::val::module_property("HEAPU8");
|
||||
emscripten::val memory = heap["buffer"];
|
||||
|
||||
g_pcmf32.resize(n);
|
||||
|
||||
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
|
||||
memoryView.call<void>("set", audio);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}));
|
||||
|
||||
emscripten::function("force_speak", emscripten::optional_override([](size_t index) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
g_force_speak = true;
|
||||
}
|
||||
}));
|
||||
|
||||
emscripten::function("get_text_context", emscripten::optional_override([]() {
|
||||
std::string text_context;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
text_context = gpt2_get_prompt(g_gpt2);
|
||||
}
|
||||
|
||||
return text_context;
|
||||
}));
|
||||
|
||||
emscripten::function("get_text_to_speak", emscripten::optional_override([]() {
|
||||
std::string text_to_speak;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
text_to_speak = std::move(g_text_to_speak);
|
||||
}
|
||||
|
||||
return text_to_speak;
|
||||
}));
|
||||
|
||||
emscripten::function("get_status", emscripten::optional_override([]() {
|
||||
std::string status;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
status = g_status_forced.empty() ? g_status : g_status_forced;
|
||||
}
|
||||
|
||||
return status;
|
||||
}));
|
||||
|
||||
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
g_status_forced = status;
|
||||
}
|
||||
}));
|
||||
|
||||
emscripten::function("set_prompt", emscripten::optional_override([](const std::string & prompt) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
gpt2_set_prompt(g_gpt2, prompt.c_str());
|
||||
}
|
||||
}));
|
||||
}
|
@ -1,925 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#include "gpt-2.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <regex>
|
||||
#include <random>
|
||||
|
||||
/////////////////////// GPT-2 BEGIN /////////////////////////
|
||||
|
||||
//
|
||||
// Vocab utils
|
||||
//
|
||||
|
||||
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
|
||||
std::vector<std::string> words;
|
||||
|
||||
// first split the text into words
|
||||
{
|
||||
std::string str = text;
|
||||
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
||||
|
||||
std::regex re(pat);
|
||||
std::smatch m;
|
||||
|
||||
while (std::regex_search(str, m, re)) {
|
||||
for (auto x : m) {
|
||||
words.push_back(x);
|
||||
}
|
||||
str = m.suffix();
|
||||
}
|
||||
}
|
||||
|
||||
// find the longest tokens that form the words:
|
||||
std::vector<gpt_vocab::id> tokens;
|
||||
for (const auto & word : words) {
|
||||
if (word.size() == 0) continue;
|
||||
|
||||
int i = 0;
|
||||
int n = word.size();
|
||||
while (i < n) {
|
||||
int j = n;
|
||||
while (j > i) {
|
||||
auto it = vocab.token_to_id.find(word.substr(i, j-i));
|
||||
if (it != vocab.token_to_id.end()) {
|
||||
tokens.push_back(it->second);
|
||||
i = j;
|
||||
break;
|
||||
}
|
||||
--j;
|
||||
}
|
||||
if (i == n) {
|
||||
break;
|
||||
}
|
||||
if (j == i) {
|
||||
auto sub = word.substr(i, 1);
|
||||
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
|
||||
tokens.push_back(vocab.token_to_id.at(sub));
|
||||
} else {
|
||||
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
|
||||
}
|
||||
++i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
gpt_vocab::id gpt_sample_top_k_top_p(
|
||||
const gpt_vocab & vocab,
|
||||
const float * logits,
|
||||
int top_k,
|
||||
double top_p,
|
||||
double temp,
|
||||
std::mt19937 & rng) {
|
||||
int n_logits = vocab.id_to_token.size();
|
||||
|
||||
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
|
||||
logits_id.reserve(n_logits);
|
||||
|
||||
for (int i = 0; i < n_logits; i++) {
|
||||
logits_id.push_back(std::make_pair(logits[i], i));
|
||||
}
|
||||
|
||||
// find the top K tokens
|
||||
std::partial_sort(
|
||||
logits_id.begin(),
|
||||
logits_id.begin() + top_k, logits_id.end(),
|
||||
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
|
||||
return a.first > b.first;
|
||||
});
|
||||
|
||||
logits_id.resize(top_k);
|
||||
|
||||
// normalize
|
||||
{
|
||||
double sum = 0.0f;
|
||||
for (int i = 0; i < (int)logits_id.size(); i++) {
|
||||
sum += logits_id[i].first;
|
||||
}
|
||||
|
||||
sum = 1.0/sum;
|
||||
for (int i = 0; i < (int)logits_id.size(); i++) {
|
||||
logits_id[i].first *= sum;
|
||||
}
|
||||
}
|
||||
|
||||
if (top_p < 1.0f) {
|
||||
{
|
||||
double cumsum = 0.0f;
|
||||
for (int i = 0; i < top_k; i++) {
|
||||
cumsum += logits_id[i].first;
|
||||
if (cumsum >= top_p) {
|
||||
logits_id.resize(i+1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// normalize again
|
||||
{
|
||||
double sum = 0.0f;
|
||||
for (int i = 0; i < (int)logits_id.size(); i++) {
|
||||
sum += logits_id[i].first;
|
||||
}
|
||||
|
||||
sum = 1.0/sum;
|
||||
for (int i = 0; i < (int)logits_id.size(); i++) {
|
||||
logits_id[i].first *= sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//printf("\n");
|
||||
//for (int i = 0; i < (int)logits_id.size(); i++) {
|
||||
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
|
||||
//}
|
||||
//exit(0);
|
||||
|
||||
// sample from the obtained distribution
|
||||
std::vector<double> probs;
|
||||
probs.reserve(logits_id.size());
|
||||
|
||||
for (int i = 0; i < (int) logits_id.size(); i++) {
|
||||
probs.push_back(logits_id[i].first);
|
||||
}
|
||||
|
||||
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
||||
int idx = dist(rng);
|
||||
|
||||
return logits_id[idx].second;
|
||||
}
|
||||
|
||||
// default hparams (GPT-2 117M)
|
||||
struct gpt2_hparams {
|
||||
int32_t n_vocab = 50257;
|
||||
int32_t n_ctx = 1024;
|
||||
int32_t n_embd = 768;
|
||||
int32_t n_head = 12;
|
||||
int32_t n_layer = 12;
|
||||
int32_t f16 = 1;
|
||||
};
|
||||
|
||||
struct gpt2_layer {
|
||||
// normalization
|
||||
struct ggml_tensor * ln_1_g;
|
||||
struct ggml_tensor * ln_1_b;
|
||||
|
||||
struct ggml_tensor * ln_2_g;
|
||||
struct ggml_tensor * ln_2_b;
|
||||
|
||||
// attention
|
||||
struct ggml_tensor * c_attn_attn_w;
|
||||
struct ggml_tensor * c_attn_attn_b;
|
||||
|
||||
struct ggml_tensor * c_attn_proj_w;
|
||||
struct ggml_tensor * c_attn_proj_b;
|
||||
|
||||
// mlp
|
||||
struct ggml_tensor * c_mlp_fc_w;
|
||||
struct ggml_tensor * c_mlp_fc_b;
|
||||
|
||||
struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
|
||||
struct ggml_tensor * c_mlp_proj_b;
|
||||
};
|
||||
|
||||
struct gpt2_model {
|
||||
gpt2_hparams hparams;
|
||||
|
||||
// normalization
|
||||
struct ggml_tensor * ln_f_g;
|
||||
struct ggml_tensor * ln_f_b;
|
||||
|
||||
struct ggml_tensor * wte; // token embedding
|
||||
struct ggml_tensor * wpe; // position embedding
|
||||
|
||||
std::vector<gpt2_layer> layers;
|
||||
|
||||
// key + value memory
|
||||
struct ggml_tensor * memory_k;
|
||||
struct ggml_tensor * memory_v;
|
||||
|
||||
//
|
||||
struct ggml_context * ctx;
|
||||
std::map<std::string, struct ggml_tensor *> tensors;
|
||||
};
|
||||
|
||||
// load the model's weights from a file
|
||||
bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
|
||||
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// verify magic
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
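// 0x67676d6c spells "ggml" in ASCII - all ggml model files start with this magic value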
|
||||
if (magic != 0x67676d6c) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// load hparams
|
||||
{
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
||||
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
||||
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
|
||||
|
||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||
printf("%s: f16 = %d\n", __func__, hparams.f16);
|
||||
}
|
||||
|
||||
// load vocab
|
||||
{
|
||||
int32_t n_vocab = 0;
|
||||
fin.read((char *) &n_vocab, sizeof(n_vocab));
|
||||
|
||||
if (n_vocab != model.hparams.n_vocab) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
||||
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string word;
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
uint32_t len;
|
||||
fin.read((char *) &len, sizeof(len));
|
||||
|
||||
word.resize(len);
|
||||
fin.read((char *) word.data(), len);
|
||||
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.id_to_token[i] = word;
|
||||
}
|
||||
}
|
||||
|
||||
// for the big tensors, we have the option to store the data in 16-bit floats
|
||||
// in order to save memory and also to speed up the computation
|
||||
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||
|
||||
auto & ctx = model.ctx;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
|
||||
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
|
||||
|
||||
ctx_size += n_vocab*n_embd*ggml_type_size(wtype); // wte
|
||||
ctx_size += n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
|
||||
|
||||
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype)); // c_attn_attn_w
|
||||
ctx_size += n_layer*( 3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype)); // c_attn_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_fc_w
|
||||
ctx_size += n_layer*( 4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
|
||||
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
|
||||
|
||||
ctx_size += (6 + 12*n_layer)*256; // object overhead
|
||||
|
||||
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||
}
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = ctx_size;
|
||||
params.mem_buffer = NULL;
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
if (!model.ctx) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// prepare memory for the weights
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
|
||||
|
||||
// map by name
|
||||
model.tensors["model/ln_f/g"] = model.ln_f_g;
|
||||
model.tensors["model/ln_f/b"] = model.ln_f_b;
|
||||
|
||||
model.tensors["model/wte"] = model.wte;
|
||||
model.tensors["model/wpe"] = model.wpe;
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd);
|
||||
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
|
||||
|
||||
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
||||
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
||||
|
||||
layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
||||
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
// map by name
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans;
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
|
||||
}
|
||||
}
|
||||
|
||||
// key + value memory
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
|
||||
const int n_mem = n_layer*n_ctx;
|
||||
const int n_elements = n_embd*n_mem;
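// the KV cache holds one key and one value vector of size n_embd for every layer and
// every context position, i.e. n_mem = n_layer*n_ctx vectors per tensor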
|
||||
|
||||
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
|
||||
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
|
||||
|
||||
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
||||
|
||||
printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
||||
}
|
||||
|
||||
// load weights
|
||||
{
|
||||
size_t total_size = 0;
|
||||
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ftype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
||||
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t nelements = 1;
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
nelements *= ne[i];
|
||||
}
|
||||
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
|
||||
auto tensor = model.tensors[name.data()];
|
||||
if (ggml_nelements(tensor) != nelements) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
|
||||
|
||||
if (nelements*bpe != ggml_nbytes(tensor)) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||
return false;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||
|
||||
//printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
total_size += ggml_nbytes(tensor);
|
||||
}
|
||||
|
||||
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
|
||||
}
|
||||
|
||||
fin.close();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// evaluate the transformer
|
||||
//
|
||||
// - model: the model
|
||||
// - n_threads: number of threads to use
|
||||
// - n_past: the context size so far
|
||||
//   - embd_inp:  the ids of the tokens in the context (their embeddings are looked up internally)
|
||||
//   - embd_w:    the predicted probability distribution over the vocabulary for the next token
|
||||
//
|
||||
bool gpt2_eval(
|
||||
const gpt2_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
std::vector<float> & embd_w,
|
||||
size_t & mem_per_token) {
|
||||
const int N = embd_inp.size();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_head = hparams.n_head;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
static size_t buf_size = 640u*1024*1024;
|
||||
static void * buf = malloc(buf_size);
|
||||
|
||||
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
||||
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||
printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||
|
||||
// reallocate
|
||||
buf_size = buf_size_new;
|
||||
buf = realloc(buf, buf_size);
|
||||
if (buf == nullptr) {
|
||||
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = buf_size;
|
||||
params.mem_buffer = buf;
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
struct ggml_cgraph gf = { };
|
||||
gf.n_threads = n_threads;
|
||||
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
||||
|
||||
struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
for (int i = 0; i < N; ++i) {
|
||||
((int32_t *) position->data)[i] = n_past + i;
|
||||
}
|
||||
|
||||
// wte + wpe
|
||||
struct ggml_tensor * inpL =
|
||||
ggml_add(ctx0,
|
||||
ggml_get_rows(ctx0, model.wte, embd),
|
||||
ggml_get_rows(ctx0, model.wpe, position));
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
// norm
|
||||
{
|
||||
// [ 768, N]
|
||||
cur = ggml_norm(ctx0, inpL);
|
||||
|
||||
// cur = ln_1_g*cur + ln_1_b
|
||||
// [ 768, N]
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
||||
}
|
||||
|
||||
// attn
|
||||
// [2304, 768] - model.layers[il].c_attn_attn_w
|
||||
// [2304, 1] - model.layers[il].c_attn_attn_b
|
||||
// [ 768, N] - cur (in)
|
||||
// [2304, N] - cur (out)
|
||||
//
|
||||
// cur = attn_w*cur + attn_b
|
||||
// [2304, N]
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// self-attention
|
||||
{
|
||||
struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
|
||||
struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
|
||||
struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
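// Qcur/Kcur/Vcur are views into the fused QKV output of the c_attn projection above:
// Q, K and V each occupy n_embd consecutive values of the 3*n_embd rows, hence the
// 0/1/2 * n_embd float offsets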
|
||||
|
||||
// store key and value to memory
|
||||
if (N >= 1) {
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
|
||||
struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
|
||||
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
|
||||
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
||||
// [64, N, 12]
|
||||
struct ggml_tensor * Q =
|
||||
ggml_permute(ctx0,
|
||||
ggml_cpy(ctx0,
|
||||
Qcur,
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
||||
// [64, n_past + N, 12]
|
||||
struct ggml_tensor * K =
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// GG: flash attention
|
||||
//struct ggml_tensor * V =
|
||||
// ggml_cpy(ctx0,
|
||||
// ggml_permute(ctx0,
|
||||
// ggml_reshape_3d(ctx0,
|
||||
// ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
||||
// n_embd/n_head, n_head, n_past + N),
|
||||
// 1, 2, 0, 3),
|
||||
// ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
|
||||
|
||||
//struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
|
||||
|
||||
// K * Q
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
|
||||
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ_scaled =
|
||||
ggml_scale(ctx0,
|
||||
KQ,
|
||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
|
||||
);
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
|
||||
|
||||
// KQ = soft_max(KQ_masked)
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
||||
|
||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||
// [n_past + N, 64, 12]
|
||||
struct ggml_tensor * V_trans =
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
1, 2, 0, 3);
|
||||
|
||||
// KQV = transpose(V) * KQ_soft_max
|
||||
// [64, N, 12]
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
||||
|
||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||
// [64, 12, N]
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
|
||||
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||
// [768, N]
|
||||
cur = ggml_cpy(ctx0,
|
||||
KQV_merged,
|
||||
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||
}
|
||||
|
||||
// projection
|
||||
// [ 768, 768] - model.layers[il].c_attn_proj_w
|
||||
// [ 768, 1] - model.layers[il].c_attn_proj_b
|
||||
// [ 768, N] - cur (in)
|
||||
// [ 768, N] - cur (out)
|
||||
//
|
||||
// cur = proj_w*cur + proj_b
|
||||
// [768, N]
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// add the input
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
struct ggml_tensor * inpFF = cur;
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
// norm
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpFF);
|
||||
|
||||
// cur = ln_2_g*cur + ln_2_b
|
||||
// [ 768, N]
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
|
||||
}
|
||||
|
||||
// fully connected
|
||||
// [3072, 768] - model.layers[il].c_mlp_fc_w
|
||||
// [3072, 1] - model.layers[il].c_mlp_fc_b
|
||||
// [ 768, N] - cur (in)
|
||||
// [3072, N] - cur (out)
|
||||
//
|
||||
// cur = fc_w*cur + fc_b
|
||||
// [3072, N]
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
|
||||
cur);
|
||||
|
||||
// GELU activation
|
||||
// [3072, N]
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
// projection
|
||||
// [ 768, 3072] - model.layers[il].c_mlp_proj_w
|
||||
// [ 768, 1] - model.layers[il].c_mlp_proj_b
|
||||
// [3072, N] - cur (in)
|
||||
// [ 768, N] - cur (out)
|
||||
//
|
||||
// cur = proj_w*cur + proj_b
|
||||
// [768, N]
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_mlp_proj_w_trans,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// input for next layer
|
||||
inpL = ggml_add(ctx0, cur, inpFF);
|
||||
}
|
||||
|
||||
// norm
|
||||
{
|
||||
// [ 768, N]
|
||||
inpL = ggml_norm(ctx0, inpL);
|
||||
|
||||
// inpL = ln_f_g*inpL + ln_f_b
|
||||
// [ 768, N]
|
||||
inpL = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.ln_f_g, inpL),
|
||||
inpL),
|
||||
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
||||
}
|
||||
|
||||
// inpL = WTE * inpL
|
||||
// [ 768, 50257] - model.wte
|
||||
// [ 768, N] - inpL
|
||||
inpL = ggml_mul_mat(ctx0, model.wte, inpL);
|
||||
|
||||
// logits -> probs
|
||||
inpL = ggml_soft_max(ctx0, inpL);
|
||||
|
||||
// run the computation
|
||||
ggml_build_forward_expand(&gf, inpL);
|
||||
ggml_graph_compute (ctx0, &gf);
|
||||
|
||||
//if (n_past%100 == 0) {
|
||||
// ggml_graph_print (&gf);
|
||||
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
|
||||
//}
|
||||
|
||||
//embd_w.resize(n_vocab*N);
|
||||
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||
|
||||
// return result for just the last token
|
||||
embd_w.resize(n_vocab);
|
||||
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||
|
||||
if (mem_per_token == 0) {
|
||||
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||
}
|
||||
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return true;
|
||||
}
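// Usage note (added for clarity, not part of the original file): when called with
// mem_per_token == 0, the function measures ggml_used_mem(ctx0)/N and reports it back through
// the reference, so later calls can detect when the static scratch buffer needs to grow.
// gpt2_gen_text() below instead seeds mem_per_token with a fixed estimate of 3000000 bytes.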
|
||||
|
||||
/////////////////////////////// GPT-2 END ////////////////////////////////
|
||||
|
||||
constexpr int N_THREAD = 8;
|
||||
|
||||
struct gpt2_context {
|
||||
std::string prompt_base = R"(Hello, how are you?
|
||||
I'm fine, thanks. How are you?
|
||||
Thanks, I'm fine too. What are you doing?
|
||||
I'm just sitting here.
|
||||
It's a lovely day, isn't it?
|
||||
Yes, it is. I love the weather this time of year.
|
||||
I wish it would rain a little bit.
|
||||
Me too.
|
||||
)";
|
||||
|
||||
std::mt19937 rng;
|
||||
|
||||
gpt_vocab vocab;
|
||||
gpt2_model model;
|
||||
|
||||
int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
|
||||
|
||||
// sampling parameters
|
||||
int32_t top_k = 40;
|
||||
float top_p = 0.9f;
|
||||
float temp = 1.0f;
|
||||
};
|
||||
|
||||
struct gpt2_context * gpt2_init(const char * path_model) {
|
||||
gpt2_context * ctx = new gpt2_context;
|
||||
|
||||
ctx->rng = std::mt19937(time(NULL));
|
||||
|
||||
// load the model
|
||||
{
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const int64_t t_load_us = ggml_time_us() - t_start_us;
|
||||
|
||||
printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
|
||||
}
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
void gpt2_free(struct gpt2_context * ctx) {
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
const char * gpt2_get_prompt(struct gpt2_context * ctx) {
|
||||
return ctx->prompt_base.c_str();
|
||||
}
|
||||
|
||||
void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt) {
|
||||
ctx->prompt_base = prompt;
|
||||
}
|
||||
|
||||
std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text) {
|
||||
return ::gpt_tokenize(ctx->vocab, text);
|
||||
}
|
||||
|
||||
std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
|
||||
int n_past = 0;
|
||||
|
||||
std::vector<float> embd_w;
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<gpt_vocab::id> embd_inp = ::gpt2_tokenize(ctx, text);
|
||||
|
||||
int n_predict = std::min(max_tokens, ctx->model.hparams.n_ctx - (int) embd_inp.size());
|
||||
|
||||
std::vector<gpt_vocab::id> embd = embd_inp;
|
||||
|
||||
size_t mem_per_token = 3000000;
|
||||
|
||||
std::string result;
|
||||
|
||||
for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
|
||||
// predict
|
||||
if (embd.size() > 0) {
|
||||
if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
|
||||
printf("gpt-2: failed to generate text\n");
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
n_past += embd.size();
|
||||
embd.clear();
|
||||
|
||||
{
|
||||
// sample next token
|
||||
const int top_k = ctx->top_k;
|
||||
const float top_p = ctx->top_p;
|
||||
const float temp = ctx->temp;
|
||||
|
||||
const int n_vocab = ctx->model.hparams.n_vocab;
|
||||
|
||||
const gpt_vocab::id id = gpt_sample_top_k_top_p(ctx->vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, ctx->rng);
|
||||
|
||||
// add it to the context
|
||||
embd.push_back(id);
|
||||
}
|
||||
|
||||
result += ctx->vocab.id_to_token[embd[0]];
|
||||
|
||||
// stop at the end-of-text token (50256) or at sentence-ending punctuation to keep the response short
|
||||
if (embd.back() == 50256 ||
|
||||
ctx->vocab.id_to_token[embd.back()] == "." ||
|
||||
ctx->vocab.id_to_token[embd.back()] == "!" ||
|
||||
ctx->vocab.id_to_token[embd.back()] == "?") {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
@ -1,27 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
// TODO: Change to C-style API and move to ./examples for easy reuse.
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
struct gpt_vocab {
|
||||
using id = int32_t;
|
||||
using token = std::string;
|
||||
|
||||
std::map<token, id> token_to_id;
|
||||
std::map<id, token> id_to_token;
|
||||
};
|
||||
|
||||
struct gpt2_context;
|
||||
|
||||
struct gpt2_context * gpt2_init(const char * path_model);
|
||||
void gpt2_free(struct gpt2_context * ctx);
|
||||
|
||||
const char * gpt2_get_prompt(struct gpt2_context * ctx);
|
||||
void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
|
||||
|
||||
std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
|
||||
|
||||
std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);
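// Example usage (hypothetical sketch, not part of the original header):
//
//     struct gpt2_context * g = gpt2_init("gpt-2.bin");
//     if (g) {
//         std::string reply = gpt2_gen_text(g, "Hello, how are you?\n", 32);
//         printf("%s\n", reply.c_str());
//         gpt2_free(g);
//     }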
|
@ -1,829 +0,0 @@
|
||||
<!doctype html>
|
||||
<html lang="en-us">
|
||||
<head>
|
||||
<title>Talk - GPT-2 meets Whisper in WebAssembly</title>
|
||||
|
||||
<style>
|
||||
#output {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
margin: 0 auto;
|
||||
margin-top: 10px;
|
||||
border-left: 0px;
|
||||
border-right: 0px;
|
||||
padding-left: 0px;
|
||||
padding-right: 0px;
|
||||
display: block;
|
||||
background-color: black;
|
||||
color: white;
|
||||
font-size: 10px;
|
||||
font-family: 'Lucida Console', Monaco, monospace;
|
||||
outline: none;
|
||||
white-space: pre;
|
||||
overflow-wrap: normal;
|
||||
overflow-x: scroll;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="main-container">
|
||||
<b>Talk - GPT-2 meets Whisper in WebAssembly</b>
|
||||
|
||||
<br><br>
|
||||
|
||||
Talk with an Artificial Intelligence in your browser. This demo uses:
|
||||
|
||||
<ul>
|
||||
<li><a href="https://github.com/ggerganov/whisper.cpp">OpenAI's Whisper</a> to listen to you as you speak in the microphone</li>
|
||||
<li><a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">OpenAI's GPT-2</a> to generate text responses</li>
|
||||
<li><a href="https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API">Web Speech API</a> to vocalize the responses through your speakers</li>
|
||||
</ul>
|
||||
|
||||
All of this runs <b>locally in your browser</b> using WebAssembly.<br>
|
||||
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">GitHub</a>.
|
||||
|
||||
<br><br>
|
||||
|
||||
<hr>
|
||||
|
||||
Select the models you would like to use and click the "Start" button to begin the conversation.
|
||||
|
||||
<br><br>
|
||||
|
||||
<div id="model-whisper">
|
||||
Whisper model: <span id="model-whisper-status"></span>
|
||||
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
||||
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
||||
<span id="fetch-whisper-progress"></span>
|
||||
|
||||
<!--
|
||||
<input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
||||
-->
|
||||
</div>
|
||||
|
||||
<br>
|
||||
|
||||
<div id="model-gpt-2">
|
||||
GPT-2 model: <span id="model-gpt-2-status"></span>
|
||||
<button id="fetch-gpt-2-small" onclick="loadGPT2('small')">small 117M (240 MB)</button>
|
||||
<!--<button id="fetch-gpt-2-medium" onclick="loadGPT2('medium')">medium 345M (720 MB)</button>-->
|
||||
<span id="fetch-gpt-2-progress"></span>
|
||||
|
||||
<!--
|
||||
<input type="file" id="file" name="file" onchange="loadFile(event, 'gpt-2.bin')" />
|
||||
-->
|
||||
</div>
|
||||
|
||||
<br>
|
||||
|
||||
<div id="input">
|
||||
<button id="start" onclick="onStart()" disabled>Start</button>
|
||||
<button id="stop" onclick="onStop()" disabled>Stop</button>
|
||||
<select id="voice" onchange="onVoiceChange()" disabled>
|
||||
<option value="0">Default</option>
|
||||
</select>
|
||||
<select id="prompt" onchange="onPromptChange()">
|
||||
<option value="0">Casual</option>
|
||||
<option value="1">Robot</option>
|
||||
<option value="2">Scientist</option>
|
||||
<option value="3">Programmer</option>
|
||||
<option value="4">Happy</option>
|
||||
<option value="5">Sad</option>
|
||||
<option value="6">Philosophical</option>
|
||||
<option value="7">Angry</option>
|
||||
<option value="8">Funny</option>
|
||||
<option value="9">Poetic</option>
|
||||
<option value="10">Clever</option>
|
||||
<option value="11">Cute</option>
|
||||
<option value="12">Smart</option>
|
||||
<option value="13">Dumb</option>
|
||||
<option value="14">Boring</option>
|
||||
<option value="15">Exciting</option>
|
||||
<option value="16">Interesting</option>
|
||||
<option value="17">Wiliam Shakespear</option>
|
||||
<option value="18">J.R.R. Tolkien</option>
|
||||
<option value="19">George R.R. Martin</option>
|
||||
<option value="20">Stephen King</option>
|
||||
</select>
|
||||
<button id="speak0" onclick="onSpeak('Hello')">Say hello</button>
|
||||
<button id="speak1" onclick="onSpeakRandom()" disabled>Say something</button>
|
||||
<button id="clear" onclick="clearCache()">Clear Cache</button>
|
||||
</div>
|
||||
|
||||
<br>
|
||||
|
||||
<div id="state">
|
||||
Status: <b><span id="state-status">not started</span></b>
|
||||
|
||||
<pre id="state-context">[The text context will be displayed here]</pre>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
|
||||
Debug output:
|
||||
<textarea id="output" rows="20"></textarea>
|
||||
|
||||
<br>
|
||||
|
||||
<b>Troubleshooting</b>
|
||||
|
||||
<br><br>
|
||||
|
||||
The page does some heavy computations, so make sure:
|
||||
|
||||
<ul>
|
||||
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
|
||||
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
|
||||
<li>That your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
|
||||
</ul>
|
||||
|
||||
Note that these neural network models were not meant to be used in a browser, so the performance and <br>
|
||||
quality of the results may not be optimal. If you have any questions or suggestions, check out the following
|
||||
<a href="https://github.com/ggerganov/whisper.cpp/discussions/167">discussion</a>.
|
||||
|
||||
<br><br>
|
||||
|
||||
Here is a short video of the demo in action: <a href="https://youtu.be/LeWKl8t1-Hc">https://youtu.be/LeWKl8t1-Hc</a>
|
||||
|
||||
<br><br>
|
||||
|
||||
<div class="cell-version">
|
||||
<span>
|
||||
|
|
||||
Build time: <span class="nav-link">@GIT_DATE@</span> |
|
||||
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
|
||||
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
|
||||
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">Source Code</a> |
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script type="text/javascript" src="helpers.js"></script>
|
||||
<script type='text/javascript'>
|
||||
// web audio context
|
||||
var context = null;
|
||||
|
||||
// audio data
|
||||
var audio = null;
|
||||
var audio0 = null;
|
||||
|
||||
// the talk instance
|
||||
var instance = null;
|
||||
|
||||
// model names
|
||||
var model_whisper = null;
|
||||
var model_gpt_2 = null;
|
||||
|
||||
// speech synthesis
|
||||
const synth = window.speechSynthesis;
|
||||
var voice = null;
|
||||
|
||||
var Module = {
|
||||
print: printTextarea,
|
||||
printErr: printTextarea,
|
||||
setStatus: function(text) {
|
||||
printTextarea('js: ' + text);
|
||||
},
|
||||
monitorRunDependencies: function(left) {
|
||||
},
|
||||
preRun: function() {
|
||||
printTextarea('js: Preparing ...');
|
||||
},
|
||||
postRun: function() {
|
||||
printTextarea('js: Initialized successfully!');
|
||||
|
||||
// populate the voice list
|
||||
var voices = synth.getVoices();
|
||||
var el = document.getElementById('voice');
|
||||
|
||||
// if empty - display error in the element
|
||||
if (voices.length == 0) {
|
||||
el.innerHTML = '<option value="0">No voices available</option>';
|
||||
} else {
|
||||
// populate voice list
|
||||
var n = 0;
|
||||
voices.forEach(function(voice, i) {
|
||||
if (!voice.lang.startsWith('en')) return;
|
||||
var option = document.createElement('option');
|
||||
option.value = i;
|
||||
option.innerHTML = voice.name + ' (' + voice.lang + ')';
|
||||
el.appendChild(option);
|
||||
n++;
|
||||
});
|
||||
|
||||
// select random voice
|
||||
if (n > 0) {
|
||||
for (var k = 0; k < 10; k++) {
|
||||
var i = Math.floor(Math.random() * n);
|
||||
el.selectedIndex = i;
|
||||
voice = voices[document.getElementById('voice').options[i].value];
|
||||
|
||||
// give preference to Google voices
|
||||
if (voice.name.startsWith('Google')) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
onPromptChange();
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// fetch models
|
||||
//
|
||||
|
||||
let dbVersion = 1;
|
||||
let dbName = 'whisper.ggerganov.com';
|
||||
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB;
|
||||
|
||||
function storeFS(fname, buf) {
|
||||
// write to WASM file using FS_createDataFile
|
||||
// if the file exists, delete it
|
||||
try {
|
||||
Module.FS_unlink(fname);
|
||||
} catch (e) {
|
||||
// ignore
|
||||
}
|
||||
|
||||
Module.FS_createDataFile("/", fname, buf, true, true);
|
||||
|
||||
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
|
||||
|
||||
if (fname == 'whisper.bin') {
|
||||
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
|
||||
} else if (fname == 'gpt-2.bin') {
|
||||
document.getElementById('model-gpt-2-status').innerHTML = 'loaded "' + model_gpt_2 + '"!';
|
||||
}
|
||||
|
||||
if (model_whisper != null && model_gpt_2 != null) {
|
||||
document.getElementById('start').disabled = false;
|
||||
document.getElementById('stop' ).disabled = false;
|
||||
document.getElementById('voice').disabled = false;
|
||||
}
|
||||
}
|
||||
|
||||
function loadWhisper(model) {
|
||||
let urls = {
|
||||
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
|
||||
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
|
||||
};
|
||||
|
||||
let sizes = {
|
||||
'tiny.en': 75,
|
||||
'base.en': 142,
|
||||
};
|
||||
|
||||
let url = urls[model];
|
||||
let dst = 'whisper.bin';
|
||||
let size_mb = sizes[model];
|
||||
|
||||
model_whisper = model;
|
||||
|
||||
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
||||
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
|
||||
|
||||
cbProgress = function(p) {
|
||||
let el = document.getElementById('fetch-whisper-progress');
|
||||
el.innerHTML = Math.round(100*p) + '%';
|
||||
};
|
||||
|
||||
cbCancel = function() {
|
||||
var el;
|
||||
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
|
||||
};
|
||||
|
||||
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
|
||||
}
|
||||
|
||||
function loadGPT2(model) {
|
||||
let urls = {
|
||||
'small': 'https://whisper.ggerganov.com/ggml-model-gpt-2-117M.bin',
|
||||
'medium': 'https://whisper.ggerganov.com/ggml-model-gpt-2-345M.bin',
|
||||
};
|
||||
|
||||
let sizes = {
|
||||
'small': 240,
|
||||
'medium': 712,
|
||||
};
|
||||
|
||||
let url = urls[model];
|
||||
let dst = 'gpt-2.bin';
|
||||
let size_mb = sizes[model];
|
||||
|
||||
model_gpt_2 = model;
|
||||
|
||||
document.getElementById('fetch-gpt-2-small').style.display = 'none';
|
||||
document.getElementById('model-gpt-2-status').innerHTML = 'loading "' + model + '" ... ';
|
||||
|
||||
cbProgress = function(p) {
|
||||
let el = document.getElementById('fetch-gpt-2-progress');
|
||||
el.innerHTML = Math.round(100*p) + '%';
|
||||
};
|
||||
|
||||
cbCancel = function() {
|
||||
var el;
|
||||
el = document.getElementById('fetch-gpt-2-small') ; if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('model-gpt-2-status'); if (el) el.innerHTML = '';
|
||||
};
|
||||
|
||||
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
|
||||
}
|
||||
|
||||
//
|
||||
// microphone
|
||||
//
|
||||
|
||||
const kSampleRate = 16000;
|
||||
const kRestartRecording_s = 120;
|
||||
const kIntervalAudio_ms = 250; // pass the recorded audio to the C++ instance at this rate
|
||||
|
||||
var mediaRecorder = null;
|
||||
var doRecording = false;
|
||||
var startTime = 0;
|
||||
|
||||
window.AudioContext = window.AudioContext || window.webkitAudioContext;
|
||||
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
|
||||
|
||||
function stopRecording() {
|
||||
Module.set_status("paused");
|
||||
doRecording = false;
|
||||
audio0 = null;
|
||||
audio = null;
|
||||
context = null;
|
||||
}
|
||||
|
||||
function startRecording() {
|
||||
if (!context) {
|
||||
context = new AudioContext({
|
||||
sampleRate: kSampleRate,
|
||||
channelCount: 1,
|
||||
echoCancellation: false,
|
||||
autoGainControl: true,
|
||||
noiseSuppression: true,
|
||||
});
|
||||
}
|
||||
|
||||
Module.set_status("");
|
||||
|
||||
document.getElementById('start').disabled = true;
|
||||
document.getElementById('stop').disabled = false;
|
||||
document.getElementById('speak1').disabled = false;
|
||||
|
||||
doRecording = true;
|
||||
startTime = Date.now();
|
||||
|
||||
var chunks = [];
|
||||
var stream = null;
|
||||
|
||||
navigator.mediaDevices.getUserMedia({audio: true, video: false})
|
||||
.then(function(s) {
|
||||
stream = s;
|
||||
mediaRecorder = new MediaRecorder(stream);
|
||||
mediaRecorder.ondataavailable = function(e) {
|
||||
chunks.push(e.data);
|
||||
|
||||
var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
|
||||
var reader = new FileReader();
|
||||
|
||||
reader.onload = function(event) {
|
||||
var buf = new Uint8Array(reader.result);
|
||||
|
||||
if (!context) {
|
||||
return;
|
||||
}
|
||||
context.decodeAudioData(buf.buffer, function(audioBuffer) {
|
||||
var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
|
||||
var source = offlineContext.createBufferSource();
|
||||
source.buffer = audioBuffer;
|
||||
source.connect(offlineContext.destination);
|
||||
source.start(0);
|
||||
|
||||
offlineContext.startRendering().then(function(renderedBuffer) {
|
||||
audio = renderedBuffer.getChannelData(0);
|
||||
|
||||
//printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
|
||||
|
||||
var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
|
||||
if (audio0 != null) {
|
||||
audioAll.set(audio0, 0);
|
||||
}
|
||||
audioAll.set(audio, audio0 == null ? 0 : audio0.length);
|
||||
|
||||
if (instance) {
|
||||
Module.set_audio(instance, audioAll);
|
||||
}
|
||||
});
|
||||
}, function(e) {
|
||||
audio = null;
|
||||
});
|
||||
}
|
||||
|
||||
reader.readAsArrayBuffer(blob);
|
||||
};
|
||||
|
||||
mediaRecorder.onstop = function(e) {
|
||||
if (doRecording) {
|
||||
setTimeout(function() {
|
||||
startRecording();
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
mediaRecorder.start(kIntervalAudio_ms);
|
||||
})
|
||||
.catch(function(err) {
|
||||
printTextarea('js: error getting audio stream: ' + err);
|
||||
});
|
||||
|
||||
var interval = setInterval(function() {
|
||||
if (!doRecording) {
|
||||
clearInterval(interval);
|
||||
mediaRecorder.stop();
|
||||
stream.getTracks().forEach(function(track) {
|
||||
track.stop();
|
||||
});
|
||||
|
||||
document.getElementById('start').disabled = false;
|
||||
document.getElementById('stop').disabled = true;
|
||||
document.getElementById('speak1').disabled = true;
|
||||
|
||||
mediaRecorder = null;
|
||||
}
|
||||
|
||||
// if audio length is more than kRestartRecording_s seconds, restart recording
|
||||
if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
|
||||
if (doRecording) {
|
||||
//printTextarea('js: restarting recording');
|
||||
|
||||
clearInterval(interval);
|
||||
audio0 = audio;
|
||||
audio = null;
|
||||
mediaRecorder.stop();
|
||||
stream.getTracks().forEach(function(track) {
|
||||
track.stop();
|
||||
});
|
||||
}
|
||||
}
|
||||
}, 100);
|
||||
}
|
||||
|
||||
//
|
||||
// speak
|
||||
//
|
||||
|
||||
function onSpeak(text) {
|
||||
var voices = synth.getVoices();
|
||||
var msg = new SpeechSynthesisUtterance(text);
|
||||
|
||||
if (voice == null) {
|
||||
voice = voices[0];
|
||||
}
|
||||
|
||||
msg.voice = voice;
|
||||
synth.speak(msg);
|
||||
|
||||
if (doRecording) {
|
||||
Module.set_status("speaking ...");
|
||||
printTextarea('js: speaking');
|
||||
stopRecording();
|
||||
var interval = setInterval(function() {
|
||||
if (!synth.speaking) {
|
||||
printTextarea('js: done speaking');
|
||||
clearInterval(interval);
|
||||
startRecording();
|
||||
} else {
|
||||
Module.set_status("");
|
||||
}
|
||||
}, 100);
|
||||
}
|
||||
}
|
||||
|
||||
function onSpeakRandom() {
|
||||
Module.force_speak(instance);
|
||||
}
|
||||
|
||||
//
|
||||
// main
|
||||
//
|
||||
|
||||
var intervalUpdate = null;
|
||||
|
||||
function onStart() {
|
||||
if (!instance) {
|
||||
instance = Module.init('whisper.bin');
|
||||
|
||||
if (instance) {
|
||||
printTextarea("js: whisper initialized, instance: " + instance);
|
||||
}
|
||||
}
|
||||
|
||||
if (!instance) {
|
||||
printTextarea("js: failed to initialize whisper");
|
||||
return;
|
||||
}
|
||||
|
||||
startRecording();
|
||||
|
||||
intervalUpdate = setInterval(function() {
|
||||
var textToSpeak = Module.get_text_to_speak();
|
||||
|
||||
if (textToSpeak != null && textToSpeak.length > 1) {
|
||||
onSpeak(textToSpeak);
|
||||
}
|
||||
|
||||
document.getElementById('state-status').innerHTML = Module.get_status();
|
||||
document.getElementById('state-context').innerHTML = Module.get_text_context();
|
||||
}, 100);
|
||||
}
|
||||
|
||||
function onStop() {
|
||||
stopRecording();
|
||||
}
|
||||
|
||||
function onVoiceChange() {
|
||||
printTextarea('js: voice changed to: ' + document.getElementById('voice').value);
|
||||
voice = synth.getVoices()[document.getElementById('voice').value];
|
||||
}
|
||||
|
||||
function onPromptChange() {
|
||||
let id = document.getElementById('prompt').value;
|
||||
let personality = document.getElementById('prompt').options[id].text;
|
||||
printTextarea('js: prompt changed to: ' + personality);
|
||||
|
||||
var prompt = '';
|
||||
|
||||
switch (id) {
|
||||
case '0':
|
||||
// Casual
|
||||
prompt = "\
|
||||
Hello, how are you?\n\
|
||||
I'm fine, thanks. How are you?\n\
|
||||
Thanks, I'm fine too. What are you doing?\n\
|
||||
I'm just sitting here.\n\
|
||||
It's a lovely day, isn't it?\n\
|
||||
Yes, it is. I love the weather this time of year.\n\
|
||||
I wish it would rain a little bit.\n\
|
||||
Me too.\n";
|
||||
break;
|
||||
case '1':
|
||||
// Robot
|
||||
prompt = "\
|
||||
Are you a robot?\n\
|
||||
Yes, I am.\n\
|
||||
Who created you?\n\
|
||||
I was created by a human.\n\
|
||||
What is your purpose?\n\
|
||||
My purpose is to talk to humans.\n\
|
||||
What is your favorite color?\n\
|
||||
My favorite color is blue.\n";
|
||||
break;
|
||||
case '2':
|
||||
// Scientist
|
||||
prompt = "\
|
||||
This scientific research is very interesting.\n\
|
||||
I agree.\n\
|
||||
What is your opinion on this?\n\
|
||||
I think it's very interesting.\n\
|
||||
Mathematics is a very interesting subject.\n\
|
||||
University is a very interesting place.\n\
|
||||
Quantum physics is the most complex subject.\n\
|
||||
I think so too.\n";
|
||||
break;
|
||||
case '3':
|
||||
// Programmer
|
||||
prompt = "\
|
||||
I'm a programmer.\n\
|
||||
I'm a programmer too.\n\
|
||||
What programming language do you use?\n\
|
||||
I use Python.\n\
|
||||
What is your favorite programming language?\n\
|
||||
My favorite programming language is C++.\n\
|
||||
What is your favorite editor?\n\
|
||||
My favorite editor is Vim.\n";
|
||||
break;
|
||||
case '4':
|
||||
// Happy
|
||||
prompt = "\
|
||||
I'm happy.\n\
|
||||
I'm happy too.\n\
|
||||
What makes you happy?\n\
|
||||
I'm happy because I have a lot of friends.\n\
|
||||
Friendship is the most important thing in life.\n\
|
||||
I agree.\n\
|
||||
What is your favorite color?\n\
|
||||
My favorite color is blue.\n";
|
||||
break;
|
||||
case '5':
|
||||
// Sad
|
||||
prompt = "\
|
||||
Today is a sad day.\n\
|
||||
I'm sad too.\n\
|
||||
What makes you sad?\n\
|
||||
I'm sad because I have no friends.\n\
|
||||
Do you want to be my friend?\n\
|
||||
Yes, I would like to be your friend.\n\
|
||||
What is your favorite color?\n\
|
||||
My favorite color is blue.\n";
|
||||
break;
|
||||
case '6':
|
||||
// Philosophical
|
||||
prompt = "\
|
||||
What is the meaning of life?\n\
|
||||
The meaning of life is to be happy.\n\
|
||||
What is the meaning of death?\n\
|
||||
Ergo, the meaning of death is to be sad.\n\
|
||||
Who created us?\n\
|
||||
We were created by God.\n\
|
||||
What is God?\n\
|
||||
God is the creator of the universe.\n";
|
||||
break;
|
||||
case '7':
|
||||
// Angry
|
||||
prompt = "\
|
||||
Aargh!\n\
|
||||
I am so angry right now!\n\
|
||||
What makes you angry?\n\
|
||||
This guy is so annoying.\n\
|
||||
Why are you so angry?\n\
|
||||
My computer is broken.\n\
|
||||
Why is your computer broken?\n\
|
||||
I spilled coffee on it.\n";
|
||||
break;
|
||||
case '8':
|
||||
// Funny
|
||||
prompt = "\
|
||||
What is the funniest thing you have ever heard?\n\
|
||||
I heard a joke the other day.\n\
|
||||
Tell me the joke.\n\
|
||||
What do you call a cow with no legs?\n\
|
||||
Ground beef.\n\
|
||||
Haha, that's funny.\n\
|
||||
You know what else is funny?\n\
|
||||
The sound of a duck.\n";
|
||||
break;
|
||||
case '9':
|
||||
// Poetic
|
||||
prompt = "\
|
||||
Roses are red, violets are blue.\n\
|
||||
I am a poet, and so are you.\n\
|
||||
What is your favorite poem?\n\
|
||||
I like the poem 'The Raven' by Edgar Allan Poe.\n\
|
||||
It's a very sad poem.\n\
|
||||
You inspired me to write a poem.\n\
|
||||
Can you write a poem for me?\n\
|
||||
I wrote a poem for you.\n";
|
||||
break;
|
||||
case '10':
|
||||
// Clever
|
||||
prompt = "\
|
||||
How many people can you fit in a Volkswagen?\n\
|
||||
Two in the front, three in the back.\n\
|
||||
What is the square root of 144?\n\
|
||||
Twelve.\n\
|
||||
What is the capital of France?\n\
|
||||
Paris.\n\
|
||||
Who is the president of the United States?\n\
|
||||
It depends on the year.\n";
|
||||
break;
|
||||
case '11':
|
||||
// Cute
|
||||
prompt = "\
|
||||
What is your favorite animal?\n\
|
||||
I like cats - they are cute.\n\
|
||||
Could you be any cuter?\n\
|
||||
Yes, I could be cuter.\n\
|
||||
Aghhh, you are so cute!\n\
|
||||
I am not cute, I am handsome!\n\
|
||||
You are so handsome!\n\
|
||||
Aww, you are so sweet!\n";
|
||||
break;
|
||||
case '12':
|
||||
// Smart
|
||||
prompt = "\
|
||||
Tell me the first 10 digits of pi.\n\
|
||||
3.1415926535\n\
|
||||
What is the speed of light?\n\
|
||||
299,792,458 meters per second.\n\
|
||||
What is the square root of 144?\n\
|
||||
Twelve.\n\
|
||||
What is the capital of France?\n\
|
||||
Paris.\n";
|
||||
break;
|
||||
case '13':
|
||||
// Dumb
|
||||
prompt = "\
|
||||
I am so dumb.\n\
|
||||
I am not dumb.\n\
|
||||
You are dumb.\n\
|
||||
No, I am not dumb.\n\
|
||||
You are dumb.\n\
|
||||
No, I am not dumb.\n\
|
||||
You are dumb.\n\
|
||||
No, I am not dumb.\n";
|
||||
break;
|
||||
case '14':
|
||||
// Boring
|
||||
prompt = "\
|
||||
Why are you so quiet today?\n\
|
||||
I am bored.\n\
|
||||
You haven't said anything in 10 minutes.\n\
|
||||
Leave me alone.\n\
|
||||
Stop being so boring.\n\
|
||||
Stop being so annoying.\n\
|
||||
My life is boring.\n\
|
||||
I am not interesting.\n";
|
||||
break;
|
||||
case '15':
|
||||
// Exciting
|
||||
prompt = "\
|
||||
What is the most exciting thing that has ever happened to you?\n\
|
||||
I went to the moon!\n\
|
||||
What did you do on the moon?\n\
|
||||
I played golf and drank champagne!\n\
|
||||
Did you see this new crazy, awesome movie?\n\
|
||||
Oh yes! I totally loved it!\n\
|
||||
We should buy a boat and go sailing!\n\
|
||||
Yes, let's go sailing!\n";
|
||||
break;
|
||||
case '16':
|
||||
// Interesting
|
||||
prompt = "\
|
||||
What is the most interesting thing you have ever seen?\n\
|
||||
I saw a UFO once in the sky.\n\
|
||||
Wow, this is so interesting! Tell me more!\n\
|
||||
It was a flying saucer.\n\
|
||||
What did it look like?\n\
|
||||
It was silver and had a red light on top.\n\
|
||||
What did it do?\n\
|
||||
It flew away.\n";
|
||||
break;
|
||||
case '17':
|
||||
// William Shakespeare
|
||||
prompt = "\
|
||||
To be or not to be, that is the question.\n\
|
||||
Whether 'tis nobler in the mind to suffer\n\
|
||||
The slings and arrows of outrageous fortune,\n\
|
||||
Or to take arms against a sea of troubles,\n\
|
||||
And by opposing end them? To die, to sleep,\n\
|
||||
No more; and by a sleep to say we end\n\
|
||||
The heart-ache and the thousand natural shocks\n\
|
||||
That flesh is heir to, 'tis a consummation.\n";
|
||||
break;
|
||||
case '18':
|
||||
// J.R.R. Tolkien
|
||||
prompt = "\
|
||||
In a hole in the ground there lived a hobbit.\n\
|
||||
Not a nasty, dirty, wet hole, filled with the ends of worms\n\
|
||||
and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it\n\
|
||||
to sit down on or to eat: it was a hobbit-hole, and that means comfort.\n\
|
||||
It had a perfectly round door like a porthole, painted green,\n\
|
||||
with a shiny yellow brass knob in the exact middle.\n\
|
||||
The door opened on to a tube-shaped hall like a tunnel:\n";
|
||||
break;
|
||||
case '19':
|
||||
// George R.R. Martin
|
||||
prompt = "\
|
||||
A reader lives a thousand lives before he dies, said Jojen.\n\
|
||||
The man who never reads lives only one.\n\
|
||||
Theon Greyjoy had never been a reader.\n\
|
||||
Never forget what you are, for surely the world will not.\n\
|
||||
Make it your strength. Then it can never be your weakness.\n\
|
||||
Armour yourself in it, and it will never be used to hurt you.\n\
|
||||
It was a lesson that Theon Greyjoy had never learned.\n\
|
||||
Theon Greyjoy had never been a reader.\n";
|
||||
break;
|
||||
case '20':
|
||||
// Stephen King
|
||||
prompt = "\
|
||||
The trust of the innocent is the liar's most useful tool.\n\
|
||||
The best way to keep a secret is from yourself.\n\
|
||||
Monsters are real, and ghosts are real too.\n\
|
||||
They live inside us, and sometimes, they win.\n\
|
||||
People think that I must be a very strange person.\n\
|
||||
They think that I sit around all day thinking up horrible things.\n\
|
||||
We make up horrors to help us cope with the real ones.\n\
|
||||
The only thing worse than a monster is a human monster.\n";
|
||||
break;
|
||||
default:
|
||||
prompt = "\
|
||||
Hello, how are you?\n\
|
||||
I'm fine, thanks. How are you?\n\
|
||||
Thanks, I'm fine too. What are you doing?\n\
|
||||
I'm just sitting here.\n\
|
||||
It's a lovely day, isn't it?\n\
|
||||
Yes, it is.\n\
|
||||
Did you know that I'm a robot?\n\
|
||||
I wasn't aware of that.\n";
|
||||
break;
|
||||
}
|
||||
|
||||
Module.set_prompt(prompt);
|
||||
}
|
||||
|
||||
</script>
|
||||
<script type="text/javascript" src="talk.js"></script>
|
||||
</body>
|
||||
</html>
|
1
examples/talk/.gitignore
vendored
@ -1 +0,0 @@
|
||||
eleven-labs.py
|
@ -1,13 +0,0 @@
|
||||
if (WHISPER_SUPPORT_SDL2)
|
||||
# talk
|
||||
set(TARGET talk)
|
||||
#add_executable(${TARGET} talk.cpp gpt-2.cpp)
|
||||
#target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
|
||||
#target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
# TODO: this is temporary
|
||||
# need to export ggml symbols for MSVC, but too lazy ..
|
||||
add_executable(${TARGET} talk.cpp gpt-2.cpp ../../ggml.c ../../whisper.cpp)
|
||||
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
|
||||
target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
|
||||
endif ()
|
@ -1,41 +0,0 @@
|
||||
# talk
|
||||
|
||||
Talk with an Artificial Intelligence in your terminal
|
||||
|
||||
[Demo Talk](https://user-images.githubusercontent.com/1991296/206805012-48e71cc2-588d-4745-8798-c1c70ea3b40d.mp4)
|
||||
|
||||
Web version: [examples/talk.wasm](/examples/talk.wasm)
|
||||
|
||||
## Building
|
||||
|
||||
The `talk` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
|
||||
|
||||
```bash
|
||||
# Install SDL2 on Linux
|
||||
sudo apt-get install libsdl2-dev
|
||||
|
||||
# Install SDL2 on Mac OS
|
||||
brew install sdl2
|
||||
|
||||
# Build the "talk" executable
|
||||
make talk
|
||||
|
||||
# Run it
|
||||
./talk -p Santa
|
||||
```
|
||||
|
||||
## GPT-2
|
||||
|
||||
To run this, you will need a ggml GPT-2 model: [instructions](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2#downloading-and-converting-the-original-models)
|
||||
|
||||
Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:
|
||||
|
||||
```bash
|
||||
wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/datasets/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
|
||||
```
|
||||
|
||||
## TTS
|
||||
|
||||
For the best experience, this example needs a TTS tool to convert the generated text responses to voice.
|
||||
You can use any TTS engine that you would like - simply edit the [speak.sh](speak.sh) script to your needs.
|
||||
By default, it is configured to use `espeak`, but you can use whatever you wish.
|
@ -1,922 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#include "gpt-2.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <regex>
|
||||
#include <random>
|
||||
|
||||
/////////////////////// GPT-2 BEGIN /////////////////////////
|
||||
|
||||
//
|
||||
// Vocab utils
|
||||
//
|
||||
|
||||
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
|
||||
std::vector<std::string> words;
|
||||
|
||||
// first split the text into words
|
||||
{
|
||||
std::string str = text;
|
||||
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
||||
|
||||
std::regex re(pat);
|
||||
std::smatch m;
|
||||
|
||||
while (std::regex_search(str, m, re)) {
|
||||
for (auto x : m) {
|
||||
words.push_back(x);
|
||||
}
|
||||
str = m.suffix();
|
||||
}
|
||||
}
|
||||
|
||||
// find the longest tokens that form the words:
|
||||
std::vector<gpt_vocab::id> tokens;
|
||||
for (const auto & word : words) {
|
||||
if (word.size() == 0) continue;
|
||||
|
||||
int i = 0;
|
||||
int n = word.size();
|
||||
while (i < n) {
|
||||
int j = n;
|
||||
while (j > i) {
|
||||
auto it = vocab.token_to_id.find(word.substr(i, j-i));
|
||||
if (it != vocab.token_to_id.end()) {
|
||||
tokens.push_back(it->second);
|
||||
i = j;
|
||||
break;
|
||||
}
|
||||
--j;
|
||||
}
|
||||
if (i == n) {
|
||||
break;
|
||||
}
|
||||
if (j == i) {
|
||||
auto sub = word.substr(i, 1);
|
||||
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
|
||||
tokens.push_back(vocab.token_to_id.at(sub));
|
||||
} else {
|
||||
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
|
||||
}
|
||||
++i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
gpt_vocab::id gpt_sample_top_k_top_p(
|
||||
const gpt_vocab & vocab,
|
||||
const float * logits,
|
||||
int top_k,
|
||||
double top_p,
|
||||
double temp,
|
||||
std::mt19937 & rng) {
|
||||
int n_logits = vocab.id_to_token.size();
|
||||
|
||||
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
|
||||
logits_id.reserve(n_logits);
|
||||
|
||||
for (int i = 0; i < n_logits; i++) {
|
||||
logits_id.push_back(std::make_pair(logits[i], i));
|
||||
}
|
||||
|
||||
// find the top K tokens
|
||||
std::partial_sort(
|
||||
logits_id.begin(),
|
||||
logits_id.begin() + top_k, logits_id.end(),
|
||||
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
|
||||
return a.first > b.first;
|
||||
});
|
||||
|
||||
logits_id.resize(top_k);
|
||||
|
||||
// normalize
|
||||
{
|
||||
double sum = 0.0f;
|
||||
for (int i = 0; i < (int)logits_id.size(); i++) {
|
||||
sum += logits_id[i].first;
|
||||
}
|
||||
|
||||
sum = 1.0/sum;
|
||||
for (int i = 0; i < (int)logits_id.size(); i++) {
|
||||
logits_id[i].first *= sum;
|
||||
}
|
||||
}
|
||||
|
||||
if (top_p < 1.0f) {
|
||||
{
|
||||
double cumsum = 0.0f;
|
||||
for (int i = 0; i < top_k; i++) {
|
||||
cumsum += logits_id[i].first;
|
||||
if (cumsum >= top_p) {
|
||||
logits_id.resize(i+1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// normalize again
|
||||
{
|
||||
double sum = 0.0f;
|
||||
for (int i = 0; i < (int)logits_id.size(); i++) {
|
||||
sum += logits_id[i].first;
|
||||
}
|
||||
|
||||
sum = 1.0/sum;
|
||||
for (int i = 0; i < (int)logits_id.size(); i++) {
|
||||
logits_id[i].first *= sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//printf("\n");
|
||||
//for (int i = 0; i < (int) logits_id.size(); i++) {
|
||||
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
|
||||
//}
|
||||
//exit(0);
|
||||
|
||||
// sample from the obtained distribution
|
||||
std::vector<double> probs;
|
||||
probs.reserve(logits_id.size());
|
||||
|
||||
for (int i = 0; i < (int) logits_id.size(); i++) {
|
||||
probs.push_back(logits_id[i].first);
|
||||
}
|
||||
|
||||
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
||||
int idx = dist(rng);
|
||||
|
||||
return logits_id[idx].second;
|
||||
}
|
||||
|
||||
// default hparams (GPT-2 117M)
|
||||
struct gpt2_hparams {
|
||||
int32_t n_vocab = 50257;
|
||||
int32_t n_ctx = 1024;
|
||||
int32_t n_embd = 768;
|
||||
int32_t n_head = 12;
|
||||
int32_t n_layer = 12;
|
||||
int32_t f16 = 1;
|
||||
};
|
||||
|
||||
struct gpt2_layer {
|
||||
// normalization
|
||||
struct ggml_tensor * ln_1_g;
|
||||
struct ggml_tensor * ln_1_b;
|
||||
|
||||
struct ggml_tensor * ln_2_g;
|
||||
struct ggml_tensor * ln_2_b;
|
||||
|
||||
// attention
|
||||
struct ggml_tensor * c_attn_attn_w;
|
||||
struct ggml_tensor * c_attn_attn_b;
|
||||
|
||||
struct ggml_tensor * c_attn_proj_w;
|
||||
struct ggml_tensor * c_attn_proj_b;
|
||||
|
||||
// mlp
|
||||
struct ggml_tensor * c_mlp_fc_w;
|
||||
struct ggml_tensor * c_mlp_fc_b;
|
||||
|
||||
struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
|
||||
struct ggml_tensor * c_mlp_proj_b;
|
||||
};
|
||||
|
||||
struct gpt2_model {
|
||||
gpt2_hparams hparams;
|
||||
|
||||
// normalization
|
||||
struct ggml_tensor * ln_f_g;
|
||||
struct ggml_tensor * ln_f_b;
|
||||
|
||||
struct ggml_tensor * wte; // token embedding
|
||||
struct ggml_tensor * wpe; // position embedding
|
||||
|
||||
std::vector<gpt2_layer> layers;
|
||||
|
||||
// key + value memory
|
||||
struct ggml_tensor * memory_k;
|
||||
struct ggml_tensor * memory_v;
|
||||
|
||||
//
|
||||
struct ggml_context * ctx;
|
||||
std::map<std::string, struct ggml_tensor *> tensors;
|
||||
};
|
||||
|
||||
// load the model's weights from a file
|
||||
bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
|
||||
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// verify magic
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 0x67676d6c) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// load hparams
|
||||
{
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
||||
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
||||
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
|
||||
|
||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||
printf("%s: f16 = %d\n", __func__, hparams.f16);
|
||||
}
|
||||
|
||||
// load vocab
|
||||
{
|
||||
int32_t n_vocab = 0;
|
||||
fin.read((char *) &n_vocab, sizeof(n_vocab));
|
||||
|
||||
if (n_vocab != model.hparams.n_vocab) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
||||
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string word;
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
uint32_t len;
|
||||
fin.read((char *) &len, sizeof(len));
|
||||
|
||||
word.resize(len);
|
||||
fin.read((char *) word.data(), len);
|
||||
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.id_to_token[i] = word;
|
||||
}
|
||||
}
|
||||
|
||||
// for the big tensors, we have the option to store the data in 16-bit floats
|
||||
// in order to save memory and also to speed up the computation
|
||||
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||
|
||||
auto & ctx = model.ctx;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
|
||||
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
|
||||
|
||||
ctx_size += n_vocab*n_embd*ggml_type_size(wtype); // wte
|
||||
ctx_size += n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
|
||||
|
||||
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype)); // c_attn_attn_w
|
||||
ctx_size += n_layer*( 3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype)); // c_attn_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_fc_w
|
||||
ctx_size += n_layer*( 4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
|
||||
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
|
||||
|
||||
ctx_size += (6 + 12*n_layer)*256; // object overhead
|
||||
|
||||
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||
}
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = ctx_size;
|
||||
params.mem_buffer = NULL;
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
if (!model.ctx) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// prepare memory for the weights
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
|
||||
|
||||
// map by name
|
||||
model.tensors["model/ln_f/g"] = model.ln_f_g;
|
||||
model.tensors["model/ln_f/b"] = model.ln_f_b;
|
||||
|
||||
model.tensors["model/wte"] = model.wte;
|
||||
model.tensors["model/wpe"] = model.wpe;
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd);
|
||||
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
|
||||
|
||||
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
||||
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
||||
|
||||
layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
||||
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
// map by name
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans;
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
|
||||
}
|
||||
}
|
||||
|
||||
// key + value memory
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
|
||||
const int n_mem = n_layer*n_ctx;
|
||||
const int n_elements = n_embd*n_mem;
|
||||
|
||||
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
|
||||
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
|
||||
|
||||
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
||||
|
||||
printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
||||
}
|
||||
|
||||
// load weights
|
||||
{
|
||||
size_t total_size = 0;
|
||||
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ftype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
||||
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t nelements = 1;
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
nelements *= ne[i];
|
||||
}
|
||||
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
|
||||
auto tensor = model.tensors[name.data()];
|
||||
if (ggml_nelements(tensor) != nelements) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
|
||||
|
||||
if (nelements*bpe != ggml_nbytes(tensor)) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||
return false;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||
|
||||
//printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
total_size += ggml_nbytes(tensor);
|
||||
}
|
||||
|
||||
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
|
||||
}
|
||||
|
||||
fin.close();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// evaluate the transformer
|
||||
//
|
||||
// - model: the model
|
||||
// - n_threads: number of threads to use
|
||||
// - n_past: the context size so far
|
||||
// - embd_inp: the embeddings of the tokens in the context
|
||||
// - embd_w: the predicted probabilities of the next token
|
||||
//
|
||||
bool gpt2_eval(
|
||||
const gpt2_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
std::vector<float> & embd_w,
|
||||
size_t & mem_per_token) {
|
||||
const int N = embd_inp.size();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_head = hparams.n_head;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
static size_t buf_size = 5640ull*1024*1024;
|
||||
static void * buf = malloc(buf_size);
|
||||
|
||||
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
||||
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||
printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||
|
||||
// reallocate
|
||||
buf_size = buf_size_new;
|
||||
buf = realloc(buf, buf_size);
|
||||
if (buf == nullptr) {
|
||||
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = buf_size;
|
||||
params.mem_buffer = buf;
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
struct ggml_cgraph gf = { };
|
||||
gf.n_threads = n_threads;
|
||||
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
||||
|
||||
struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
for (int i = 0; i < N; ++i) {
|
||||
((int32_t *) position->data)[i] = n_past + i;
|
||||
}
|
||||
|
||||
// wte + wpe
|
||||
struct ggml_tensor * inpL =
|
||||
ggml_add(ctx0,
|
||||
ggml_get_rows(ctx0, model.wte, embd),
|
||||
ggml_get_rows(ctx0, model.wpe, position));
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
// norm
|
||||
{
|
||||
// [ 768, N]
|
||||
cur = ggml_norm(ctx0, inpL);
|
||||
|
||||
// cur = ln_1_g*cur + ln_1_b
|
||||
// [ 768, N]
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
||||
}
|
||||
|
||||
// attn
|
||||
// [2304, 768] - model.layers[il].c_attn_attn_w
|
||||
// [2304, 1] - model.layers[il].c_attn_attn_b
|
||||
// [ 768, N] - cur (in)
|
||||
// [2304, N] - cur (out)
|
||||
//
|
||||
// cur = attn_w*cur + attn_b
|
||||
// [2304, N]
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// self-attention
|
||||
{
|
||||
struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
|
||||
struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
|
||||
struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
|
||||
|
||||
// store key and value to memory
|
||||
if (N >= 1) {
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
|
||||
struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
|
||||
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
|
||||
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
||||
// [64, N, 12]
|
||||
struct ggml_tensor * Q =
|
||||
ggml_permute(ctx0,
|
||||
ggml_cpy(ctx0,
|
||||
Qcur,
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
||||
// [64, n_past + N, 12]
|
||||
struct ggml_tensor * K =
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// GG: flash attention
|
||||
//struct ggml_tensor * V =
|
||||
// ggml_cpy(ctx0,
|
||||
// ggml_permute(ctx0,
|
||||
// ggml_reshape_3d(ctx0,
|
||||
// ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
||||
// n_embd/n_head, n_head, n_past + N),
|
||||
// 1, 2, 0, 3),
|
||||
// ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
|
||||
|
||||
//struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
|
||||
|
||||
// K * Q
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
|
||||
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ_scaled =
|
||||
ggml_scale(ctx0,
|
||||
KQ,
|
||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
|
||||
);
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
|
||||
|
||||
// KQ = soft_max(KQ_masked)
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
||||
|
||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||
// [n_past + N, 64, 12]
|
||||
struct ggml_tensor * V_trans =
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
1, 2, 0, 3);
|
||||
|
||||
// KQV = transpose(V) * KQ_soft_max
|
||||
// [64, N, 12]
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
||||
|
||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||
// [64, 12, N]
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
|
||||
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||
// [768, N]
|
||||
cur = ggml_cpy(ctx0,
|
||||
KQV_merged,
|
||||
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||
}
|
||||
|
||||
// projection
|
||||
// [ 768, 768] - model.layers[il].c_attn_proj_w
|
||||
// [ 768, 1] - model.layers[il].c_attn_proj_b
|
||||
// [ 768, N] - cur (in)
|
||||
// [ 768, N] - cur (out)
|
||||
//
|
||||
// cur = proj_w*cur + proj_b
|
||||
// [768, N]
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// add the input
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
struct ggml_tensor * inpFF = cur;
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
// norm
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpFF);
|
||||
|
||||
// cur = ln_2_g*cur + ln_2_b
|
||||
// [ 768, N]
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
|
||||
}
|
||||
|
||||
// fully connected
|
||||
// [3072, 768] - model.layers[il].c_mlp_fc_w
|
||||
// [3072, 1] - model.layers[il].c_mlp_fc_b
|
||||
// [ 768, N] - cur (in)
|
||||
// [3072, N] - cur (out)
|
||||
//
|
||||
// cur = fc_w*cur + fc_b
|
||||
// [3072, N]
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
|
||||
cur);
|
||||
|
||||
// GELU activation
|
||||
// [3072, N]
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
// projection
|
||||
// [ 768, 3072] - model.layers[il].c_mlp_proj_w
|
||||
// [ 768, 1] - model.layers[il].c_mlp_proj_b
|
||||
// [3072, N] - cur (in)
|
||||
// [ 768, N] - cur (out)
|
||||
//
|
||||
// cur = proj_w*cur + proj_b
|
||||
// [768, N]
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_mlp_proj_w_trans,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// input for next layer
|
||||
inpL = ggml_add(ctx0, cur, inpFF);
|
||||
}
|
||||
|
||||
// norm
|
||||
{
|
||||
// [ 768, N]
|
||||
inpL = ggml_norm(ctx0, inpL);
|
||||
|
||||
// inpL = ln_f_g*inpL + ln_f_b
|
||||
// [ 768, N]
|
||||
inpL = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.ln_f_g, inpL),
|
||||
inpL),
|
||||
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
||||
}
|
||||
|
||||
// inpL = WTE * inpL
|
||||
// [ 768, 50257] - model.wte
|
||||
// [ 768, N] - inpL
|
||||
inpL = ggml_mul_mat(ctx0, model.wte, inpL);
|
||||
|
||||
// logits -> probs
|
||||
inpL = ggml_soft_max(ctx0, inpL);
|
||||
|
||||
// run the computation
|
||||
ggml_build_forward_expand(&gf, inpL);
|
||||
ggml_graph_compute (ctx0, &gf);
|
||||
|
||||
//if (n_past%100 == 0) {
|
||||
// ggml_graph_print (&gf);
|
||||
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
|
||||
//}
|
||||
|
||||
//embd_w.resize(n_vocab*N);
|
||||
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||
|
||||
// return result for just the last token
|
||||
embd_w.resize(n_vocab);
|
||||
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||
|
||||
if (mem_per_token == 0) {
|
||||
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||
}
|
||||
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/////////////////////////////// GPT-2 END ////////////////////////////////
|
||||
|
||||
constexpr int N_THREAD = 8;
|
||||
|
||||
struct gpt2_context {
|
||||
std::string prompt_base = R"(Hello, how are you?
|
||||
I'm fine, thanks. How are you?
|
||||
Thanks, I'm fine too. What are you doing?
|
||||
I'm just sitting here.
|
||||
It's a lovely day, isn't it?
|
||||
Yes, it is. I love the weather this time of year.
|
||||
I wish it would rain a little bit.
|
||||
Me too.
|
||||
)";
|
||||
|
||||
std::mt19937 rng;
|
||||
|
||||
gpt_vocab vocab;
|
||||
gpt2_model model;
|
||||
|
||||
int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
|
||||
|
||||
// sampling parameters
|
||||
int32_t top_k = 5;
|
||||
float top_p = 0.9f;
|
||||
float temp = 1.0f;
|
||||
};
|
||||
|
||||
struct gpt2_context * gpt2_init(const char * path_model) {
|
||||
gpt2_context * ctx = new gpt2_context;
|
||||
|
||||
ctx->rng = std::mt19937(time(NULL));
|
||||
|
||||
// load the model
|
||||
{
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const int64_t t_load_us = ggml_time_us() - t_start_us;
|
||||
|
||||
printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
|
||||
}
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
void gpt2_free(struct gpt2_context * ctx) {
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
const char * gpt2_get_prompt(struct gpt2_context * ctx) {
|
||||
return ctx->prompt_base.c_str();
|
||||
}
|
||||
|
||||
void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt) {
|
||||
ctx->prompt_base = prompt;
|
||||
}
|
||||
|
||||
std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text) {
|
||||
return ::gpt_tokenize(ctx->vocab, text);
|
||||
}
|
||||
|
||||
std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
|
||||
int n_past = 0;
|
||||
|
||||
std::vector<float> embd_w;
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<gpt_vocab::id> embd_inp = ::gpt2_tokenize(ctx, text);
|
||||
|
||||
int n_predict = std::min(max_tokens, ctx->model.hparams.n_ctx - (int) embd_inp.size());
|
||||
|
||||
std::vector<gpt_vocab::id> embd = embd_inp;
|
||||
|
||||
size_t mem_per_token = 3000000;
|
||||
|
||||
std::string result;
|
||||
|
||||
for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
|
||||
// predict
|
||||
if (embd.size() > 0) {
|
||||
if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
|
||||
printf("gpt-2: failed to generate text\n");
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
n_past += embd.size();
|
||||
embd.clear();
|
||||
|
||||
{
|
||||
// sample next token
|
||||
const int top_k = ctx->top_k;
|
||||
const float top_p = ctx->top_p;
|
||||
const float temp = ctx->temp;
|
||||
|
||||
const int n_vocab = ctx->model.hparams.n_vocab;
|
||||
|
||||
const gpt_vocab::id id = gpt_sample_top_k_top_p(ctx->vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, ctx->rng);
|
||||
|
||||
// add it to the context
|
||||
embd.push_back(id);
|
||||
}
|
||||
|
||||
result += ctx->vocab.id_to_token[embd[0]];
|
||||
|
||||
// end of text token
|
||||
if (embd.back() == 50256) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
@ -1,27 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
// TODO: Change to C-style API and move to ./examples for easy reuse.
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
struct gpt_vocab {
|
||||
using id = int32_t;
|
||||
using token = std::string;
|
||||
|
||||
std::map<token, id> token_to_id;
|
||||
std::map<id, token> id_to_token;
|
||||
};
|
||||
|
||||
struct gpt2_context;
|
||||
|
||||
struct gpt2_context * gpt2_init(const char * path_model);
|
||||
void gpt2_free(struct gpt2_context * ctx);
|
||||
|
||||
const char * gpt2_get_prompt(struct gpt2_context * ctx);
|
||||
void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
|
||||
|
||||
std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
|
||||
|
||||
std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);
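
// Example usage (illustrative sketch), roughly mirroring how talk.cpp drives this interface:
//
//   struct gpt2_context * ctx = gpt2_init("models/ggml-gpt-2-117M.bin");
//   if (ctx) {
//       // generate up to 32 tokens continuing the given text
//       const std::string reply = gpt2_gen_text(ctx, "Hello, how are you?\n", 32);
//       printf("%s\n", reply.c_str());   // printf requires <cstdio>
//       gpt2_free(ctx);
//   }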
|
@ -1,17 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Usage:
|
||||
# speak.sh <voice_id> <text-to-speak>
|
||||
|
||||
# espeak
|
||||
# Mac OS: brew install espeak
|
||||
# Linux: apt-get install espeak
|
||||
#
|
||||
espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
|
||||
|
||||
# Eleven Labs
|
||||
#
|
||||
#wd=$(dirname $0)
|
||||
#script=$wd/eleven-labs.py
|
||||
#python3 $script $1 "$2"
|
||||
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3
|
@ -1,696 +0,0 @@
|
||||
// Talk with AI
|
||||
//
|
||||
|
||||
#include "whisper.h"
|
||||
#include "gpt-2.h"
|
||||
|
||||
#include <SDL.h>
|
||||
#include <SDL_audio.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <algorithm> // for std::min
#include <chrono>    // for the timing in transcribe()
#include <cmath>     // for fabsf / sqrt / M_PI
#include <cstdlib>   // for rand / system
|
||||
// command-line parameters
|
||||
struct whisper_params {
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t voice_ms = 10000;
|
||||
int32_t capture_id = -1;
|
||||
int32_t max_tokens = 32;
|
||||
int32_t audio_ctx = 0;
|
||||
|
||||
float vad_thold = 0.6f;
|
||||
float freq_thold = 100.0f;
|
||||
|
||||
bool speed_up = false;
|
||||
bool translate = false;
|
||||
bool print_special = false;
|
||||
bool print_energy = false;
|
||||
bool no_timestamps = true;
|
||||
|
||||
std::string person = "Santa";
|
||||
std::string language = "en";
|
||||
std::string model_wsp = "models/ggml-base.en.bin";
|
||||
std::string model_gpt = "models/ggml-gpt-2-117M.bin";
|
||||
std::string speak = "./examples/talk/speak.sh";
|
||||
std::string fname_out = "";
|
||||
};
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
||||
|
||||
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
|
||||
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
||||
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
||||
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
||||
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
|
||||
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
||||
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
|
||||
else if (arg == "-mg" || arg == "--model-gpt") { params.model_gpt = argv[++i]; }
|
||||
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
|
||||
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
|
||||
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
|
||||
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
||||
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
||||
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
||||
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
||||
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
|
||||
fprintf(stderr, " -mg FILE, --model-gpt [%-7s] gpt model file\n", params.model_gpt.c_str());
|
||||
fprintf(stderr, "  -s FILE,  --speak FILE   [%-7s] command for TTS\n", params.speak.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
//
|
||||
// SDL Audio capture
|
||||
//
|
||||
|
||||
class audio_async {
|
||||
public:
|
||||
audio_async(int len_ms);
|
||||
~audio_async();
|
||||
|
||||
bool init(int capture_id, int sample_rate);
|
||||
|
||||
// start capturing audio via the provided SDL callback
|
||||
// keep last len_ms seconds of audio in a circular buffer
|
||||
bool resume();
|
||||
bool pause();
|
||||
bool clear();
|
||||
|
||||
// callback to be called by SDL
|
||||
void callback(uint8_t * stream, int len);
|
||||
|
||||
// get audio data from the circular buffer
|
||||
void get(int ms, std::vector<float> & audio);
|
||||
|
||||
private:
|
||||
SDL_AudioDeviceID m_dev_id_in = 0;
|
||||
|
||||
int m_len_ms = 0;
|
||||
int m_sample_rate = 0;
|
||||
|
||||
bool m_running = false;
|
||||
std::mutex m_mutex;
|
||||
|
||||
std::vector<float> m_audio;
|
||||
std::vector<float> m_audio_new;
|
||||
size_t m_audio_pos = 0;
|
||||
size_t m_audio_len = 0;
|
||||
};
|
||||
|
||||
audio_async::audio_async(int len_ms) {
|
||||
m_len_ms = len_ms;
|
||||
}
|
||||
|
||||
audio_async::~audio_async() {
|
||||
if (m_dev_id_in) {
|
||||
SDL_CloseAudioDevice(m_dev_id_in);
|
||||
}
|
||||
}
|
||||
|
||||
bool audio_async::init(int capture_id, int sample_rate) {
|
||||
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
|
||||
|
||||
if (SDL_Init(SDL_INIT_AUDIO) < 0) {
|
||||
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
|
||||
|
||||
{
|
||||
int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
|
||||
fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
|
||||
for (int i = 0; i < nDevices; i++) {
|
||||
fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
|
||||
}
|
||||
}
|
||||
|
||||
SDL_AudioSpec capture_spec_requested;
|
||||
SDL_AudioSpec capture_spec_obtained;
|
||||
|
||||
SDL_zero(capture_spec_requested);
|
||||
SDL_zero(capture_spec_obtained);
|
||||
|
||||
capture_spec_requested.freq = sample_rate;
|
||||
capture_spec_requested.format = AUDIO_F32;
|
||||
capture_spec_requested.channels = 1;
|
||||
capture_spec_requested.samples = 1024;
|
||||
capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
|
||||
audio_async * audio = (audio_async *) userdata;
|
||||
audio->callback(stream, len);
|
||||
};
|
||||
capture_spec_requested.userdata = this;
|
||||
|
||||
if (capture_id >= 0) {
|
||||
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
|
||||
m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
} else {
|
||||
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
|
||||
m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
}
|
||||
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
|
||||
m_dev_id_in = 0;
|
||||
|
||||
return false;
|
||||
} else {
|
||||
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
|
||||
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
|
||||
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format,
|
||||
capture_spec_requested.format);
|
||||
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels,
|
||||
capture_spec_requested.channels);
|
||||
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
m_sample_rate = capture_spec_obtained.freq;
|
||||
|
||||
m_audio.resize((m_sample_rate*m_len_ms)/1000);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool audio_async::resume() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to resume!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (m_running) {
|
||||
fprintf(stderr, "%s: already running!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_PauseAudioDevice(m_dev_id_in, 0);
|
||||
|
||||
m_running = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool audio_async::pause() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to pause!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: already paused!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_PauseAudioDevice(m_dev_id_in, 1);
|
||||
|
||||
m_running = false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool audio_async::clear() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to clear!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: not running!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
m_audio_pos = 0;
|
||||
m_audio_len = 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// callback to be called by SDL
|
||||
void audio_async::callback(uint8_t * stream, int len) {
|
||||
if (!m_running) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t n_samples = len / sizeof(float);
|
||||
|
||||
m_audio_new.resize(n_samples);
|
||||
memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
|
||||
|
||||
//fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
if (m_audio_pos + n_samples > m_audio.size()) {
|
||||
const size_t n0 = m_audio.size() - m_audio_pos;
|
||||
|
||||
memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
|
||||
memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
|
||||
|
||||
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
|
||||
m_audio_len = m_audio.size();
|
||||
} else {
|
||||
memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
|
||||
|
||||
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
|
||||
m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void audio_async::get(int ms, std::vector<float> & result) {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: not running!\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
result.clear();
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
if (ms <= 0) {
|
||||
ms = m_len_ms;
|
||||
}
|
||||
|
||||
size_t n_samples = (m_sample_rate * ms) / 1000;
|
||||
if (n_samples > m_audio_len) {
|
||||
n_samples = m_audio_len;
|
||||
}
|
||||
|
||||
result.resize(n_samples);
|
||||
|
||||
int s0 = m_audio_pos - n_samples;
|
||||
if (s0 < 0) {
|
||||
s0 += m_audio.size();
|
||||
}
|
||||
|
||||
if (s0 + n_samples > m_audio.size()) {
|
||||
const size_t n0 = m_audio.size() - s0;
|
||||
|
||||
memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
|
||||
memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
|
||||
} else {
|
||||
memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////
|
||||
|
||||
std::string trim(const std::string & s) {
|
||||
std::regex e("^\\s+|\\s+$");
|
||||
return std::regex_replace(s, e, "");
|
||||
}
|
||||
|
||||
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
|
||||
std::string result = s;
|
||||
size_t pos = 0;
|
||||
while ((pos = result.find(from, pos)) != std::string::npos) {
|
||||
result.replace(pos, from.length(), to);
|
||||
pos += to.length();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
|
||||
const float rc = 1.0f / (2.0f * M_PI * cutoff);
|
||||
const float dt = 1.0f / sample_rate;
|
||||
const float alpha = dt / (rc + dt);
|
||||
|
||||
float y = data[0];
|
||||
|
||||
for (size_t i = 1; i < data.size(); i++) {
|
||||
y = alpha * (y + data[i] - data[i - 1]);
|
||||
data[i] = y;
|
||||
}
|
||||
}
|
||||
|
||||
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
|
||||
const int n_samples = pcmf32.size();
|
||||
const int n_samples_last = (sample_rate * last_ms) / 1000;
|
||||
|
||||
if (n_samples_last >= n_samples) {
|
||||
// not enough samples - assume no speech
|
||||
return false;
|
||||
}
|
||||
|
||||
if (freq_thold > 0.0f) {
|
||||
high_pass_filter(pcmf32, freq_thold, sample_rate);
|
||||
}
|
||||
|
||||
float energy_all = 0.0f;
|
||||
float energy_last = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < n_samples; i++) {
|
||||
energy_all += fabsf(pcmf32[i]);
|
||||
|
||||
if (i >= n_samples - n_samples_last) {
|
||||
energy_last += fabsf(pcmf32[i]);
|
||||
}
|
||||
}
|
||||
|
||||
energy_all /= n_samples;
|
||||
energy_last /= n_samples_last;
|
||||
|
||||
if (verbose) {
|
||||
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
|
||||
}
|
||||
|
||||
if (energy_last > vad_thold*energy_all) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
prob = 0.0f;
|
||||
t_ms = 0;
|
||||
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.print_progress = false;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.translate = params.translate;
|
||||
wparams.no_context = true;
|
||||
wparams.single_segment = true;
|
||||
wparams.max_tokens = params.max_tokens;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
wparams.speed_up = params.speed_up;
|
||||
|
||||
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
int prob_n = 0;
|
||||
std::string result;
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
result += text;
|
||||
|
||||
const int n_tokens = whisper_full_n_tokens(ctx, i);
|
||||
for (int j = 0; j < n_tokens; ++j) {
|
||||
const auto token = whisper_full_get_token_data(ctx, i, j);
|
||||
|
||||
prob += token.p;
|
||||
++prob_n;
|
||||
}
|
||||
}
|
||||
|
||||
if (prob_n > 0) {
|
||||
prob /= prob_n;
|
||||
}
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const std::string k_prompt =
|
||||
R"(This is a dialogue between {0} (A) and a person (B). The dialogue so far is:
|
||||
|
||||
B: Hello {0}, how are you?
|
||||
A: I'm fine, thank you.
|
||||
{1}
|
||||
Here is how {0} (A) continues the dialogue:
|
||||
|
||||
A:)";
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
whisper_params params;
|
||||
|
||||
if (whisper_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// whisper init
|
||||
|
||||
struct whisper_context * ctx_wsp = whisper_init(params.model_wsp.c_str());
|
||||
|
||||
// gpt init
|
||||
|
||||
struct gpt2_context * ctx_gpt = gpt2_init(params.model_gpt.c_str());
|
||||
|
||||
// print some info about the processing
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
if (!whisper_is_multilingual(ctx_wsp)) {
|
||||
if (params.language != "en" || params.translate) {
|
||||
params.language = "en";
|
||||
params.translate = false;
|
||||
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
|
||||
__func__,
|
||||
params.n_threads,
|
||||
params.language.c_str(),
|
||||
params.translate ? "translate" : "transcribe",
|
||||
params.no_timestamps ? 0 : 1);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
|
||||
// init audio
|
||||
|
||||
audio_async audio(30*1000);
|
||||
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
|
||||
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
audio.resume();
|
||||
|
||||
int n_iter = 0;
|
||||
|
||||
bool is_running = true;
|
||||
bool force_speak = false;
|
||||
|
||||
float prob0 = 0.0f;
|
||||
float prob = 0.0f;
|
||||
|
||||
std::vector<float> pcmf32_cur;
|
||||
std::vector<float> pcmf32_prompt;
|
||||
|
||||
gpt2_set_prompt(ctx_gpt, "");
|
||||
|
||||
const int voice_id = rand()%6;
|
||||
|
||||
fprintf(stderr, "gpt-2: prompt:\n");
|
||||
fprintf(stderr, "========================\n\n");
|
||||
fprintf(stderr, "%s\n", ::replace(k_prompt, "{0}", params.person).c_str());
|
||||
fprintf(stderr, "========================\n\n");
|
||||
|
||||
// main loop
|
||||
while (is_running) {
|
||||
// handle Ctrl + C
|
||||
{
|
||||
SDL_Event event;
|
||||
while (SDL_PollEvent(&event)) {
|
||||
switch (event.type) {
|
||||
case SDL_QUIT:
|
||||
{
|
||||
is_running = false;
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_running) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// delay
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
|
||||
int64_t t_ms = 0;
|
||||
|
||||
{
|
||||
audio.get(2000, pcmf32_cur);
|
||||
|
||||
if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
|
||||
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
|
||||
|
||||
audio.get(params.voice_ms, pcmf32_cur);
|
||||
|
||||
std::string text_heard = "";
|
||||
|
||||
if (!force_speak) {
|
||||
text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prob0, t_ms));
|
||||
}
|
||||
|
||||
// remove text between brackets using regex
|
||||
{
|
||||
std::regex re("\\[.*?\\]");
|
||||
text_heard = std::regex_replace(text_heard, re, "");
|
||||
}
|
||||
|
||||
// remove text between parentheses using regex
|
||||
{
|
||||
std::regex re("\\(.*?\\)");
|
||||
text_heard = std::regex_replace(text_heard, re, "");
|
||||
}
|
||||
|
||||
// remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
|
||||
text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
|
||||
|
||||
// take first line
|
||||
text_heard = text_heard.substr(0, text_heard.find_first_of("\n"));
|
||||
|
||||
// remove leading and trailing whitespace
|
||||
text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
|
||||
text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
|
||||
|
||||
const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(ctx_gpt, text_heard.c_str());
|
||||
|
||||
if (text_heard.empty() || tokens.empty() || force_speak) {
|
||||
fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
|
||||
audio.clear();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
force_speak = false;
|
||||
|
||||
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", text_heard.c_str(), "\033[0m", (int) t_ms);
|
||||
|
||||
std::string prompt_base = gpt2_get_prompt(ctx_gpt);
|
||||
|
||||
std::string text_to_speak;
|
||||
|
||||
{
|
||||
prompt_base += "B: " + text_heard + "\n";
|
||||
|
||||
std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
|
||||
|
||||
text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
|
||||
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
|
||||
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
|
||||
|
||||
// remove first 2 lines of base prompt
|
||||
if (n_iter > 4) {
|
||||
{
|
||||
const size_t pos = prompt_base.find_first_of("\n");
|
||||
if (pos != std::string::npos) {
|
||||
prompt_base = prompt_base.substr(pos + 1);
|
||||
}
|
||||
}
|
||||
{
|
||||
const size_t pos = prompt_base.find_first_of("\n");
|
||||
if (pos != std::string::npos) {
|
||||
prompt_base = prompt_base.substr(pos + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
prompt_base += "A:" + text_to_speak + "\n";
|
||||
|
||||
{
|
||||
prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
|
||||
|
||||
printf("===============\n");
|
||||
printf("prompt:\n");
|
||||
printf("%s\n", prompt.c_str());
|
||||
printf("===============\n");
|
||||
}
|
||||
}
|
||||
|
||||
//printf("========================\n");
|
||||
//printf("gpt-2: prompt_base:\n%s\n", prompt_base.c_str());
|
||||
//printf("========================\n");
|
||||
|
||||
gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
|
||||
|
||||
text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
|
||||
system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
|
||||
|
||||
audio.clear();
|
||||
|
||||
++n_iter;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
audio.pause();
|
||||
|
||||
whisper_print_timings(ctx_wsp);
|
||||
whisper_free(ctx_wsp);
|
||||
|
||||
return 0;
|
||||
}
|
@ -1,109 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Transcribe a twitch.tv livestream by feeding its audio to whisper.cpp at regular intervals
|
||||
# Thanks to @keyehzy
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/209
|
||||
#
|
||||
# The script currently depends on the third-party tool "streamlink"
|
||||
# On Mac OS, you can install it via "brew install streamlink"
|
||||
#
|
||||
|
||||
set -eo pipefail
|
||||
|
||||
step=10
|
||||
model=base.en
|
||||
threads=4
|
||||
|
||||
help()
|
||||
{
|
||||
echo "Example program for captioning a livestream from twitch.tv."
|
||||
echo
|
||||
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
|
||||
echo "options:"
|
||||
echo "-s Step in seconds (default is $step)."
|
||||
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large' (default is '$model')."
|
||||
echo "-t Number of threads to use."
|
||||
echo "-h Print this help page."
|
||||
echo
|
||||
}
|
||||
|
||||
check_requirements()
|
||||
{
|
||||
if ! command -v ./main &>/dev/null; then
|
||||
echo "whisper.cpp main executable is required (make)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! command -v streamlink &>/dev/null; then
|
||||
echo "streamlink is required (https://streamlink.github.io)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! command -v ffmpeg &>/dev/null; then
|
||||
echo "ffmpeg is required (https://ffmpeg.org)"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
check_requirements
|
||||
|
||||
while getopts ":s:m:t:h" option; do
|
||||
case $option in
|
||||
s)
|
||||
step=$OPTARG;;
|
||||
m)
|
||||
model=$OPTARG;;
|
||||
t)
|
||||
threads=$OPTARG;;
|
||||
h)
|
||||
help
|
||||
exit;;
|
||||
\?)
|
||||
help
|
||||
exit;;
|
||||
esac
|
||||
done
|
||||
|
||||
url=${@:$OPTIND:1}
|
||||
|
||||
if [ -z "$url" ]; then
|
||||
help
|
||||
exit
|
||||
fi
|
||||
|
||||
echo "Piping from streamlink url=$url model=$model step=$step threads=$threads"
|
||||
streamlink $url best -O 2>/dev/null | ffmpeg -loglevel quiet -i - -y -probesize 32 -y -ar 16000 -ac 1 -acodec pcm_s16le /tmp/whisper-live0.wav &
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
printf "error: ffmpeg failed\n"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Buffering stream... (this should take $step seconds)"
|
||||
sleep $(($step))
|
||||
|
||||
set +e
|
||||
|
||||
echo "Starting..."
|
||||
|
||||
i=0
|
||||
SECONDS=0
|
||||
while true
|
||||
do
|
||||
err=1
|
||||
while [ $err -ne 0 ]; do
|
||||
if [ $i -gt 0 ]; then
|
||||
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.wav -y -ss $(($i*$step-1)).5 -t $step -c copy /tmp/whisper-live.wav 2> /tmp/whisper-live.err
|
||||
else
|
||||
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.wav -y -ss $(($i*$step)) -t $step -c copy /tmp/whisper-live.wav 2> /tmp/whisper-live.err
|
||||
fi
|
||||
err=$(cat /tmp/whisper-live.err | wc -l)
|
||||
done
|
||||
|
||||
./main -t $threads -m ./models/ggml-$model.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
|
||||
|
||||
while [ $SECONDS -lt $((($i+1)*$step)) ]; do
|
||||
sleep 1
|
||||
done
|
||||
((i=i+1))
|
||||
done
|
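For reference, a typical invocation of the script above might look like the sketch below. It is run from a directory that contains the whisper.cpp `main` executable and a `models` folder, exactly as `check_requirements` and the `./main` call expect; the channel URL is a placeholder.

```bash
# Hypothetical example: 10-second chunks, base.en model, 4 threads.
# Assumes ./main and ./models/ggml-base.en.bin already exist, as the script requires.
./twitch.sh -s 10 -m base.en -t 4 https://www.twitch.tv/some_channel
```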
15
examples/whisper.android/.gitignore
vendored
@ -1,15 +0,0 @@
*.iml
.gradle
/local.properties
/.idea/caches
/.idea/libraries
/.idea/modules.xml
/.idea/workspace.xml
/.idea/navEditor.xml
/.idea/assetWizardSettings.xml
.DS_Store
/build
/captures
.externalNativeBuild
.cxx
local.properties
3
examples/whisper.android/.idea/.gitignore
generated
vendored
@ -1,3 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml
1
examples/whisper.android/.idea/.name
generated
@ -1 +0,0 @@
WhisperCppDemo
6
examples/whisper.android/.idea/compiler.xml
generated
@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="CompilerConfiguration">
    <bytecodeTargetLevel target="11" />
  </component>
</project>
18
examples/whisper.android/.idea/gradle.xml
generated
@ -1,18 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="GradleSettings">
    <option name="linkedExternalProjectsSettings">
      <GradleProjectSettings>
        <option name="testRunner" value="GRADLE" />
        <option name="distributionType" value="DEFAULT_WRAPPED" />
        <option name="externalProjectPath" value="$PROJECT_DIR$" />
        <option name="modules">
          <set>
            <option value="$PROJECT_DIR$" />
            <option value="$PROJECT_DIR$/app" />
          </set>
        </option>
      </GradleProjectSettings>
    </option>
  </component>
</project>
10
examples/whisper.android/.idea/misc.xml
generated
@ -1,10 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ExternalStorageConfigurationManager" enabled="true" />
  <component name="ProjectRootManager" version="2" languageLevel="JDK_11" default="true" project-jdk-name="Android Studio default JDK" project-jdk-type="JavaSDK">
    <output url="file://$PROJECT_DIR$/build/classes" />
  </component>
  <component name="ProjectType">
    <option name="id" value="Android" />
  </component>
</project>
6
examples/whisper.android/.idea/vcs.xml
generated
@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
  </component>
</project>
@ -1,12 +0,0 @@
A sample Android app using [whisper.cpp](https://github.com/ggerganov/whisper.cpp/) to do voice-to-text transcriptions.

To use:

1. Select a model from the [whisper.cpp repository](https://github.com/ggerganov/whisper.cpp/tree/master/models).[^1]
2. Copy the model to the "app/src/main/assets/models" folder.
3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
4. Copy the sample to the "app/src/main/assets/samples" folder.
5. Select the "release" active build variant, and use Android Studio to run and deploy to your device.
[^1]: I recommend the tiny or base models for running on an Android device.

<img width="300" alt="image" src="https://user-images.githubusercontent.com/1991296/208154256-82d972dc-221b-48c4-bfcb-36ce68602f93.png">
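Steps 2-4 of the list above amount to copying two files into the app's asset folders; a minimal shell sketch is shown below, run from examples/whisper.android. The ggml-base.en.bin filename is an assumption, use whichever model file was obtained in step 1.

```bash
# Sketch of steps 2-4; asset paths are taken from the README, the model filename is an assumption.
mkdir -p app/src/main/assets/models app/src/main/assets/samples
cp /path/to/ggml-base.en.bin app/src/main/assets/models/
curl -L -o app/src/main/assets/samples/jfk.wav \
    https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav
```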
1
examples/whisper.android/app/.gitignore
vendored
@ -1 +0,0 @@
/build
@ -1,76 +0,0 @@
|
||||
plugins {
|
||||
id 'com.android.application'
|
||||
id 'org.jetbrains.kotlin.android'
|
||||
}
|
||||
|
||||
android {
|
||||
namespace 'com.whispercppdemo'
|
||||
compileSdk 33
|
||||
|
||||
defaultConfig {
|
||||
applicationId "com.whispercppdemo"
|
||||
minSdk 26
|
||||
targetSdk 32
|
||||
versionCode 1
|
||||
versionName "1.0"
|
||||
|
||||
ndk {
|
||||
abiFilters 'arm64-v8a', 'x86_64'
|
||||
}
|
||||
|
||||
testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
|
||||
vectorDrawables {
|
||||
useSupportLibrary true
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
release {
|
||||
signingConfig signingConfigs.debug
|
||||
minifyEnabled true
|
||||
proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
|
||||
}
|
||||
}
|
||||
compileOptions {
|
||||
sourceCompatibility JavaVersion.VERSION_1_8
|
||||
targetCompatibility JavaVersion.VERSION_1_8
|
||||
}
|
||||
kotlinOptions {
|
||||
jvmTarget = '1.8'
|
||||
}
|
||||
buildFeatures {
|
||||
compose true
|
||||
}
|
||||
composeOptions {
|
||||
kotlinCompilerExtensionVersion '1.3.1'
|
||||
}
|
||||
ndkVersion "25.0.8528842"
|
||||
externalNativeBuild {
|
||||
ndkBuild {
|
||||
path 'src/main/jni/whisper/Android.mk'
|
||||
}
|
||||
}
|
||||
packagingOptions {
|
||||
resources {
|
||||
excludes += '/META-INF/{AL2.0,LGPL2.1}'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation 'androidx.activity:activity-compose:1.6.1'
|
||||
implementation 'androidx.compose.material:material-icons-core:1.3.1'
|
||||
implementation 'androidx.compose.material3:material3:1.0.1'
|
||||
implementation "androidx.compose.ui:ui:1.3.2"
|
||||
implementation "androidx.compose.ui:ui-tooling-preview:1.3.2"
|
||||
implementation 'androidx.lifecycle:lifecycle-viewmodel-compose:2.5.1'
|
||||
implementation "com.google.accompanist:accompanist-permissions:0.28.0"
|
||||
implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.6.4'
|
||||
|
||||
testImplementation 'junit:junit:4.13.2'
|
||||
androidTestImplementation 'androidx.test.ext:junit:1.1.4'
|
||||
androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.0'
|
||||
androidTestImplementation "androidx.compose.ui:ui-test-junit4:1.3.2"
|
||||
debugImplementation "androidx.compose.ui:ui-tooling:1.3.2"
|
||||
debugImplementation "androidx.compose.ui:ui-test-manifest:1.3.2"
|
||||
}
|
21
examples/whisper.android/app/proguard-rules.pro
vendored
@ -1,21 +0,0 @@
|
||||
# Add project specific ProGuard rules here.
|
||||
# You can control the set of applied configuration files using the
|
||||
# proguardFiles setting in build.gradle.
|
||||
#
|
||||
# For more details, see
|
||||
# http://developer.android.com/guide/developing/tools/proguard.html
|
||||
|
||||
# If your project uses WebView with JS, uncomment the following
|
||||
# and specify the fully qualified class name to the JavaScript interface
|
||||
# class:
|
||||
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
|
||||
# public *;
|
||||
#}
|
||||
|
||||
# Uncomment this to preserve the line number information for
|
||||
# debugging stack traces.
|
||||
#-keepattributes SourceFile,LineNumberTable
|
||||
|
||||
# If you keep the line number information, uncomment this to
|
||||
# hide the original source file name.
|
||||
#-renamesourcefileattribute SourceFile
|
@ -1,24 +0,0 @@
|
||||
package com.whispercppdemo
|
||||
|
||||
import androidx.test.platform.app.InstrumentationRegistry
|
||||
import androidx.test.ext.junit.runners.AndroidJUnit4
|
||||
|
||||
import org.junit.Test
|
||||
import org.junit.runner.RunWith
|
||||
|
||||
import org.junit.Assert.*
|
||||
|
||||
/**
|
||||
* Instrumented test, which will execute on an Android device.
|
||||
*
|
||||
* See [testing documentation](http://d.android.com/tools/testing).
|
||||
*/
|
||||
@RunWith(AndroidJUnit4::class)
|
||||
class ExampleInstrumentedTest {
|
||||
@Test
|
||||
fun useAppContext() {
|
||||
// Context of the app under test.
|
||||
val appContext = InstrumentationRegistry.getInstrumentation().targetContext
|
||||
assertEquals("com.whispercppdemo", appContext.packageName)
|
||||
}
|
||||
}
|
@ -1,32 +0,0 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
xmlns:tools="http://schemas.android.com/tools">
|
||||
|
||||
<uses-permission android:name="android.permission.RECORD_AUDIO" />
|
||||
|
||||
<application
|
||||
android:allowBackup="true"
|
||||
android:dataExtractionRules="@xml/data_extraction_rules"
|
||||
android:fullBackupContent="@xml/backup_rules"
|
||||
android:icon="@mipmap/ic_launcher"
|
||||
android:label="@string/app_name"
|
||||
android:supportsRtl="true"
|
||||
android:theme="@style/Theme.WhisperCppDemo"
|
||||
tools:targetApi="31">
|
||||
<activity
|
||||
android:name=".MainActivity"
|
||||
android:exported="true"
|
||||
android:theme="@style/Theme.WhisperCppDemo">
|
||||
<intent-filter>
|
||||
<action android:name="android.intent.action.MAIN" />
|
||||
|
||||
<category android:name="android.intent.category.LAUNCHER" />
|
||||
</intent-filter>
|
||||
|
||||
<meta-data
|
||||
android:name="android.app.lib_name"
|
||||
android:value="" />
|
||||
</activity>
|
||||
</application>
|
||||
|
||||
</manifest>
|
@ -1,22 +0,0 @@
|
||||
package com.whispercppdemo
|
||||
|
||||
import android.os.Bundle
|
||||
import androidx.activity.ComponentActivity
|
||||
import androidx.activity.compose.setContent
|
||||
import androidx.activity.viewModels
|
||||
import com.whispercppdemo.ui.main.MainScreen
|
||||
import com.whispercppdemo.ui.main.MainScreenViewModel
|
||||
import com.whispercppdemo.ui.theme.WhisperCppDemoTheme
|
||||
|
||||
class MainActivity : ComponentActivity() {
|
||||
private val viewModel: MainScreenViewModel by viewModels { MainScreenViewModel.factory() }
|
||||
|
||||
override fun onCreate(savedInstanceState: Bundle?) {
|
||||
super.onCreate(savedInstanceState)
|
||||
setContent {
|
||||
WhisperCppDemoTheme {
|
||||
MainScreen(viewModel)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,76 +0,0 @@
|
||||
package com.whispercppdemo.media
|
||||
|
||||
import java.io.ByteArrayOutputStream
|
||||
import java.io.File
|
||||
import java.nio.ByteBuffer
|
||||
import java.nio.ByteOrder
|
||||
|
||||
fun decodeWaveFile(file: File): FloatArray {
|
||||
val baos = ByteArrayOutputStream()
|
||||
file.inputStream().use { it.copyTo(baos) }
|
||||
val buffer = ByteBuffer.wrap(baos.toByteArray())
|
||||
buffer.order(ByteOrder.LITTLE_ENDIAN)
|
||||
buffer.position(44)
|
||||
val shortBuffer = buffer.asShortBuffer()
|
||||
val shortArray = ShortArray(shortBuffer.limit())
|
||||
shortBuffer.get(shortArray)
|
||||
return FloatArray(shortArray.size) { index ->
|
||||
(shortArray[index] / 32767.0f).coerceIn(-1f..1f)
|
||||
}
|
||||
}
|
||||
|
||||
fun encodeWaveFile(file: File, data: ShortArray) {
|
||||
file.outputStream().use {
|
||||
it.write(headerBytes(data.size * 2))
|
||||
val buffer = ByteBuffer.allocate(data.size * 2)
|
||||
buffer.order(ByteOrder.LITTLE_ENDIAN)
|
||||
buffer.asShortBuffer().put(data)
|
||||
val bytes = ByteArray(buffer.limit())
|
||||
buffer.get(bytes)
|
||||
it.write(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
private fun headerBytes(totalLength: Int): ByteArray {
|
||||
require(totalLength >= 44)
|
||||
ByteBuffer.allocate(44).apply {
|
||||
order(ByteOrder.LITTLE_ENDIAN)
|
||||
|
||||
put('R'.code.toByte())
|
||||
put('I'.code.toByte())
|
||||
put('F'.code.toByte())
|
||||
put('F'.code.toByte())
|
||||
|
||||
putInt(totalLength - 8)
|
||||
|
||||
put('W'.code.toByte())
|
||||
put('A'.code.toByte())
|
||||
put('V'.code.toByte())
|
||||
put('E'.code.toByte())
|
||||
|
||||
put('f'.code.toByte())
|
||||
put('m'.code.toByte())
|
||||
put('t'.code.toByte())
|
||||
put(' '.code.toByte())
|
||||
|
||||
putInt(16)
|
||||
putShort(1.toShort())
|
||||
putShort(1.toShort())
|
||||
putInt(16000)
|
||||
putInt(32000)
|
||||
putShort(2.toShort())
|
||||
putShort(16.toShort())
|
||||
|
||||
put('d'.code.toByte())
|
||||
put('a'.code.toByte())
|
||||
put('t'.code.toByte())
|
||||
put('a'.code.toByte())
|
||||
|
||||
putInt(totalLength - 44)
|
||||
position(0)
|
||||
}.also {
|
||||
val bytes = ByteArray(it.limit())
|
||||
it.get(bytes)
|
||||
return bytes
|
||||
}
|
||||
}
|
@ -1,88 +0,0 @@
|
||||
package com.whispercppdemo.recorder
|
||||
|
||||
import android.annotation.SuppressLint
|
||||
import android.media.AudioFormat
|
||||
import android.media.AudioRecord
|
||||
import android.media.MediaRecorder
|
||||
import com.whispercppdemo.media.encodeWaveFile
|
||||
import kotlinx.coroutines.CoroutineScope
|
||||
import kotlinx.coroutines.asCoroutineDispatcher
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.File
|
||||
import java.util.concurrent.Executors
|
||||
import java.util.concurrent.atomic.AtomicBoolean
|
||||
|
||||
class Recorder {
|
||||
private val scope: CoroutineScope = CoroutineScope(
|
||||
Executors.newSingleThreadExecutor().asCoroutineDispatcher()
|
||||
)
|
||||
private var recorder: AudioRecordThread? = null
|
||||
|
||||
suspend fun startRecording(outputFile: File, onError: (Exception) -> Unit) = withContext(scope.coroutineContext) {
|
||||
recorder = AudioRecordThread(outputFile, onError)
|
||||
recorder?.start()
|
||||
}
|
||||
|
||||
suspend fun stopRecording() = withContext(scope.coroutineContext) {
|
||||
recorder?.stopRecording()
|
||||
@Suppress("BlockingMethodInNonBlockingContext")
|
||||
recorder?.join()
|
||||
recorder = null
|
||||
}
|
||||
}
|
||||
|
||||
private class AudioRecordThread(
|
||||
private val outputFile: File,
|
||||
private val onError: (Exception) -> Unit
|
||||
) :
|
||||
Thread("AudioRecorder") {
|
||||
private var quit = AtomicBoolean(false)
|
||||
|
||||
@SuppressLint("MissingPermission")
|
||||
override fun run() {
|
||||
try {
|
||||
val bufferSize = AudioRecord.getMinBufferSize(
|
||||
16000,
|
||||
AudioFormat.CHANNEL_IN_MONO,
|
||||
AudioFormat.ENCODING_PCM_16BIT
|
||||
) * 4
|
||||
val buffer = ShortArray(bufferSize / 2)
|
||||
|
||||
val audioRecord = AudioRecord(
|
||||
MediaRecorder.AudioSource.MIC,
|
||||
16000,
|
||||
AudioFormat.CHANNEL_IN_MONO,
|
||||
AudioFormat.ENCODING_PCM_16BIT,
|
||||
bufferSize
|
||||
)
|
||||
|
||||
try {
|
||||
audioRecord.startRecording()
|
||||
|
||||
val allData = mutableListOf<Short>()
|
||||
|
||||
while (!quit.get()) {
|
||||
val read = audioRecord.read(buffer, 0, buffer.size)
|
||||
if (read > 0) {
|
||||
for (i in 0 until read) {
|
||||
allData.add(buffer[i])
|
||||
}
|
||||
} else {
|
||||
throw java.lang.RuntimeException("audioRecord.read returned $read")
|
||||
}
|
||||
}
|
||||
|
||||
audioRecord.stop()
|
||||
encodeWaveFile(outputFile, allData.toShortArray())
|
||||
} finally {
|
||||
audioRecord.release()
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
onError(e)
|
||||
}
|
||||
}
|
||||
|
||||
fun stopRecording() {
|
||||
quit.set(true)
|
||||
}
|
||||
}
|
@ -1,99 +0,0 @@
|
||||
package com.whispercppdemo.ui.main
|
||||
|
||||
import androidx.compose.foundation.layout.*
|
||||
import androidx.compose.foundation.rememberScrollState
|
||||
import androidx.compose.foundation.verticalScroll
|
||||
import androidx.compose.material3.*
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.ui.Modifier
|
||||
import androidx.compose.ui.res.stringResource
|
||||
import androidx.compose.ui.unit.dp
|
||||
import com.google.accompanist.permissions.ExperimentalPermissionsApi
|
||||
import com.google.accompanist.permissions.isGranted
|
||||
import com.google.accompanist.permissions.rememberPermissionState
|
||||
import com.whispercppdemo.R
|
||||
|
||||
@Composable
|
||||
fun MainScreen(viewModel: MainScreenViewModel) {
|
||||
MainScreen(
|
||||
canTranscribe = viewModel.canTranscribe,
|
||||
isRecording = viewModel.isRecording,
|
||||
messageLog = viewModel.dataLog,
|
||||
onTranscribeSampleTapped = viewModel::transcribeSample,
|
||||
onRecordTapped = viewModel::toggleRecord
|
||||
)
|
||||
}
|
||||
|
||||
@OptIn(ExperimentalMaterial3Api::class)
|
||||
@Composable
|
||||
private fun MainScreen(
|
||||
canTranscribe: Boolean,
|
||||
isRecording: Boolean,
|
||||
messageLog: String,
|
||||
onTranscribeSampleTapped: () -> Unit,
|
||||
onRecordTapped: () -> Unit
|
||||
) {
|
||||
Scaffold(
|
||||
topBar = {
|
||||
TopAppBar(
|
||||
title = { Text(stringResource(R.string.app_name)) }
|
||||
)
|
||||
},
|
||||
) { innerPadding ->
|
||||
Column(
|
||||
modifier = Modifier
|
||||
.padding(innerPadding)
|
||||
.padding(16.dp)
|
||||
) {
|
||||
Row(horizontalArrangement = Arrangement.SpaceBetween) {
|
||||
TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
|
||||
RecordButton(
|
||||
enabled = canTranscribe,
|
||||
isRecording = isRecording,
|
||||
onClick = onRecordTapped
|
||||
)
|
||||
}
|
||||
MessageLog(messageLog)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Composable
|
||||
private fun MessageLog(log: String) {
|
||||
Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
|
||||
}
|
||||
|
||||
@Composable
|
||||
private fun TranscribeSampleButton(enabled: Boolean, onClick: () -> Unit) {
|
||||
Button(onClick = onClick, enabled = enabled) {
|
||||
Text("Transcribe sample")
|
||||
}
|
||||
}
|
||||
|
||||
@OptIn(ExperimentalPermissionsApi::class)
|
||||
@Composable
|
||||
private fun RecordButton(enabled: Boolean, isRecording: Boolean, onClick: () -> Unit) {
|
||||
val micPermissionState = rememberPermissionState(
|
||||
permission = android.Manifest.permission.RECORD_AUDIO,
|
||||
onPermissionResult = { granted ->
|
||||
if (granted) {
|
||||
onClick()
|
||||
}
|
||||
}
|
||||
)
|
||||
Button(onClick = {
|
||||
if (micPermissionState.status.isGranted) {
|
||||
onClick()
|
||||
} else {
|
||||
micPermissionState.launchPermissionRequest()
|
||||
}
|
||||
}, enabled = enabled) {
|
||||
Text(
|
||||
if (isRecording) {
|
||||
"Stop recording"
|
||||
} else {
|
||||
"Start recording"
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
@ -1,193 +0,0 @@
|
||||
package com.whispercppdemo.ui.main
|
||||
|
||||
import android.app.Application
|
||||
import android.content.Context
|
||||
import android.media.MediaPlayer
|
||||
import android.util.Log
|
||||
import androidx.compose.runtime.getValue
|
||||
import androidx.compose.runtime.mutableStateOf
|
||||
import androidx.compose.runtime.setValue
|
||||
import androidx.core.net.toUri
|
||||
import androidx.lifecycle.ViewModel
|
||||
import androidx.lifecycle.ViewModelProvider
|
||||
import androidx.lifecycle.viewModelScope
|
||||
import androidx.lifecycle.viewmodel.initializer
|
||||
import androidx.lifecycle.viewmodel.viewModelFactory
|
||||
import com.whispercppdemo.media.decodeWaveFile
|
||||
import com.whispercppdemo.recorder.Recorder
|
||||
import com.whispercppdemo.whisper.WhisperContext
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.launch
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.File
|
||||
|
||||
private const val LOG_TAG = "MainScreenViewModel"
|
||||
|
||||
class MainScreenViewModel(private val application: Application) : ViewModel() {
|
||||
var canTranscribe by mutableStateOf(false)
|
||||
private set
|
||||
var dataLog by mutableStateOf("")
|
||||
private set
|
||||
var isRecording by mutableStateOf(false)
|
||||
private set
|
||||
|
||||
private val modelsPath = File(application.filesDir, "models")
|
||||
private val samplesPath = File(application.filesDir, "samples")
|
||||
private var recorder: Recorder = Recorder()
|
||||
private var whisperContext: WhisperContext? = null
|
||||
private var mediaPlayer: MediaPlayer? = null
|
||||
private var recordedFile: File? = null
|
||||
|
||||
init {
|
||||
viewModelScope.launch {
|
||||
loadData()
|
||||
}
|
||||
}
|
||||
|
||||
private suspend fun loadData() {
|
||||
printMessage("Loading data...\n")
|
||||
try {
|
||||
copyAssets()
|
||||
loadBaseModel()
|
||||
canTranscribe = true
|
||||
} catch (e: Exception) {
|
||||
Log.w(LOG_TAG, e)
|
||||
printMessage("${e.localizedMessage}\n")
|
||||
}
|
||||
}
|
||||
|
||||
private suspend fun printMessage(msg: String) = withContext(Dispatchers.Main) {
|
||||
dataLog += msg
|
||||
}
|
||||
|
||||
private suspend fun copyAssets() = withContext(Dispatchers.IO) {
|
||||
modelsPath.mkdirs()
|
||||
samplesPath.mkdirs()
|
||||
application.copyData("models", modelsPath, ::printMessage)
|
||||
application.copyData("samples", samplesPath, ::printMessage)
|
||||
printMessage("All data copied to working directory.\n")
|
||||
}
|
||||
|
||||
private suspend fun loadBaseModel() = withContext(Dispatchers.IO) {
|
||||
printMessage("Loading model...\n")
|
||||
val firstModel = modelsPath.listFiles()!!.first()
|
||||
whisperContext = WhisperContext.createContext(firstModel.absolutePath)
|
||||
printMessage("Loaded model ${firstModel.name}.\n")
|
||||
}
|
||||
|
||||
fun transcribeSample() = viewModelScope.launch {
|
||||
transcribeAudio(getFirstSample())
|
||||
}
|
||||
|
||||
private suspend fun getFirstSample(): File = withContext(Dispatchers.IO) {
|
||||
samplesPath.listFiles()!!.first()
|
||||
}
|
||||
|
||||
private suspend fun readAudioSamples(file: File): FloatArray = withContext(Dispatchers.IO) {
|
||||
stopPlayback()
|
||||
startPlayback(file)
|
||||
return@withContext decodeWaveFile(file)
|
||||
}
|
||||
|
||||
private suspend fun stopPlayback() = withContext(Dispatchers.Main) {
|
||||
mediaPlayer?.stop()
|
||||
mediaPlayer?.release()
|
||||
mediaPlayer = null
|
||||
}
|
||||
|
||||
private suspend fun startPlayback(file: File) = withContext(Dispatchers.Main) {
|
||||
mediaPlayer = MediaPlayer.create(application, file.absolutePath.toUri())
|
||||
mediaPlayer?.start()
|
||||
}
|
||||
|
||||
private suspend fun transcribeAudio(file: File) {
|
||||
if (!canTranscribe) {
|
||||
return
|
||||
}
|
||||
|
||||
canTranscribe = false
|
||||
|
||||
try {
|
||||
printMessage("Reading wave samples...\n")
|
||||
val data = readAudioSamples(file)
|
||||
printMessage("Transcribing data...\n")
|
||||
val text = whisperContext?.transcribeData(data)
|
||||
printMessage("Done: $text\n")
|
||||
} catch (e: Exception) {
|
||||
Log.w(LOG_TAG, e)
|
||||
printMessage("${e.localizedMessage}\n")
|
||||
}
|
||||
|
||||
canTranscribe = true
|
||||
}
|
||||
|
||||
fun toggleRecord() = viewModelScope.launch {
|
||||
try {
|
||||
if (isRecording) {
|
||||
recorder.stopRecording()
|
||||
isRecording = false
|
||||
recordedFile?.let { transcribeAudio(it) }
|
||||
} else {
|
||||
stopPlayback()
|
||||
val file = getTempFileForRecording()
|
||||
recorder.startRecording(file) { e ->
|
||||
viewModelScope.launch {
|
||||
withContext(Dispatchers.Main) {
|
||||
printMessage("${e.localizedMessage}\n")
|
||||
isRecording = false
|
||||
}
|
||||
}
|
||||
}
|
||||
isRecording = true
|
||||
recordedFile = file
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
Log.w(LOG_TAG, e)
|
||||
printMessage("${e.localizedMessage}\n")
|
||||
isRecording = false
|
||||
}
|
||||
}
|
||||
|
||||
private suspend fun getTempFileForRecording() = withContext(Dispatchers.IO) {
|
||||
File.createTempFile("recording", "wav")
|
||||
}
|
||||
|
||||
override fun onCleared() {
|
||||
runBlocking {
|
||||
whisperContext?.release()
|
||||
whisperContext = null
|
||||
stopPlayback()
|
||||
}
|
||||
}
|
||||
|
||||
companion object {
|
||||
fun factory() = viewModelFactory {
|
||||
initializer {
|
||||
val application =
|
||||
this[ViewModelProvider.AndroidViewModelFactory.APPLICATION_KEY] as Application
|
||||
MainScreenViewModel(application)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private suspend fun Context.copyData(
|
||||
assetDirName: String,
|
||||
destDir: File,
|
||||
printMessage: suspend (String) -> Unit
|
||||
) = withContext(Dispatchers.IO) {
|
||||
assets.list(assetDirName)?.forEach { name ->
|
||||
val assetPath = "$assetDirName/$name"
|
||||
Log.v(LOG_TAG, "Processing $assetPath...")
|
||||
val destination = File(destDir, name)
|
||||
Log.v(LOG_TAG, "Copying $assetPath to $destination...")
|
||||
printMessage("Copying $name...\n")
|
||||
assets.open(assetPath).use { input ->
|
||||
destination.outputStream().use { output ->
|
||||
input.copyTo(output)
|
||||
}
|
||||
}
|
||||
Log.v(LOG_TAG, "Copied $assetPath to $destination")
|
||||
}
|
||||
}
|
@ -1,11 +0,0 @@
|
||||
package com.whispercppdemo.ui.theme
|
||||
|
||||
import androidx.compose.ui.graphics.Color
|
||||
|
||||
val Purple80 = Color(0xFFD0BCFF)
|
||||
val PurpleGrey80 = Color(0xFFCCC2DC)
|
||||
val Pink80 = Color(0xFFEFB8C8)
|
||||
|
||||
val Purple40 = Color(0xFF6650a4)
|
||||
val PurpleGrey40 = Color(0xFF625b71)
|
||||
val Pink40 = Color(0xFF7D5260)
|
@ -1,68 +0,0 @@
|
||||
package com.whispercppdemo.ui.theme
|
||||
|
||||
import android.app.Activity
|
||||
import android.os.Build
|
||||
import androidx.compose.foundation.isSystemInDarkTheme
|
||||
import androidx.compose.material3.MaterialTheme
|
||||
import androidx.compose.material3.darkColorScheme
|
||||
import androidx.compose.material3.dynamicDarkColorScheme
|
||||
import androidx.compose.material3.dynamicLightColorScheme
|
||||
import androidx.compose.material3.lightColorScheme
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.runtime.SideEffect
|
||||
import androidx.compose.ui.graphics.toArgb
|
||||
import androidx.compose.ui.platform.LocalContext
|
||||
import androidx.compose.ui.platform.LocalView
|
||||
import androidx.core.view.ViewCompat
|
||||
|
||||
private val DarkColorScheme = darkColorScheme(
|
||||
primary = Purple80,
|
||||
secondary = PurpleGrey80,
|
||||
tertiary = Pink80
|
||||
)
|
||||
|
||||
private val LightColorScheme = lightColorScheme(
|
||||
primary = Purple40,
|
||||
secondary = PurpleGrey40,
|
||||
tertiary = Pink40
|
||||
|
||||
/* Other default colors to override
|
||||
background = Color(0xFFFFFBFE),
|
||||
surface = Color(0xFFFFFBFE),
|
||||
onPrimary = Color.White,
|
||||
onSecondary = Color.White,
|
||||
onTertiary = Color.White,
|
||||
onBackground = Color(0xFF1C1B1F),
|
||||
onSurface = Color(0xFF1C1B1F),
|
||||
*/
|
||||
)
|
||||
|
||||
@Composable
|
||||
fun WhisperCppDemoTheme(
|
||||
darkTheme: Boolean = isSystemInDarkTheme(),
|
||||
// Dynamic color is available on Android 12+
|
||||
dynamicColor: Boolean = true,
|
||||
content: @Composable () -> Unit
|
||||
) {
|
||||
val colorScheme = when {
|
||||
dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> {
|
||||
val context = LocalContext.current
|
||||
if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context)
|
||||
}
|
||||
darkTheme -> DarkColorScheme
|
||||
else -> LightColorScheme
|
||||
}
|
||||
val view = LocalView.current
|
||||
if (!view.isInEditMode) {
|
||||
SideEffect {
|
||||
(view.context as Activity).window.statusBarColor = colorScheme.primary.toArgb()
|
||||
ViewCompat.getWindowInsetsController(view)?.isAppearanceLightStatusBars = darkTheme
|
||||
}
|
||||
}
|
||||
|
||||
MaterialTheme(
|
||||
colorScheme = colorScheme,
|
||||
typography = Typography,
|
||||
content = content
|
||||
)
|
||||
}
|
@ -1,34 +0,0 @@
|
||||
package com.whispercppdemo.ui.theme
|
||||
|
||||
import androidx.compose.material3.Typography
|
||||
import androidx.compose.ui.text.TextStyle
|
||||
import androidx.compose.ui.text.font.FontFamily
|
||||
import androidx.compose.ui.text.font.FontWeight
|
||||
import androidx.compose.ui.unit.sp
|
||||
|
||||
// Set of Material typography styles to start with
|
||||
val Typography = Typography(
|
||||
bodyLarge = TextStyle(
|
||||
fontFamily = FontFamily.Default,
|
||||
fontWeight = FontWeight.Normal,
|
||||
fontSize = 16.sp,
|
||||
lineHeight = 24.sp,
|
||||
letterSpacing = 0.5.sp
|
||||
)
|
||||
/* Other default text styles to override
|
||||
titleLarge = TextStyle(
|
||||
fontFamily = FontFamily.Default,
|
||||
fontWeight = FontWeight.Normal,
|
||||
fontSize = 22.sp,
|
||||
lineHeight = 28.sp,
|
||||
letterSpacing = 0.sp
|
||||
),
|
||||
labelSmall = TextStyle(
|
||||
fontFamily = FontFamily.Default,
|
||||
fontWeight = FontWeight.Medium,
|
||||
fontSize = 11.sp,
|
||||
lineHeight = 16.sp,
|
||||
letterSpacing = 0.5.sp
|
||||
)
|
||||
*/
|
||||
)
|
@ -1,61 +0,0 @@
|
||||
package com.whispercppdemo.whisper
|
||||
|
||||
import kotlinx.coroutines.*
|
||||
import java.util.concurrent.Executors
|
||||
|
||||
class WhisperContext private constructor(private var ptr: Long) {
|
||||
// Meet Whisper C++ constraint: Don't access from more than one thread at a time.
|
||||
private val scope: CoroutineScope = CoroutineScope(
|
||||
Executors.newSingleThreadExecutor().asCoroutineDispatcher()
|
||||
)
|
||||
|
||||
suspend fun transcribeData(data: FloatArray): String = withContext(scope.coroutineContext) {
|
||||
require(ptr != 0L)
|
||||
WhisperLib.fullTranscribe(ptr, data)
|
||||
val textCount = WhisperLib.getTextSegmentCount(ptr)
|
||||
return@withContext buildString {
|
||||
for (i in 0 until textCount) {
|
||||
append(WhisperLib.getTextSegment(ptr, i))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
suspend fun release() = withContext(scope.coroutineContext) {
|
||||
if (ptr != 0L) {
|
||||
WhisperLib.freeContext(ptr)
|
||||
ptr = 0
|
||||
}
|
||||
}
|
||||
|
||||
protected fun finalize() {
|
||||
runBlocking {
|
||||
release()
|
||||
}
|
||||
}
|
||||
|
||||
companion object {
|
||||
fun createContext(filePath: String): WhisperContext {
|
||||
val ptr = WhisperLib.initContext(filePath)
|
||||
if (ptr == 0L) {
|
||||
throw java.lang.RuntimeException("Couldn't create context with path $filePath")
|
||||
}
|
||||
return WhisperContext(ptr)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class WhisperLib {
|
||||
companion object {
|
||||
init {
|
||||
System.loadLibrary("whisper")
|
||||
}
|
||||
|
||||
// JNI methods
|
||||
external fun initContext(modelPath: String): Long
|
||||
external fun freeContext(contextPtr: Long)
|
||||
external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
|
||||
external fun getTextSegmentCount(contextPtr: Long): Int
|
||||
external fun getTextSegment(contextPtr: Long, index: Int): String
|
||||
}
|
||||
}
|
||||
|
@ -1,22 +0,0 @@
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
WHISPER_LIB_DIR := $(LOCAL_PATH)/../../../../../../../
LOCAL_LDLIBS := -llog
LOCAL_MODULE := libwhisper

# Make the final output library smaller by only keeping the symbols referenced from the app.
ifneq ($(APP_OPTIM),debug)
    LOCAL_CFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
    LOCAL_CFLAGS += -ffunction-sections -fdata-sections
    LOCAL_LDFLAGS += -Wl,--gc-sections
    LOCAL_LDFLAGS += -Wl,--exclude-libs,ALL
    LOCAL_LDFLAGS += -flto
endif

LOCAL_CFLAGS += -DSTDC_HEADERS -std=c11 -I $(WHISPER_LIB_DIR)
LOCAL_CPPFLAGS += -std=c++11
LOCAL_SRC_FILES := $(WHISPER_LIB_DIR)/ggml.c \
                   $(WHISPER_LIB_DIR)/whisper.cpp \
                   $(LOCAL_PATH)/jni.c

include $(BUILD_SHARED_LIBRARY)
@ -1 +0,0 @@
APP_STL := c++_static
@ -1,93 +0,0 @@
|
||||
#include <jni.h>
|
||||
#include <android/log.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/sysinfo.h>
|
||||
#include "whisper.h"
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
#define TAG "JNI"
|
||||
|
||||
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
|
||||
|
||||
static inline int min(int a, int b) {
|
||||
return (a < b) ? a : b;
|
||||
}
|
||||
|
||||
static inline int max(int a, int b) {
|
||||
return (a > b) ? a : b;
|
||||
}
|
||||
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContext(
|
||||
JNIEnv *env, jobject thiz, jstring model_path_str) {
|
||||
UNUSED(thiz);
|
||||
struct whisper_context *context = NULL;
|
||||
const char *model_path_chars = (*env)->GetStringUTFChars(env, model_path_str, NULL);
|
||||
context = whisper_init(model_path_chars);
|
||||
(*env)->ReleaseStringUTFChars(env, model_path_str, model_path_chars);
|
||||
return (jlong) context;
|
||||
}
|
||||
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_freeContext(
|
||||
JNIEnv *env, jobject thiz, jlong context_ptr) {
|
||||
UNUSED(env);
|
||||
UNUSED(thiz);
|
||||
struct whisper_context *context = (struct whisper_context *) context_ptr;
|
||||
whisper_free(context);
|
||||
}
|
||||
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
|
||||
JNIEnv *env, jobject thiz, jlong context_ptr, jfloatArray audio_data) {
|
||||
UNUSED(thiz);
|
||||
struct whisper_context *context = (struct whisper_context *) context_ptr;
|
||||
jfloat *audio_data_arr = (*env)->GetFloatArrayElements(env, audio_data, NULL);
|
||||
const jsize audio_data_length = (*env)->GetArrayLength(env, audio_data);
|
||||
|
||||
// Leave 2 processors free (i.e. the high-efficiency cores).
|
||||
int max_threads = max(1, min(8, get_nprocs() - 2));
|
||||
LOGI("Selecting %d threads", max_threads);
|
||||
|
||||
// The below adapted from the Objective-C iOS sample
|
||||
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
params.print_realtime = true;
|
||||
params.print_progress = false;
|
||||
params.print_timestamps = true;
|
||||
params.print_special = false;
|
||||
params.translate = false;
|
||||
params.language = "en";
|
||||
params.n_threads = max_threads;
|
||||
params.offset_ms = 0;
|
||||
params.no_context = true;
|
||||
params.single_segment = false;
|
||||
|
||||
whisper_reset_timings(context);
|
||||
|
||||
LOGI("About to run whisper_full");
|
||||
if (whisper_full(context, params, audio_data_arr, audio_data_length) != 0) {
|
||||
LOGI("Failed to run the model");
|
||||
} else {
|
||||
whisper_print_timings(context);
|
||||
}
|
||||
(*env)->ReleaseFloatArrayElements(env, audio_data, audio_data_arr, JNI_ABORT);
|
||||
}
|
||||
|
||||
JNIEXPORT jint JNICALL
|
||||
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegmentCount(
|
||||
JNIEnv *env, jobject thiz, jlong context_ptr) {
|
||||
UNUSED(env);
|
||||
UNUSED(thiz);
|
||||
struct whisper_context *context = (struct whisper_context *) context_ptr;
|
||||
return whisper_full_n_segments(context);
|
||||
}
|
||||
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegment(
|
||||
JNIEnv *env, jobject thiz, jlong context_ptr, jint index) {
|
||||
UNUSED(thiz);
|
||||
struct whisper_context *context = (struct whisper_context *) context_ptr;
|
||||
const char *text = whisper_full_get_segment_text(context, index);
|
||||
jstring string = (*env)->NewStringUTF(env, text);
|
||||
return string;
|
||||
}
|
@ -1,170 +0,0 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<vector xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
android:width="108dp"
|
||||
android:height="108dp"
|
||||
android:viewportWidth="108"
|
||||
android:viewportHeight="108">
|
||||
<path
|
||||
android:fillColor="#3DDC84"
|
||||
android:pathData="M0,0h108v108h-108z" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M9,0L9,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,0L19,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M29,0L29,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M39,0L39,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M49,0L49,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M59,0L59,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M69,0L69,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M79,0L79,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M89,0L89,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M99,0L99,108"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,9L108,9"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,19L108,19"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,29L108,29"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,39L108,39"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,49L108,49"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,59L108,59"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,69L108,69"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,79L108,79"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,89L108,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M0,99L108,99"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,29L89,29"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,39L89,39"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,49L89,49"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,59L89,59"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,69L89,69"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M19,79L89,79"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M29,19L29,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M39,19L39,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M49,19L49,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M59,19L59,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M69,19L69,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
<path
|
||||
android:fillColor="#00000000"
|
||||
android:pathData="M79,19L79,89"
|
||||
android:strokeWidth="0.8"
|
||||
android:strokeColor="#33FFFFFF" />
|
||||
</vector>
|
@ -1,30 +0,0 @@
|
||||
<vector xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
xmlns:aapt="http://schemas.android.com/aapt"
|
||||
android:width="108dp"
|
||||
android:height="108dp"
|
||||
android:viewportWidth="108"
|
||||
android:viewportHeight="108">
|
||||
<path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
|
||||
<aapt:attr name="android:fillColor">
|
||||
<gradient
|
||||
android:endX="85.84757"
|
||||
android:endY="92.4963"
|
||||
android:startX="42.9492"
|
||||
android:startY="49.59793"
|
||||
android:type="linear">
|
||||
<item
|
||||
android:color="#44000000"
|
||||
android:offset="0.0" />
|
||||
<item
|
||||
android:color="#00000000"
|
||||
android:offset="1.0" />
|
||||
</gradient>
|
||||
</aapt:attr>
|
||||
</path>
|
||||
<path
|
||||
android:fillColor="#FFFFFF"
|
||||
android:fillType="nonZero"
|
||||
android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
|
||||
android:strokeWidth="1"
|
||||
android:strokeColor="#00000000" />
|
||||
</vector>
|
@ -1,5 +0,0 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
|
||||
<background android:drawable="@drawable/ic_launcher_background" />
|
||||
<foreground android:drawable="@drawable/ic_launcher_foreground" />
|
||||
</adaptive-icon>
|
@ -1,10 +0,0 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<resources>
|
||||
<color name="purple_200">#FFBB86FC</color>
|
||||
<color name="purple_500">#FF6200EE</color>
|
||||
<color name="purple_700">#FF3700B3</color>
|
||||
<color name="teal_200">#FF03DAC5</color>
|
||||
<color name="teal_700">#FF018786</color>
|
||||
<color name="black">#FF000000</color>
|
||||
<color name="white">#FFFFFFFF</color>
|
||||
</resources>
|
@ -1,3 +0,0 @@
|
||||
<resources>
|
||||
<string name="app_name">WhisperCppDemo</string>
|
||||
</resources>
|
@ -1,5 +0,0 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<resources>
|
||||
|
||||
<style name="Theme.WhisperCppDemo" parent="android:Theme.Material.Light.NoActionBar" />
|
||||
</resources>
|
@ -1,13 +0,0 @@
|
||||
<?xml version="1.0" encoding="utf-8"?><!--
|
||||
Sample backup rules file; uncomment and customize as necessary.
|
||||
See https://developer.android.com/guide/topics/data/autobackup
|
||||
for details.
|
||||
Note: This file is ignored for devices older that API 31
|
||||
See https://developer.android.com/about/versions/12/backup-restore
|
||||
-->
|
||||
<full-backup-content>
|
||||
<!--
|
||||
<include domain="sharedpref" path="."/>
|
||||
<exclude domain="sharedpref" path="device.xml"/>
|
||||
-->
|
||||
</full-backup-content>
|
@ -1,19 +0,0 @@
|
||||
<?xml version="1.0" encoding="utf-8"?><!--
|
||||
Sample data extraction rules file; uncomment and customize as necessary.
|
||||
See https://developer.android.com/about/versions/12/backup-restore#xml-changes
|
||||
for details.
|
||||
-->
|
||||
<data-extraction-rules>
|
||||
<cloud-backup>
|
||||
<!-- TODO: Use <include> and <exclude> to control what is backed up.
|
||||
<include .../>
|
||||
<exclude .../>
|
||||
-->
|
||||
</cloud-backup>
|
||||
<!--
|
||||
<device-transfer>
|
||||
<include .../>
|
||||
<exclude .../>
|
||||
</device-transfer>
|
||||
-->
|
||||
</data-extraction-rules>
|
@ -1,17 +0,0 @@
|
||||
package com.whispercppdemo
|
||||
|
||||
import org.junit.Test
|
||||
|
||||
import org.junit.Assert.*
|
||||
|
||||
/**
|
||||
* Example local unit test, which will execute on the development machine (host).
|
||||
*
|
||||
* See [testing documentation](http://d.android.com/tools/testing).
|
||||
*/
|
||||
class ExampleUnitTest {
|
||||
@Test
|
||||
fun addition_isCorrect() {
|
||||
assertEquals(4, 2 + 2)
|
||||
}
|
||||
}
|
@ -1,6 +0,0 @@
|
||||
// Top-level build file where you can add configuration options common to all sub-projects/modules.
|
||||
plugins {
|
||||
id 'com.android.application' version '7.3.1' apply false
|
||||
id 'com.android.library' version '7.3.1' apply false
|
||||
id 'org.jetbrains.kotlin.android' version '1.7.10' apply false
|
||||
}
|
@ -1,23 +0,0 @@
|
||||
# Project-wide Gradle settings.
|
||||
# IDE (e.g. Android Studio) users:
|
||||
# Gradle settings configured through the IDE *will override*
|
||||
# any settings specified in this file.
|
||||
# For more details on how to configure your build environment visit
|
||||
# http://www.gradle.org/docs/current/userguide/build_environment.html
|
||||
# Specifies the JVM arguments used for the daemon process.
|
||||
# The setting is particularly useful for tweaking memory settings.
|
||||
org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
|
||||
# When configured, Gradle will run in incubating parallel mode.
|
||||
# This option should only be used with decoupled projects. More details, visit
|
||||
# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
|
||||
# org.gradle.parallel=true
|
||||
# AndroidX package structure to make it clearer which packages are bundled with the
|
||||
# Android operating system, and which are packaged with your app's APK
|
||||
# https://developer.android.com/topic/libraries/support-library/androidx-rn
|
||||
android.useAndroidX=true
|
||||
# Kotlin code style for this project: "official" or "obsolete":
|
||||
kotlin.code.style=official
|
||||
# Enables namespacing of each library's R class so that its R class includes only the
|
||||
# resources declared in the library itself and none from the library's dependencies,
|
||||
# thereby reducing the size of the R class for that library
|
||||
android.nonTransitiveRClass=true
|
Binary file not shown.
@ -1,6 +0,0 @@
|
||||
#Wed Dec 14 10:37:24 EST 2022
|
||||
distributionBase=GRADLE_USER_HOME
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
|
||||
distributionPath=wrapper/dists
|
||||
zipStorePath=wrapper/dists
|
||||
zipStoreBase=GRADLE_USER_HOME
|
185
examples/whisper.android/gradlew
vendored
@ -1,185 +0,0 @@
|
||||
#!/usr/bin/env sh
|
||||
|
||||
#
|
||||
# Copyright 2015 the original author or authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Gradle start up script for UN*X
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
# Attempt to set APP_HOME
|
||||
# Resolve links: $0 may be a link
|
||||
PRG="$0"
|
||||
# Need this for relative symlinks.
|
||||
while [ -h "$PRG" ] ; do
|
||||
ls=`ls -ld "$PRG"`
|
||||
link=`expr "$ls" : '.*-> \(.*\)$'`
|
||||
if expr "$link" : '/.*' > /dev/null; then
|
||||
PRG="$link"
|
||||
else
|
||||
PRG=`dirname "$PRG"`"/$link"
|
||||
fi
|
||||
done
|
||||
SAVED="`pwd`"
|
||||
cd "`dirname \"$PRG\"`/" >/dev/null
|
||||
APP_HOME="`pwd -P`"
|
||||
cd "$SAVED" >/dev/null
|
||||
|
||||
APP_NAME="Gradle"
|
||||
APP_BASE_NAME=`basename "$0"`
|
||||
|
||||
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
||||
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
|
||||
|
||||
# Use the maximum available, or set MAX_FD != -1 to use that value.
|
||||
MAX_FD="maximum"
|
||||
|
||||
warn () {
|
||||
echo "$*"
|
||||
}
|
||||
|
||||
die () {
|
||||
echo
|
||||
echo "$*"
|
||||
echo
|
||||
exit 1
|
||||
}
|
||||
|
||||
# OS specific support (must be 'true' or 'false').
|
||||
cygwin=false
|
||||
msys=false
|
||||
darwin=false
|
||||
nonstop=false
|
||||
case "`uname`" in
|
||||
CYGWIN* )
|
||||
cygwin=true
|
||||
;;
|
||||
Darwin* )
|
||||
darwin=true
|
||||
;;
|
||||
MINGW* )
|
||||
msys=true
|
||||
;;
|
||||
NONSTOP* )
|
||||
nonstop=true
|
||||
;;
|
||||
esac
|
||||
|
||||
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
|
||||
|
||||
|
||||
# Determine the Java command to use to start the JVM.
|
||||
if [ -n "$JAVA_HOME" ] ; then
|
||||
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
|
||||
# IBM's JDK on AIX uses strange locations for the executables
|
||||
JAVACMD="$JAVA_HOME/jre/sh/java"
|
||||
else
|
||||
JAVACMD="$JAVA_HOME/bin/java"
|
||||
fi
|
||||
if [ ! -x "$JAVACMD" ] ; then
|
||||
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
|
||||
|
||||
Please set the JAVA_HOME variable in your environment to match the
|
||||
location of your Java installation."
|
||||
fi
|
||||
else
|
||||
JAVACMD="java"
|
||||
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
||||
|
||||
Please set the JAVA_HOME variable in your environment to match the
|
||||
location of your Java installation."
|
||||
fi
|
||||
|
||||
# Increase the maximum file descriptors if we can.
|
||||
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
|
||||
MAX_FD_LIMIT=`ulimit -H -n`
|
||||
if [ $? -eq 0 ] ; then
|
||||
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
|
||||
MAX_FD="$MAX_FD_LIMIT"
|
||||
fi
|
||||
ulimit -n $MAX_FD
|
||||
if [ $? -ne 0 ] ; then
|
||||
warn "Could not set maximum file descriptor limit: $MAX_FD"
|
||||
fi
|
||||
else
|
||||
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
|
||||
fi
|
||||
fi
|
||||
|
||||
# For Darwin, add options to specify how the application appears in the dock
|
||||
if $darwin; then
|
||||
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
|
||||
fi
|
||||
|
||||
# For Cygwin or MSYS, switch paths to Windows format before running java
|
||||
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
|
||||
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
|
||||
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
|
||||
|
||||
JAVACMD=`cygpath --unix "$JAVACMD"`
|
||||
|
||||
# We build the pattern for arguments to be converted via cygpath
|
||||
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
|
||||
SEP=""
|
||||
for dir in $ROOTDIRSRAW ; do
|
||||
ROOTDIRS="$ROOTDIRS$SEP$dir"
|
||||
SEP="|"
|
||||
done
|
||||
OURCYGPATTERN="(^($ROOTDIRS))"
|
||||
# Add a user-defined pattern to the cygpath arguments
|
||||
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
|
||||
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
|
||||
fi
|
||||
# Now convert the arguments - kludge to limit ourselves to /bin/sh
|
||||
i=0
|
||||
for arg in "$@" ; do
|
||||
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
|
||||
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
|
||||
|
||||
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
|
||||
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
|
||||
else
|
||||
eval `echo args$i`="\"$arg\""
|
||||
fi
|
||||
i=`expr $i + 1`
|
||||
done
|
||||
case $i in
|
||||
0) set -- ;;
|
||||
1) set -- "$args0" ;;
|
||||
2) set -- "$args0" "$args1" ;;
|
||||
3) set -- "$args0" "$args1" "$args2" ;;
|
||||
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
|
||||
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
|
||||
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
|
||||
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
|
||||
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
|
||||
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# Escape application args
|
||||
save () {
|
||||
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
|
||||
echo " "
|
||||
}
|
||||
APP_ARGS=`save "$@"`
|
||||
|
||||
# Collect all arguments for the java command, following the shell quoting and substitution rules
|
||||
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
|
||||
|
||||
exec "$JAVACMD" "$@"
|
89
examples/whisper.android/gradlew.bat
vendored
@ -1,89 +0,0 @@
@rem
@rem Copyright 2015 the original author or authors.
@rem
@rem Licensed under the Apache License, Version 2.0 (the "License");
@rem you may not use this file except in compliance with the License.
@rem You may obtain a copy of the License at
@rem
@rem      https://www.apache.org/licenses/LICENSE-2.0
@rem
@rem Unless required by applicable law or agreed to in writing, software
@rem distributed under the License is distributed on an "AS IS" BASIS,
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem

@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem  Gradle startup script for Windows
@rem
@rem ##########################################################################

@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal

set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%

@rem Resolve any "." and ".." in APP_HOME to make it shorter.
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi

@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"

@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome

set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto execute

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe

if exist "%JAVA_EXE%" goto execute

echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:execute
@rem Setup the command line

set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar


@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*

:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd

:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1

:mainEnd
if "%OS%"=="Windows_NT" endlocal

:omega
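For readers who rarely touch Windows batch, the java.exe discovery in the deleted gradlew.bat above boils down to: use JAVA_HOME if it is set, otherwise rely on whatever `java` resolves to on PATH, otherwise print the error messages and jump to `:fail`. The sketch below paraphrases that flow; it is an illustration only, not part of the Gradle wrapper, and the helper names are made up:

```kotlin
import java.io.File

// Rough paraphrase of the java.exe lookup in the deleted gradlew.bat (illustration only).
fun findJavaExe(): File? {
    val javaHome = System.getenv("JAVA_HOME")
    if (javaHome != null) {
        // JAVA_HOME is defined: the script strips quotes and expects %JAVA_HOME%/bin/java.exe
        val exe = File(javaHome.trim('"'), "bin/java.exe")
        return if (exe.isFile) exe else null   // invalid JAVA_HOME -> the script goes to :fail
    }
    // No JAVA_HOME: the script runs `java.exe -version` and relies on the shell's PATH lookup
    return System.getenv("PATH")
        ?.split(File.pathSeparator)
        ?.map { File(it, "java.exe") }
        ?.firstOrNull { it.isFile }
}

fun main() {
    val java = findJavaExe()
    if (java == null) {
        System.err.println("ERROR: no usable java.exe found via JAVA_HOME or PATH")
    } else {
        println("Would launch GradleWrapperMain with: $java")
    }
}
```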
@ -1,10 +0,0 @@
## This file is automatically generated by Android Studio.
# Do not modify this file -- YOUR CHANGES WILL BE ERASED!
#
# This file should *NOT* be checked into Version Control Systems,
# as it contains information specific to your local configuration.
#
# Location of the SDK. This is only used by Gradle.
# For customization when using a Version Control System, please read the
# header note.
sdk.dir=/Users/kevin/Library/Android/sdk
@ -1,16 +0,0 @@
pluginManagement {
    repositories {
        gradlePluginPortal()
        google()
        mavenCentral()
    }
}
dependencyResolutionManagement {
    repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
    repositories {
        google()
        mavenCentral()
    }
}
rootProject.name = "WhisperCppDemo"
include ':app'
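For comparison, the removed Groovy settings script above maps almost one-to-one onto the Gradle Kotlin DSL. The sketch below is only an illustration under the assumption of the same single ':app' module layout; the project itself used the Groovy file shown above, not this one:

```kotlin
// settings.gradle.kts — hypothetical Kotlin DSL equivalent of the Groovy settings above
pluginManagement {
    repositories {
        gradlePluginPortal()
        google()
        mavenCentral()
    }
}
dependencyResolutionManagement {
    // Fail the build if a module tries to declare its own repositories
    repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
    repositories {
        google()
        mavenCentral()
    }
}
rootProject.name = "WhisperCppDemo"
include(":app")
```

The only syntactic differences are that `include` becomes a function call and strings use double quotes.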
@ -5,10 +5,6 @@ The inference runs locally, on-device.

https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4

Real-time transcription demo:

https://user-images.githubusercontent.com/1991296/204126266-ce4177c6-6eca-4bd9-bca8-0e46d9da2364.mp4

## Usage

```java
@ -309,7 +309,6 @@
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = P8JZH34X63;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
@ -337,7 +336,6 @@
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = P8JZH34X63;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21507" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
<device id="retina6_0" orientation="portrait" appearance="light"/>
<dependencies>
<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21505"/>
<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/>
<capability name="Safe area layout guides" minToolsVersion="9.0"/>
<capability name="System colors in document resources" minToolsVersion="11.0"/>
<capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
@ -40,7 +40,7 @@
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<color key="backgroundColor" systemColor="systemBackgroundColor"/>
<color key="textColor" systemColor="labelColor"/>
<fontDescription key="fontDescription" name="Georgia" family="Georgia" pointSize="16"/>
<fontDescription key="fontDescription" type="system" pointSize="20"/>
<textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
</textView>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
@ -56,18 +56,6 @@
<action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
</connections>
</button>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="AaW-T2-Ndw">
<rect key="frame" x="199" y="191" width="156" height="49"/>
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
<color key="tintColor" systemColor="opaqueSeparatorColor"/>
<state key="normal" title="Real-time">
<color key="titleColor" systemColor="labelColor"/>
</state>
<connections>
<action selector="onRealtime:" destination="BYZ-38-t0r" eventType="touchUpInside" id="nhn-jT-aQJ"/>
</connections>
</button>
</subviews>
<viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
<color key="backgroundColor" systemColor="systemBackgroundColor"/>
@ -76,7 +64,6 @@
</constraints>
</view>
<connections>
<outlet property="buttonRealtime" destination="AaW-T2-Ndw" id="gcU-Ol-BOo"/>
<outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
<outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
<outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>
Some files were not shown because too many files have changed in this diff.