Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-06-24 17:15:19 +00:00)

Compare commits: v1.3.0 ... coreml-wit
3 commits: 0244810697, 6efb04fc72, ee0d6ff473
.github/workflows/build.yml (vendored, 41 lines changed)
@@ -265,44 +265,3 @@ jobs:
           popd
           emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
           make
-
-  ios:
-    runs-on: macos-latest
-
-    strategy:
-      matrix:
-        build: [Release]
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v1
-
-      - name: Configure
-        run: cp models/for-tests-ggml-base.en.bin models/ggml-base.en.bin
-
-      - name: Build objc example
-        run: xcodebuild -project examples/whisper.objc/whisper.objc.xcodeproj -scheme whisper.objc -configuration ${{ matrix.build }} -sdk iphonesimulator build
-
-      - name: Build swiftui example
-        run: xcodebuild -project examples/whisper.swiftui/whisper.swiftui.xcodeproj -scheme WhisperCppDemo -configuration ${{ matrix.build }} -sdk iphonesimulator build
-
-  android:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v1
-
-      - name: Install Java
-        uses: actions/setup-java@v3
-        with:
-          distribution: zulu
-          java-version: 17
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@v2
-
-      - name: Build
-        run: |
-          cd examples/whisper.android
-          ./gradlew assembleRelease --no-daemon
.gitignore (vendored, 9 lines changed)
@@ -1,8 +1,8 @@
 *.o
 *.a
+*.mlmodel
+*.mlmodelc
 .cache/
-.coreml/
-.test/
 .vs/
 .vscode/
 .DS_Store
@@ -20,7 +20,6 @@ build-sanitize-thread/
 /stream
 /command
 /talk
-/talk-llama
 /bench
 
 arm_neon.h
@@ -35,7 +34,3 @@ examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
 examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
 
 extra/bench-gg.txt
-
-models/*.mlmodel
-models/*.mlmodelc
-models/*.mlpackage
CMakeLists.txt

@@ -1,10 +1,6 @@
 cmake_minimum_required (VERSION 3.0)
 
-project(whisper.cpp VERSION 1.3.0)
-
-if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-    add_compile_options(/utf-8)
-endif ()
+project(whisper.cpp VERSION 1.2.1)
 
 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
LICENSE (2 lines changed)
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 Georgi Gerganov
+Copyright (c) 2022 Georgi Gerganov
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
Makefile (44 lines changed)
@@ -36,7 +36,7 @@ LDFLAGS =
 
 # ref: https://github.com/ggerganov/whisper.cpp/issues/37
 ifneq ($(wildcard /usr/include/musl/*),)
-CFLAGS   += -D_POSIX_SOURCE -D_GNU_SOURCE
+CFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
 CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
 endif
 
@@ -77,6 +77,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 		CFLAGS += -mavx2
 	endif
 else ifeq ($(UNAME_S),Linux)
+	AVX1_M := $(shell grep "avx " /proc/cpuinfo)
+	ifneq (,$(findstring avx,$(AVX1_M)))
+		CFLAGS += -mavx
+	endif
 	AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
 	ifneq (,$(findstring avx2,$(AVX2_M)))
 		CFLAGS += -mavx2
@@ -88,17 +92,16 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	F16C_M := $(shell grep "f16c " /proc/cpuinfo)
 	ifneq (,$(findstring f16c,$(F16C_M)))
 		CFLAGS += -mf16c
-
-		AVX1_M := $(shell grep "avx " /proc/cpuinfo)
-		ifneq (,$(findstring avx,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
 	endif
 	SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
 	ifneq (,$(findstring sse3,$(SSE3_M)))
 		CFLAGS += -msse3
 	endif
 else ifeq ($(UNAME_S),Haiku)
+	AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
+	ifneq (,$(findstring avx,$(AVX1_M)))
+		CFLAGS += -mavx
+	endif
 	AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ")
 	ifneq (,$(findstring avx2,$(AVX2_M)))
 		CFLAGS += -mavx2
@@ -110,11 +113,6 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	F16C_M := $(shell sysinfo -cpu | grep "F16C ")
 	ifneq (,$(findstring f16c,$(F16C_M)))
 		CFLAGS += -mf16c
-
-		AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
-		ifneq (,$(findstring avx,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
 	endif
 else
 	CFLAGS += -mfma -mf16c -mavx -mavx2
@@ -157,15 +155,12 @@ ifneq ($(filter aarch64%,$(UNAME_M)),)
 	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
-	# 32-bit Raspberry Pi 1, 2, 3
-	CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access
+	# Raspberry Pi 1, 2, 3
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
 endif
 ifneq ($(filter armv7%,$(UNAME_M)),)
-	# 32-bit ARM, for example on Armbian or possibly raspbian
-	CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
-
-	# 64-bit ARM, use these (TODO: auto-detect 64-bit)
-	# CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+	# Raspberry Pi 4
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 endif
 ifneq ($(filter armv8%,$(UNAME_M)),)
 	# Raspberry Pi 4
@@ -187,7 +182,7 @@ $(info I CC: $(CCV))
 $(info I CXX: $(CXXV))
 $(info )
 
-default: main bench
+default: main
 
 #
 # Build library
@@ -196,7 +191,7 @@ default: main bench
 ggml.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) -c ggml.c -o ggml.o
 
-whisper.o: whisper.cpp whisper.h ggml.h
+whisper.o: whisper.cpp whisper.h
 	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
 
 ifndef WHISPER_COREML
@@ -218,7 +213,7 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
 
 clean:
-	rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
 
 #
 # Examples
@@ -233,9 +228,6 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
 	./main -h
 
-bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
-
 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
 
@@ -245,8 +237,8 @@ command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(W
 talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
 
-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
+bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
 
 #
 # Audio samples
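Note on the x86 hunks above: both branches choose `-mavx`, `-mavx2`, `-mf16c` and `-msse3` at build time by grepping `/proc/cpuinfo` (or `sysinfo -cpu` on Haiku); the diff only moves the AVX1 probe out of the nested F16C branch. A quick way to cross-check what the host CPU actually reports, independent of the Makefile, is the compiler's CPU feature built-ins. A minimal sketch, assuming GCC or Clang on x86 (illustrative, not part of the repository):

```c
// cpu_features.c - print the SIMD features that the Makefile probes for.
// Build: cc cpu_features.c -o cpu_features
#include <stdio.h>

int main(void) {
    __builtin_cpu_init(); // initialize the CPU feature data used below
    printf("avx:  %d\n", __builtin_cpu_supports("avx"));
    printf("avx2: %d\n", __builtin_cpu_supports("avx2"));
    printf("fma:  %d\n", __builtin_cpu_supports("fma"));
    printf("sse3: %d\n", __builtin_cpu_supports("sse3"));
    return 0;
}
```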
README.md (71 lines changed)
@@ -4,12 +4,12 @@
 [](https://opensource.org/licenses/MIT)
 [](https://www.npmjs.com/package/whisper.cpp/)
 
-Beta: [v1.3.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.3.0) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
 
 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
 
 - Plain C/C++ implementation without dependencies
-- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate framework and [Core ML](https://github.com/ggerganov/whisper.cpp#core-ml-support)
+- Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework
 - AVX intrinsics support for x86 architectures
 - VSX intrinsics support for POWER architectures
 - Mixed F16 / F32 precision
@@ -58,9 +58,7 @@ the Accelerate framework utilizes the special-purpose AMX coprocessor available
 
 ## Quick start
 
-First clone the repository.
-
-Then, download one of the Whisper models converted in [ggml format](models). For example:
+First, download one of the Whisper models converted in [ggml format](models). For example:
 
 ```bash
 bash ./models/download-ggml-model.sh base.en
@@ -225,60 +223,6 @@ make large
 | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
 | large  | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
 
-## Core ML support
-
-On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant
-speed-up - more than x3 faster compared with CPU-only execution. Here are the instructions for generating a Core ML model and using it with `whisper.cpp`:
-
-- Install Python dependencies needed for the creation of the Core ML model:
-
-  ```bash
-  pip install ane_transformers
-  pip install openai-whisper
-  pip install coremltools
-  ```
-
-- Generate a Core ML model. For example, to generate a `base.en` model, use:
-
-  ```bash
-  ./models/generate-coreml-model.sh base.en
-  ```
-
-  This will generate the folder `models/ggml-base.en-encoder.mlmodelc`
-
-- Build `whisper.cpp` with Core ML support:
-
-  ```bash
-  # using Makefile
-  make clean
-  WHISPER_COREML=1 make -j
-
-  # using CMake
-  cd build
-  cmake -DWHISPER_COREML=1 ..
-  ```
-
-- Run the examples as usual. For example:
-
-  ```bash
-  ./main -m models/ggml-base.en.bin -f samples/jfk.wav
-
-  ...
-
-  whisper_init_state: loading Core ML model from 'models/ggml-base.en-encoder.mlmodelc'
-  whisper_init_state: first run on a device may take a while ...
-  whisper_init_state: Core ML model loaded
-
-  system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | COREML = 1 |
-
-  ...
-  ```
-
-The first run on a device is slow, since the ANE service compiles the Core ML model to some device-specific format.
-Next runs are faster.
-
-For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
-
 ## Limitations
 
 - Inference only
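The section removed above documents the whole Core ML flow on the v1.3.0 side: generate the encoder model with `models/generate-coreml-model.sh`, build with `WHISPER_COREML=1`, then run the examples unchanged. The key point is that the Core ML path is transparent to callers: `whisper_init_state` loads `models/ggml-base.en-encoder.mlmodelc` next to the ggml model, so application code written against the C API looks the same with or without it. A minimal sketch against the v1.3.0 headers (error handling trimmed; earlier revisions expose `whisper_init` instead of `whisper_init_from_file`):

```c
// transcribe.c - minimal whisper.cpp usage; identical with or without Core ML.
// `pcm` must be 16 kHz mono float PCM samples.
#include <stdio.h>
#include "whisper.h"

int transcribe(const float * pcm, int n_samples) {
    struct whisper_context * ctx = whisper_init_from_file("models/ggml-base.en.bin");
    if (ctx == NULL) {
        return 1;
    }

    struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    if (whisper_full(ctx, wparams, pcm, n_samples) != 0) {
        whisper_free(ctx);
        return 1;
    }

    for (int i = 0; i < whisper_full_n_segments(ctx); ++i) {
        printf("%s\n", whisper_full_get_segment_text(ctx, i));
    }

    whisper_free(ctx);
    return 0;
}
```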
@@ -369,7 +313,7 @@ whisper_print_timings: total time = 32733.52 ms
 ## Real-time audio input example
 
 This is a naive example of performing real-time inference on audio from your microphone.
-The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
+The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continously.
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
 
 ```java
@@ -384,10 +328,6 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
 Adding the `--print-colors` argument will print the transcribed text using an experimental color coding strategy
 to highlight words with high or low confidence:
 
-```java
-./main -m models/ggml-base.en.bin -f samples/gb0.wav --print-colors
-```
-
 <img width="965" alt="image" src="https://user-images.githubusercontent.com/1991296/197356445-311c8643-9397-4e5e-b46e-0b4b4daa2530.png">
 
 ## Controlling the length of the generated text segments (experimental)
@@ -540,7 +480,6 @@ in [models](models).
 - [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
 - [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
 - [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
-  - [exPHAT/SwiftWhisper](https://github.com/exPHAT/SwiftWhisper)
 - [X] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
   - [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
   - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
@@ -548,7 +487,6 @@ in [models](models).
   - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
   - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
 - [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
-- [X] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
 
 ## Examples
 
@@ -562,7 +500,6 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
 | [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
 | [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
 | [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
-| [talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
 | [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
 | [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
 | [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
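The "naive example" wording in the hunk above refers to re-running the whole model on a sliding window of recent audio. Heavily simplified, the half-second loop of `examples/stream` amounts to the following sketch; `get_audio_chunk` is a hypothetical stand-in for the SDL microphone capture the real tool uses:

```c
// stream_sketch.c - naive real-time loop: capture ~0.5 s of 16 kHz mono audio
// per step, keep the last 10 s in a sliding window, and re-run whisper_full().
#include <stdio.h>
#include <string.h>
#include "whisper.h"

enum { SR = 16000, STEP = SR / 2, WIN = SR * 10 };

extern int get_audio_chunk(float * dst, int n_want); // hypothetical mic capture

void stream_loop(struct whisper_context * ctx) {
    static float window[WIN];
    int n = 0;

    struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    wparams.single_segment = true; // one hypothesis per window
    wparams.no_context     = true; // do not condition on the previous window

    for (;;) {
        float chunk[STEP];
        const int got = get_audio_chunk(chunk, STEP);
        if (got <= 0) {
            break;
        }
        if (n + got > WIN) { // slide: drop the oldest samples
            memmove(window, window + (n + got - WIN), (size_t) (WIN - got) * sizeof(float));
            n = WIN - got;
        }
        memcpy(window + n, chunk, (size_t) got * sizeof(float));
        n += got;

        if (whisper_full(ctx, wparams, window, n) == 0 && whisper_full_n_segments(ctx) > 0) {
            printf("%s\n", whisper_full_get_segment_text(ctx, 0)); // latest hypothesis
        }
    }
}
```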
params.go

@@ -105,10 +105,6 @@ func (p *Params) SetMaxSegmentLength(n int) {
 	p.max_len = C.int(n)
 }
 
-func (p *Params) SetTokenTimestamps(b bool) {
-	p.token_timestamps = toBool(b)
-}
-
 // Set max tokens per segment (0 = no limit)
 func (p *Params) SetMaxTokensPerSegment(n int) {
 	p.max_tokens = C.int(n)

context.go

@@ -111,11 +111,6 @@ func (context *context) SetMaxSegmentLength(n uint) {
 	context.params.SetMaxSegmentLength(int(n))
 }
 
-// Set token timestamps flag
-func (context *context) SetTokenTimestamps(b bool) {
-	context.params.SetTokenTimestamps(b)
-}
-
 // Set max tokens per segment (0 = no limit)
 func (context *context) SetMaxTokensPerSegment(n uint) {
 	context.params.SetMaxTokensPerSegment(int(n))
@@ -285,14 +280,10 @@ func toSegment(ctx *whisper.Context, n int) Segment {
 func toTokens(ctx *whisper.Context, n int) []Token {
 	result := make([]Token, ctx.Whisper_full_n_tokens(n))
 	for i := 0; i < len(result); i++ {
-		data := ctx.Whisper_full_get_token_data(n, i)
-
 		result[i] = Token{
-			Id:    int(ctx.Whisper_full_get_token_id(n, i)),
-			Text:  ctx.Whisper_full_get_token_text(n, i),
-			P:     ctx.Whisper_full_get_token_p(n, i),
-			Start: time.Duration(data.T0()) * time.Millisecond * 10,
-			End:   time.Duration(data.T1()) * time.Millisecond * 10,
+			Id:   int(ctx.Whisper_full_get_token_id(n, i)),
+			Text: strings.TrimSpace(ctx.Whisper_full_get_token_text(n, i)),
+			P:    ctx.Whisper_full_get_token_p(n, i),
 		}
 	}
 	return result

interface.go

@@ -41,7 +41,6 @@ type Context interface {
 	SetTokenThreshold(float32)    // Set timestamp token probability threshold
 	SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
 	SetMaxSegmentLength(uint)     // Set max segment length in characters
-	SetTokenTimestamps(bool)      // Set token timestamps flag
 	SetMaxTokensPerSegment(uint)  // Set max tokens per segment (0 = no limit)
 
 	// Process mono audio data and return any errors.
@@ -86,8 +85,7 @@ type Segment struct {
 
 // Token is a text or special token
 type Token struct {
-	Id         int
-	Text       string
-	P          float32
-	Start, End time.Duration
+	Id   int
+	Text string
+	P    float32
 }

whisper.go

@@ -356,7 +356,7 @@ func (ctx *Context) Whisper_full_get_token_id(segment int, token int) Token {
 
 // Get token data for the specified token in the specified segment.
 // This contains probabilities, timestamps, etc.
-func (ctx *Context) Whisper_full_get_token_data(segment int, token int) TokenData {
+func (ctx *Context) whisper_full_get_token_data(segment int, token int) TokenData {
 	return TokenData(C.whisper_full_get_token_data((*C.struct_whisper_context)(ctx), C.int(segment), C.int(token)))
 }
 
@@ -407,11 +407,3 @@ func callEncoderBegin(user_data unsafe.Pointer) C.bool {
 	}
 	return true
 }
-
-func (t TokenData) T0() int64 {
-	return int64(t.t0)
-}
-
-func (t TokenData) T1() int64 {
-	return int64(t.t1)
-}
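The Go hunks above remove the `Start`/`End` token fields and the `T0()`/`T1()` accessors, i.e. the v1.3.0 side of the bindings exposes per-token timestamps and the older branch does not. The underlying C API carries the same data either way: `whisper_full_get_token_data` returns `t0`/`t1` in units of 10 ms, which is exactly why the removed Go code multiplied by `time.Millisecond * 10`. A C sketch of the equivalent (note that meaningful `t0`/`t1` values require `token_timestamps` to be enabled in the full params):

```c
// token_times.c - per-token timestamps straight from the C API; this is the
// information the removed Go Start/End fields wrapped. t0/t1 are in
// centiseconds, hence the * 10 to convert to milliseconds.
#include <stdio.h>
#include "whisper.h"

void print_token_times(struct whisper_context * ctx, int i_segment) {
    const int n_tokens = whisper_full_n_tokens(ctx, i_segment);
    for (int j = 0; j < n_tokens; ++j) {
        const whisper_token_data td = whisper_full_get_token_data(ctx, i_segment, j);
        printf("%8lld ms .. %8lld ms  p=%.3f  '%s'\n",
               (long long) td.t0 * 10,
               (long long) td.t1 * 10,
               (double) whisper_full_get_token_p(ctx, i_segment, j),
               whisper_full_get_token_text(ctx, i_segment, j));
    }
}
```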
Submodule bindings/ios updated: fce433c4b7...92d4c5c9a0
package.json

@@ -1,6 +1,6 @@
 {
   "name": "whisper.cpp",
-  "version": "1.3.0",
+  "version": "1.2.1",
   "description": "Whisper speech recognition",
   "main": "whisper.js",
   "scripts": {
File diff suppressed because one or more lines are too long
coreml/whisper-decoder-impl.h (deleted)

@@ -1,146 +0,0 @@
-//
-// whisper-decoder-impl.h
-//
-// This file was automatically generated and should not be edited.
-//
-
-#import <Foundation/Foundation.h>
-#import <CoreML/CoreML.h>
-#include <stdint.h>
-#include <os/log.h>
-
-NS_ASSUME_NONNULL_BEGIN
-
-
-/// Model Prediction Input Type
-API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
-@interface whisper_decoder_implInput : NSObject<MLFeatureProvider>
-
-/// token_data as 1 by 1 matrix of 32-bit integers
-@property (readwrite, nonatomic, strong) MLMultiArray * token_data;
-
-/// audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats
-@property (readwrite, nonatomic, strong) MLMultiArray * audio_data;
-- (instancetype)init NS_UNAVAILABLE;
-- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data NS_DESIGNATED_INITIALIZER;
-
-@end
-
-
-/// Model Prediction Output Type
-API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
-@interface whisper_decoder_implOutput : NSObject<MLFeatureProvider>
-
-/// var_1346 as multidimensional array of floats
-@property (readwrite, nonatomic, strong) MLMultiArray * var_1346;
-- (instancetype)init NS_UNAVAILABLE;
-- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 NS_DESIGNATED_INITIALIZER;
-
-@end
-
-
-/// Class for model loading and prediction
-API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
-@interface whisper_decoder_impl : NSObject
-@property (readonly, nonatomic, nullable) MLModel * model;
-
-/**
-    URL of the underlying .mlmodelc directory.
-*/
-+ (nullable NSURL *)URLOfModelInThisBundle;
-
-/**
-    Initialize whisper_decoder_impl instance from an existing MLModel object.
-
-    Usually the application does not use this initializer unless it makes a subclass of whisper_decoder_impl.
-    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
-*/
-- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
-
-/**
-    Initialize whisper_decoder_impl instance with the model in this bundle.
-*/
-- (nullable instancetype)init;
-
-/**
-    Initialize whisper_decoder_impl instance with the model in this bundle.
-
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Initialize whisper_decoder_impl instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Initialize whisper_decoder_impl instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Construct whisper_decoder_impl instance asynchronously with configuration.
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
-*/
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler;
-
-/**
-    Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
-
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param modelURL The model URL.
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
-*/
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler;
-
-/**
-    Make a prediction using the standard interface
-    @param input an instance of whisper_decoder_implInput to predict from
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as whisper_decoder_implOutput
-*/
-- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Make a prediction using the standard interface
-    @param input an instance of whisper_decoder_implInput to predict from
-    @param options prediction options
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as whisper_decoder_implOutput
-*/
-- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Make a prediction using the convenience interface
-    @param token_data as 1 by 1 matrix of 32-bit integers:
-    @param audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats:
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as whisper_decoder_implOutput
-*/
-- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Batch prediction
-    @param inputArray array of whisper_decoder_implInput instances to obtain predictions from
-    @param options prediction options
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the predictions as NSArray<whisper_decoder_implOutput *>
-*/
-- (nullable NSArray<whisper_decoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_decoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-@end
-
-NS_ASSUME_NONNULL_END
coreml/whisper-decoder-impl.m (deleted)

@@ -1,201 +0,0 @@
-//
-// whisper-decoder-impl.m
-//
-// This file was automatically generated and should not be edited.
-//
-
-#if !__has_feature(objc_arc)
-#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
-#endif
-
-#import "whisper-decoder-impl.h"
-
-@implementation whisper_decoder_implInput
-
-- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data {
-    self = [super init];
-    if (self) {
-        _token_data = token_data;
-        _audio_data = audio_data;
-    }
-    return self;
-}
-
-- (NSSet<NSString *> *)featureNames {
-    return [NSSet setWithArray:@[@"token_data", @"audio_data"]];
-}
-
-- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
-    if ([featureName isEqualToString:@"token_data"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.token_data];
-    }
-    if ([featureName isEqualToString:@"audio_data"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.audio_data];
-    }
-    return nil;
-}
-
-@end
-
-@implementation whisper_decoder_implOutput
-
-- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 {
-    self = [super init];
-    if (self) {
-        _var_1346 = var_1346;
-    }
-    return self;
-}
-
-- (NSSet<NSString *> *)featureNames {
-    return [NSSet setWithArray:@[@"var_1346"]];
-}
-
-- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
-    if ([featureName isEqualToString:@"var_1346"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.var_1346];
-    }
-    return nil;
-}
-
-@end
-
-@implementation whisper_decoder_impl
-
-
-/**
-    URL of the underlying .mlmodelc directory.
-*/
-+ (nullable NSURL *)URLOfModelInThisBundle {
-    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"whisper_decoder_impl" ofType:@"mlmodelc"];
-    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load whisper-decoder-impl.mlmodelc in the bundle resource"); return nil; }
-    return [NSURL fileURLWithPath:assetPath];
-}
-
-
-/**
-    Initialize whisper_decoder_impl instance from an existing MLModel object.
-
-    Usually the application does not use this initializer unless it makes a subclass of whisper_decoder_impl.
-    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
-*/
-- (instancetype)initWithMLModel:(MLModel *)model {
-    self = [super init];
-    if (!self) { return nil; }
-    _model = model;
-    if (_model == nil) { return nil; }
-    return self;
-}
-
-
-/**
-    Initialize whisper_decoder_impl instance with the model in this bundle.
-*/
-- (nullable instancetype)init {
-    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
-}
-
-
-/**
-    Initialize whisper_decoder_impl instance with the model in this bundle.
-
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
-}
-
-
-/**
-    Initialize whisper_decoder_impl instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
-    if (model == nil) { return nil; }
-    return [self initWithMLModel:model];
-}
-
-
-/**
-    Initialize whisper_decoder_impl instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
-    if (model == nil) { return nil; }
-    return [self initWithMLModel:model];
-}
-
-
-/**
-    Construct whisper_decoder_impl instance asynchronously with configuration.
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
-*/
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler {
-    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
-              configuration:configuration
-          completionHandler:handler];
-}
-
-
-/**
-    Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
-
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param modelURL The model URL.
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
-*/
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler {
-    [MLModel loadContentsOfURL:modelURL
-        configuration:configuration
-        completionHandler:^(MLModel *model, NSError *error) {
-            if (model != nil) {
-                whisper_decoder_impl *typedModel = [[whisper_decoder_impl alloc] initWithMLModel:model];
-                handler(typedModel, nil);
-            } else {
-                handler(nil, error);
-            }
-        }];
-}
-
-- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
-}
-
-- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
-    if (!outFeatures) { return nil; }
-    return [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[outFeatures featureValueForName:@"var_1346"].multiArrayValue];
-}
-
-- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    whisper_decoder_implInput *input_ = [[whisper_decoder_implInput alloc] initWithToken_data:token_data audio_data:audio_data];
-    return [self predictionFromFeatures:input_ error:error];
-}
-
-- (nullable NSArray<whisper_decoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_decoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
-    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
-    if (!outBatch) { return nil; }
-    NSMutableArray<whisper_decoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
-    for (NSInteger i = 0; i < outBatch.count; i++) {
-        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
-        whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[resultProvider featureValueForName:@"var_1346"].multiArrayValue];
-        [results addObject:result];
-    }
-    return results;
-}
-
-@end
coreml/whisper-encoder-impl.h

@@ -1,5 +1,5 @@
 //
-// whisper-encoder-impl.h
+// CoremlEncoder.h
 //
 // This file was automatically generated and should not be edited.
 //
@@ -13,20 +13,20 @@ NS_ASSUME_NONNULL_BEGIN
 
 
 /// Model Prediction Input Type
-API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
-@interface whisper_encoder_implInput : NSObject<MLFeatureProvider>
+API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
+@interface CoremlEncoderInput : NSObject<MLFeatureProvider>
 
-/// logmel_data as 1 × 80 × 3000 3-dimensional array of floats
-@property (readwrite, nonatomic, strong) MLMultiArray * logmel_data;
+/// melSegment as 1 × 80 × 3000 3-dimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * melSegment;
 - (instancetype)init NS_UNAVAILABLE;
-- (instancetype)initWithLogmel_data:(MLMultiArray *)logmel_data NS_DESIGNATED_INITIALIZER;
+- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment NS_DESIGNATED_INITIALIZER;
 
 @end
 
 
 /// Model Prediction Output Type
-API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
-@interface whisper_encoder_implOutput : NSObject<MLFeatureProvider>
+API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
+@interface CoremlEncoderOutput : NSObject<MLFeatureProvider>
 
 /// output as multidimensional array of floats
 @property (readwrite, nonatomic, strong) MLMultiArray * output;
@@ -37,8 +37,8 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
 
 
 /// Class for model loading and prediction
-API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
-@interface whisper_encoder_impl : NSObject
+API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
+@interface CoremlEncoder : NSObject
 @property (readonly, nonatomic, nullable) MLModel * model;
 
 /**
@@ -47,20 +47,20 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
 + (nullable NSURL *)URLOfModelInThisBundle;
 
 /**
-    Initialize whisper_encoder_impl instance from an existing MLModel object.
+    Initialize CoremlEncoder instance from an existing MLModel object.
 
-    Usually the application does not use this initializer unless it makes a subclass of whisper_encoder_impl.
+    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
     Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
 */
 - (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
 
 /**
-    Initialize whisper_encoder_impl instance with the model in this bundle.
+    Initialize CoremlEncoder instance with the model in this bundle.
 */
 - (nullable instancetype)init;
 
 /**
-    Initialize whisper_encoder_impl instance with the model in this bundle.
+    Initialize CoremlEncoder instance with the model in this bundle.
 
     @param configuration The model configuration object
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@@ -68,75 +68,75 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
 - (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
-    Initialize whisper_encoder_impl instance from the model URL.
+    Initialize CoremlEncoder instance from the model URL.
 
-    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
+    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
-    Initialize whisper_encoder_impl instance from the model URL.
+    Initialize CoremlEncoder instance from the model URL.
 
-    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
+    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
     @param configuration The model configuration object
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
-    Construct whisper_encoder_impl instance asynchronously with configuration.
+    Construct CoremlEncoder instance asynchronously with configuration.
     Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
 
     @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
 */
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler;
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
 
 /**
-    Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
+    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
 
     Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
 
     @param modelURL The model URL.
     @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
 */
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler;
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
 
 /**
     Make a prediction using the standard interface
-    @param input an instance of whisper_encoder_implInput to predict from
+    @param input an instance of CoremlEncoderInput to predict from
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as whisper_encoder_implOutput
+    @return the prediction as CoremlEncoderOutput
 */
-- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
     Make a prediction using the standard interface
-    @param input an instance of whisper_encoder_implInput to predict from
+    @param input an instance of CoremlEncoderInput to predict from
     @param options prediction options
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as whisper_encoder_implOutput
+    @return the prediction as CoremlEncoderOutput
 */
-- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
     Make a prediction using the convenience interface
-    @param logmel_data as 1 × 80 × 3000 3-dimensional array of floats:
+    @param melSegment as 1 × 80 × 3000 3-dimensional array of floats:
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as whisper_encoder_implOutput
+    @return the prediction as CoremlEncoderOutput
 */
-- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 
 /**
     Batch prediction
-    @param inputArray array of whisper_encoder_implInput instances to obtain predictions from
+    @param inputArray array of CoremlEncoderInput instances to obtain predictions from
     @param options prediction options
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the predictions as NSArray<whisper_encoder_implOutput *>
+    @return the predictions as NSArray<CoremlEncoderOutput *>
 */
-- (nullable NSArray<whisper_encoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_encoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 @end
 
 NS_ASSUME_NONNULL_END
coreml/whisper-encoder-impl.m

@@ -1,5 +1,5 @@
 //
-// whisper-encoder-impl.m
+// CoremlEncoder.m
 //
 // This file was automatically generated and should not be edited.
 //
@@ -10,30 +10,30 @@
 
 #import "whisper-encoder-impl.h"
 
-@implementation whisper_encoder_implInput
+@implementation CoremlEncoderInput
 
-- (instancetype)initWithLogmel_data:(MLMultiArray *)logmel_data {
+- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment {
     self = [super init];
     if (self) {
-        _logmel_data = logmel_data;
+        _melSegment = melSegment;
     }
     return self;
 }
 
 - (NSSet<NSString *> *)featureNames {
-    return [NSSet setWithArray:@[@"logmel_data"]];
+    return [NSSet setWithArray:@[@"melSegment"]];
 }
 
 - (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
-    if ([featureName isEqualToString:@"logmel_data"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.logmel_data];
+    if ([featureName isEqualToString:@"melSegment"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.melSegment];
     }
     return nil;
 }
 
 @end
 
-@implementation whisper_encoder_implOutput
+@implementation CoremlEncoderOutput
 
 - (instancetype)initWithOutput:(MLMultiArray *)output {
     self = [super init];
     if (self) {
@@ -56,23 +56,23 @@
 
 @end
 
-@implementation whisper_encoder_impl
+@implementation CoremlEncoder
 
 
 /**
     URL of the underlying .mlmodelc directory.
 */
 + (nullable NSURL *)URLOfModelInThisBundle {
-    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"whisper_encoder_impl" ofType:@"mlmodelc"];
-    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load whisper-encoder-impl.mlmodelc in the bundle resource"); return nil; }
+    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"CoremlEncoder" ofType:@"mlmodelc"];
+    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load CoremlEncoder.mlmodelc in the bundle resource"); return nil; }
     return [NSURL fileURLWithPath:assetPath];
 }
 
 
 /**
-    Initialize whisper_encoder_impl instance from an existing MLModel object.
+    Initialize CoremlEncoder instance from an existing MLModel object.
 
-    Usually the application does not use this initializer unless it makes a subclass of whisper_encoder_impl.
+    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
     Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
 */
 - (instancetype)initWithMLModel:(MLModel *)model {
@@ -85,7 +85,7 @@
 
 
 /**
-    Initialize whisper_encoder_impl instance with the model in this bundle.
+    Initialize CoremlEncoder instance with the model in this bundle.
 */
 - (nullable instancetype)init {
     return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
@@ -93,7 +93,7 @@
 
 
 /**
-    Initialize whisper_encoder_impl instance with the model in this bundle.
+    Initialize CoremlEncoder instance with the model in this bundle.
 
     @param configuration The model configuration object
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@@ -104,9 +104,9 @@
 
 
 /**
-    Initialize whisper_encoder_impl instance from the model URL.
+    Initialize CoremlEncoder instance from the model URL.
 
-    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
+    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
@@ -117,9 +117,9 @@
 
 
 /**
-    Initialize whisper_encoder_impl instance from the model URL.
+    Initialize CoremlEncoder instance from the model URL.
 
-    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
+    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
     @param configuration The model configuration object
     @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
@@ -131,13 +131,13 @@
 
 
 /**
-    Construct whisper_encoder_impl instance asynchronously with configuration.
+    Construct CoremlEncoder instance asynchronously with configuration.
     Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
 
     @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
 */
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler {
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
     [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
               configuration:configuration
           completionHandler:handler];
@@ -145,20 +145,20 @@
 
 
 /**
-    Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
+    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
 
     Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
 
     @param modelURL The model URL.
    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
 */
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler {
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
     [MLModel loadContentsOfURL:modelURL
         configuration:configuration
         completionHandler:^(MLModel *model, NSError *error) {
             if (model != nil) {
-                whisper_encoder_impl *typedModel = [[whisper_encoder_impl alloc] initWithMLModel:model];
+                CoremlEncoder *typedModel = [[CoremlEncoder alloc] initWithMLModel:model];
                 handler(typedModel, nil);
             } else {
                 handler(nil, error);
@@ -166,29 +166,29 @@
         }];
 }
 
-- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
 }
 
-- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
     id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
     if (!outFeatures) { return nil; }
-    return [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
+    return [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
 }
 
-- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    whisper_encoder_implInput *input_ = [[whisper_encoder_implInput alloc] initWithLogmel_data:logmel_data];
+- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    CoremlEncoderInput *input_ = [[CoremlEncoderInput alloc] initWithMelSegment:melSegment];
     return [self predictionFromFeatures:input_ error:error];
 }
 
-- (nullable NSArray<whisper_encoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_encoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
     id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
     id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
     if (!outBatch) { return nil; }
-    NSMutableArray<whisper_encoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
+    NSMutableArray<CoremlEncoderOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
     for (NSInteger i = 0; i < outBatch.count; i++) {
         id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
-        whisper_encoder_implOutput * result = [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
+        CoremlEncoderOutput * result = [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
         [results addObject:result];
     }
     return results;
@ -18,7 +18,7 @@ struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {

    NSURL * url_model = [NSURL fileURLWithPath: path_model_str];

    const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model error:nil]);
    const void * data = CFBridgingRetain([[CoremlEncoder alloc] initWithContentsOfURL:url_model error:nil]);

    if (data == NULL) {
        return NULL;
@ -49,16 +49,10 @@ void whisper_coreml_encode(
        error: nil
    ];

    whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];
    CoremlEncoderOutput * outCoreML = [(__bridge id) ctx->data predictionFromMelSegment:inMultiArray error:nil];

    MLMultiArray * outMA = outCoreML.output;

    //NSArray<NSNumber *> * shape = outMA.shape;
    //NSArray<NSNumber *> * strides = outMA.strides;

    //printf("shape: %ld %ld %ld %ld\n", [shape[0] longValue], [shape[1] longValue], [shape[2] longValue], [shape[3] longValue]);
    //printf("strides: %ld %ld %ld %ld\n", [strides[0] longValue], [strides[1] longValue], [strides[2] longValue], [strides[3] longValue]);

    memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
}

@ -63,5 +63,4 @@ else()
    add_subdirectory(command)
    add_subdirectory(bench)
    add_subdirectory(talk)
    add_subdirectory(talk-llama)
endif()
@ -1,22 +1,15 @@
const path = require("path");
const { whisper } = require(path.join(
    __dirname,
    "../../../build/Release/whisper-addon"
));
const { promisify } = require("util");

const whisperAsync = promisify(whisper);
const path = require('path');
const { whisper } = require(path.join(__dirname, '../../../build/Release/whisper-addon'));

const whisperParamsMock = {
    language: "en",
    model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
    fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
    language: 'en',
    model: path.join(__dirname, '../../../models/ggml-base.en.bin'),
    fname_inp: path.join(__dirname, '../../../samples/jfk.wav'),
};

describe("Run whisper.node", () => {
    test("it should receive a non-empty value", async () => {
        let result = await whisperAsync(whisperParamsMock);

        expect(result.length).toBeGreaterThan(0);
    });
    test("it should receive a non-empty value", () => {
        expect(whisper(whisperParamsMock).length).toBeGreaterThan(0);
    });
});
@ -160,6 +160,22 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
        return 3;
    }

    // initial prompt
    std::vector<whisper_token> prompt_tokens;

    if (!params.prompt.empty()) {
        prompt_tokens.resize(1024);
        prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));

        fprintf(stderr, "\n");
        fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
        fprintf(stderr, "initial tokens: [ ");
        for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
            fprintf(stderr, "%d ", prompt_tokens[i]);
        }
        fprintf(stderr, "]\n");
    }

    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
        const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@ -227,7 +243,8 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
    wparams.greedy.best_of = params.best_of;
    wparams.beam_search.beam_size = params.beam_size;

    wparams.initial_prompt = params.prompt.c_str();
    wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
    wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();

    whisper_print_user_data user_data = { &params, &pcmf32s };

@ -8,7 +8,6 @@
#include <string>
#include <thread>
#include <vector>
#include <cstring>

// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
// Lowest is red, middle is yellow, highest is green.
@ -57,7 +56,7 @@ struct whisper_params {
    int32_t duration_ms = 0;
    int32_t max_context = -1;
    int32_t max_len = 0;
    int32_t best_of = 2;
    int32_t best_of = 5;
    int32_t beam_size = -1;

    float word_thold = 0.01f;
@ -75,7 +74,6 @@ struct whisper_params {
    bool output_wts = false;
    bool output_csv = false;
    bool output_jsn = false;
    bool output_lrc = false;
    bool print_special = false;
    bool print_colors = false;
    bool print_progress = false;
@ -131,7 +129,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
    else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
    else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
    else if (arg == "-olrc" || arg == "--output-lrc") { params.output_lrc = true; }
    else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
    else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
    else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
@ -180,7 +177,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
    fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
    fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
    fprintf(stderr, " -olrc, --output-lrc [%-7s] output result in a lrc file\n", params.output_lrc ? "true" : "false");
    fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
    fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
    fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
@ -211,8 +207,8 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper

    std::string speaker = "";

    int64_t t0 = 0;
    int64_t t1 = 0;
    int64_t t0;
    int64_t t1;

    // print the last n_new segments
    const int s0 = n_segments - n_new;
@ -375,39 +371,6 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
    return true;
}

char *escape_double_quotes(const char *str) {
    if (str == NULL) {
        return NULL;
    }

    size_t escaped_length = strlen(str) + 1;

    for (size_t i = 0; str[i] != '\0'; i++) {
        if (str[i] == '"') {
            escaped_length++;
        }
    }

    char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
    if (escaped == NULL) {
        return NULL;
    }

    size_t pos = 0;
    for (size_t i = 0; str[i] != '\0'; i++) {
        if (str[i] == '"') {
            escaped[pos++] = '\\';
            escaped[pos++] = '"';
        } else {
            escaped[pos++] = str[i];
        }
    }

    // no need to set zero due to calloc() being used prior

    return escaped;
}
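The sizing pass in `escape_double_quotes` adds one extra byte per `"` (each quote grows from the one-character `"` to the two-character sequence `\"`), and `calloc` supplies the terminating zero. A small check of that logic, meant to be compiled together with the function above:

```cpp
// Quick check of the escape_double_quotes() sizing logic:
//   input  : say "hi"   (8 bytes + NUL)
//   quotes : 2          -> escaped_length = 9 + 2 = 11
//   output : say \"hi\" (10 bytes + NUL supplied by calloc)
#include <cassert>
#include <cstdlib>
#include <cstring>

char *escape_double_quotes(const char *str); // as defined above

int main() {
    char * escaped = escape_double_quotes("say \"hi\"");
    assert(escaped != nullptr);
    assert(strcmp(escaped, "say \\\"hi\\\"") == 0);
    free(escaped);
    return 0;
}
```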

bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
    std::ofstream fout(fname);
    int indent = 0;
@ -451,9 +414,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper

    auto value_s = [&](const char *name, const char *val, bool end = false) {
        start_value(name);
        char * val_escaped = escape_double_quotes(val);
        fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
        free(val_escaped);
        fout << "\"" << val << (end ? "\"\n" : "\",\n");
    };

    auto end_value = [&](bool end = false) {
@ -494,7 +455,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
    value_i("ctx", whisper_model_n_text_ctx(ctx));
    value_i("state", whisper_model_n_text_state(ctx));
    value_i("head", whisper_model_n_text_head(ctx));
    value_i("layer", whisper_model_n_text_layer(ctx), true);
    value_i("leyer", whisper_model_n_text_layer(ctx), true);
    end_obj();
    value_i("mels", whisper_model_n_mels(ctx));
    value_i("f16", whisper_model_f16(ctx), true);
@ -516,7 +477,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
    const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

    start_obj();
    start_obj("timestamps");
    start_obj("timestanps");
    value_s("from", to_timestamp(t0, true).c_str());
    value_s("to", to_timestamp(t1, true).c_str(), true);
    end_obj();
@ -650,39 +611,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
    return true;
}

bool output_lrc(struct whisper_context * ctx, const char * fname) {

    std::ofstream fout(fname);
    if (!fout.is_open()) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
        return false;
    }

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

    fout << "[by:whisper.cpp]\n";

    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
        const int64_t t = whisper_full_get_segment_t0(ctx, i);

        int64_t msec = t * 10;
        int64_t min = msec / (1000 * 60);
        msec = msec - min * (1000 * 60);
        int64_t sec = msec / 1000;
        msec = msec - sec * 1000;

        char buf[16];
        snprintf(buf, sizeof(buf), "%02d:%02d.%02d", (int) min, (int) sec, (int) ( msec / 10));
        std::string timestamp_lrc = std::string(buf);

        fout << '[' << timestamp_lrc << ']' << text << "\n";
    }

    return true;
}
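The conversion in `output_lrc` relies on whisper segment timestamps being expressed in 10 ms units, hence the `t * 10` to get milliseconds and the final `msec / 10` to get the two fractional digits of the `mm:ss.xx` LRC tag. A worked instance of the arithmetic:

```cpp
// Worked example of the LRC timestamp conversion above, for a segment
// starting at t = 6150 (whisper timestamps are in 10 ms units):
//   msec = 6150 * 10     = 61500
//   min  = 61500 / 60000 = 1    -> msec = 61500 - 60000 = 1500
//   sec  = 1500  / 1000  = 1    -> msec = 1500  - 1000  = 500
//   tag  = [01:01.50]
#include <cstdint>
#include <cstdio>

int main() {
    int64_t t = 6150;

    int64_t msec = t * 10;
    int64_t min  = msec / (1000 * 60);
    msec -= min * (1000 * 60);
    int64_t sec  = msec / 1000;
    msec -= sec * 1000;

    printf("[%02d:%02d.%02d]\n", (int) min, (int) sec, (int) (msec / 10)); // [01:01.50]
    return 0;
}
```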

int main(int argc, char ** argv) {
    whisper_params params;

@ -711,6 +639,22 @@ int main(int argc, char ** argv) {
        return 3;
    }

    // initial prompt
    std::vector<whisper_token> prompt_tokens;

    if (!params.prompt.empty()) {
        prompt_tokens.resize(1024);
        prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));

        fprintf(stderr, "\n");
        fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
        fprintf(stderr, "initial tokens: [ ");
        for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
            fprintf(stderr, "%d ", prompt_tokens[i]);
        }
        fprintf(stderr, "]\n");
    }

    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
        const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@ -774,7 +718,8 @@ int main(int argc, char ** argv) {

    wparams.speed_up = params.speed_up;

    wparams.initial_prompt = params.prompt.c_str();
    wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
    wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();

    wparams.greedy.best_of = params.best_of;
    wparams.beam_search.beam_size = params.beam_size;
@ -849,12 +794,6 @@ int main(int argc, char ** argv) {
        const auto fname_jsn = fname_out + ".json";
        output_json(ctx, fname_jsn.c_str(), params);
    }

    // output to LRC file
    if (params.output_lrc) {
        const auto fname_lrc = fname_out + ".lrc";
        output_lrc(ctx, fname_lrc.c_str());
    }
    }
}

@ -43,7 +43,6 @@ struct whisper_params {

    bool speed_up = false;
    bool translate = false;
    bool no_fallback = false;
    bool print_special = false;
    bool no_context = true;
    bool no_timestamps = false;
@ -74,7 +73,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
    else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
    else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
    else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
    else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
    else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
    else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
@ -96,23 +94,22 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, " -h, --help [default] show this help message and exit\n");
    fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(stderr, " --step N [%-7d] audio step size in milliseconds\n", params.step_ms);
    fprintf(stderr, " --length N [%-7d] audio length in milliseconds\n", params.length_ms);
    fprintf(stderr, " --keep N [%-7d] audio to keep from previous step in ms\n", params.keep_ms);
    fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
    fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
    fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
    fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
    fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
    fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
    fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
    fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
    fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
    fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true");
    fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
    fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
    fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
    fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(stderr, " --step N [%-7d] audio step size in milliseconds\n", params.step_ms);
    fprintf(stderr, " --length N [%-7d] audio length in milliseconds\n", params.length_ms);
    fprintf(stderr, " --keep N [%-7d] audio to keep from previous step in ms\n", params.keep_ms);
    fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
    fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
    fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
    fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
    fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
    fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
    fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
    fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
    fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true");
    fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
    fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
    fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
    fprintf(stderr, "\n");
}

@ -151,7 +148,7 @@ int main(int argc, char ** argv) {

    // whisper init

    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1){
    if (whisper_lang_id(params.language.c_str()) == -1) {
        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
        whisper_print_usage(argc, argv, params);
        exit(0);
@ -300,8 +297,7 @@ int main(int argc, char ** argv) {
    wparams.speed_up = params.speed_up;

    // disable temperature fallback
    //wparams.temperature_inc = -1.0f;
    wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
    wparams.temperature_inc = -1.0f;

    wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
    wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();

1
examples/talk-llama/.gitignore
vendored
@ -1 +0,0 @@
audio.mp3
@ -1,16 +0,0 @@
if (WHISPER_SUPPORT_SDL2)
    # talk-llama
    set(TARGET talk-llama)
    #add_executable(${TARGET} talk-llama.cpp llama.cpp)
    #target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
    #target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})

    # TODO: this is temporary
    # need to export ggml symbols for MSVC, but too lazy ..
    add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)

    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})

    include(DefaultTargetOptions)
endif ()
@ -1,36 +0,0 @@
# talk-llama

Talk with an LLaMA AI in your terminal

[Demo Talk](https://user-images.githubusercontent.com/1991296/228024237-848f998c-c334-46a6-bef8-3271590da83b.mp4)

## Building

The `talk-llama` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:

```bash
# Install SDL2 on Linux
sudo apt-get install libsdl2-dev

# Install SDL2 on Mac OS
brew install sdl2

# Build the "talk-llama" executable
make talk-llama

# Run it
./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin -p "Georgi" -t 8
```

- The `-mw` argument specifies the Whisper model that you would like to use. `base` or `small` are recommended for a real-time experience
- The `-ml` argument specifies the LLaMA model that you would like to use. Read the instructions in https://github.com/ggerganov/llama.cpp for information about how to obtain a `ggml`-compatible LLaMA model

## TTS

For the best experience, this example needs a TTS tool to convert the generated text responses to voice.
You can use any TTS engine that you would like - simply edit the [speak.sh](speak.sh) script to your needs.
By default, it is configured to use macOS's `say`, but you can use whatever you wish.

## Discussion

If you have any feedback, please let "us" know in the following discussion: https://github.com/ggerganov/whisper.cpp/discussions/672?converting=1
@ -1,23 +0,0 @@
import sys
import importlib.util

api_key = "" #Write your https://beta.elevenlabs.io api key here
if not api_key:
    print("To use elevenlabs you have to register to https://beta.elevenlabs.io and add your elevenlabs api key to examples/talk-llama/eleven-labs.py")
    sys.exit()

if importlib.util.find_spec("elevenlabs") is None:
    print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
    sys.exit()

from elevenlabs import ElevenLabs
eleven = ElevenLabs(api_key)

# Get a Voice object, by name or UUID
voice = eleven.voices["Arnold"] #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh

# Generate the TTS
audio = voice.generate(str(sys.argv[2:]))

# Save the TTS to a file
audio.save("audio")
File diff suppressed because it is too large
@ -1,173 +0,0 @@
#ifndef LLAMA_H
#define LLAMA_H

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define LLAMA_API __declspec(dllexport)
#        else
#            define LLAMA_API __declspec(dllimport)
#        endif
#    else
#        define LLAMA_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define LLAMA_API
#endif

#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files

#ifdef __cplusplus
extern "C" {
#endif

    //
    // C interface
    //
    // TODO: show sample usage
    //

    struct llama_context;

    typedef int llama_token;

    typedef struct llama_token_data {
        llama_token id; // token id

        float p; // probability of the token
        float plog; // log probability of the token

    } llama_token_data;

    typedef void (*llama_progress_callback)(float progress, void *ctx);

    struct llama_context_params {
        int n_ctx; // text context
        int n_parts; // -1 for default
        int seed; // RNG seed, 0 for random

        bool f16_kv; // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap; // use mmap if possible
        bool use_mlock; // force system to keep model in RAM
        bool embedding; // embedding mode only

        // called with a progress value between 0 and 1, pass NULL to disable
        llama_progress_callback progress_callback;
        // context pointer passed to the progress callback
        void * progress_callback_user_data;
    };

    LLAMA_API struct llama_context_params llama_context_default_params();

    LLAMA_API bool llama_mmap_supported();
    LLAMA_API bool llama_mlock_supported();

    // Various functions for loading a ggml llama model.
    // Allocate (almost) all memory needed for the model.
    // Return NULL on failure
    LLAMA_API struct llama_context * llama_init_from_file(
            const char * path_model,
            struct llama_context_params params);

    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

    // TODO: not great API - very likely to change
    // Returns 0 on success
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
            int itype);

    // Returns the KV cache that will contain the context for the
    // ongoing prediction with the model.
    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);

    // Returns the size of the KV cache
    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);

    // Returns the number of tokens in the KV cache
    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);

    // Sets the KV cache containing the current context for the model
    LLAMA_API void llama_set_kv_cache(
            struct llama_context * ctx,
            const uint8_t * kv_cache,
            size_t n_size,
            int n_token_count);

    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
    // n_past is the number of tokens to use from previous eval calls
    // Returns 0 on success
    LLAMA_API int llama_eval(
            struct llama_context * ctx,
            const llama_token * tokens,
            int n_tokens,
            int n_past,
            int n_threads);

    // Convert the provided text into tokens.
    // The tokens pointer must be large enough to hold the resulting tokens.
    // Returns the number of tokens on success, no more than n_max_tokens
    // Returns a negative number on failure - the number of tokens that would have been returned
    // TODO: not sure if correct
    LLAMA_API int llama_tokenize(
            struct llama_context * ctx,
            const char * text,
            llama_token * tokens,
            int n_max_tokens,
            bool add_bos);

    LLAMA_API int llama_n_vocab(struct llama_context * ctx);
    LLAMA_API int llama_n_ctx (struct llama_context * ctx);
    LLAMA_API int llama_n_embd (struct llama_context * ctx);

    // Token logits obtained from the last call to llama_eval()
    // The logits for the last token are stored in the last row
    // Can be mutated in order to change the probabilities of the next token
    // Rows: n_tokens
    // Cols: n_vocab
    LLAMA_API float * llama_get_logits(struct llama_context * ctx);

    // Get the embeddings for the input
    // shape: [n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

    // Token Id -> String. Uses the vocabulary in the provided context
    LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);

    // Special tokens
    LLAMA_API llama_token llama_token_bos();
    LLAMA_API llama_token llama_token_eos();

    // TODO: improve the last_n_tokens interface ?
    LLAMA_API llama_token llama_sample_top_p_top_k(
            struct llama_context * ctx,
            const llama_token * last_n_tokens_data,
            int last_n_tokens_size,
            int top_k,
            float top_p,
            float temp,
            float repeat_penalty);

    // Performance information
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
    LLAMA_API void llama_reset_timings(struct llama_context * ctx);

    // Print system information
    LLAMA_API const char * llama_print_system_info(void);

#ifdef __cplusplus
}
#endif

#endif // LLAMA_H
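One detail of this header worth spelling out: per the comment above, `llama_tokenize` returns a negative number on failure whose magnitude is the token count that would have been produced, which enables a resize-and-retry pattern. A minimal sketch of that pattern, assuming the `llama.h` above is available and `ctx` was obtained from `llama_init_from_file()` (the helper name is hypothetical):

```cpp
// Resize-and-retry idiom implied by the llama_tokenize contract above
// (negative return = -(required token count)).
#include <string>
#include <vector>
#include "llama.h"

std::vector<llama_token> tokenize_with_retry(llama_context * ctx, const std::string & text, bool add_bos) {
    std::vector<llama_token> tokens(64); // deliberately small first guess

    int n = llama_tokenize(ctx, text.c_str(), tokens.data(), (int) tokens.size(), add_bos);
    if (n < 0) {
        tokens.resize(-n); // the magnitude tells us exactly how much space is needed
        n = llama_tokenize(ctx, text.c_str(), tokens.data(), (int) tokens.size(), add_bos);
    }
    tokens.resize(n > 0 ? n : 0);

    return tokens;
}
```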
@ -1,12 +0,0 @@
// Internal header to be included by llama.cpp and tests/benchmarks only.

#ifndef LLAMA_INTERNAL_H
#define LLAMA_INTERNAL_H

#include <vector>
#include <string>
struct ggml_tensor;

std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);

#endif // LLAMA_INTERNAL_H
@ -1,383 +0,0 @@
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.

#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H

#include <cstdio>
#include <cstdint>
#include <cerrno>
#include <cstring>
#include <cstdarg>
#include <cstdlib>
#include <climits>

#include <string>
#include <vector>

#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#endif
#endif
#endif

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <io.h>
#include <stdio.h> // for _fseeki64
#endif

#define LLAMA_ASSERT(x) \
    do { \
        if (!(x)) { \
            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
            abort(); \
        } \
    } while (0)

#ifdef __GNUC__
__attribute__((format(printf, 1, 2)))
#endif
static std::string format(const char * fmt, ...) {
    va_list ap, ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
    std::vector<char> buf(size + 1);
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    LLAMA_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
};
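`format()` uses the classic two-pass `vsnprintf` idiom: the first call with a NULL buffer only measures, and the `va_copy` is what makes the second, writing pass legal, since the first pass consumes the original `va_list`. For illustration, assuming the header above is included:

```cpp
// The first vsnprintf(NULL, 0, ...) pass measures; the second writes into
// a buffer of exactly size + 1 bytes (result + terminating NUL).
#include <string>

std::string describe_open_failure(const char * fname) {
    return format("failed to open %s: %s", fname, "No such file or directory");
    // -> "failed to open model.bin: No such file or directory" for fname = "model.bin"
}
```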

struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            throw format("failed to open %s: %s", fname, std::strerror(errno));
        }
        seek(0, SEEK_END);
        size = tell();
        seek(0, SEEK_SET);
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        LLAMA_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
            throw format("read error: %s", strerror(errno));
        }
        if (ret != 1) {
            throw std::string("unexpectedly reached end of file");
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    void write_raw(const void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        size_t ret = std::fwrite(ptr, size, 1, fp);
        if (ret != 1) {
            throw format("write error: %s", strerror(errno));
        }
    }

    void write_u32(std::uint32_t val) {
        write_raw(&val, sizeof(val));
    }

    ~llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};

#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
    LPSTR buf;
    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
    if (!size) {
        return "FormatMessageA failed";
    }
    std::string ret(buf, size);
    LocalFree(buf);
    return ret;
}
#endif

struct llama_mmap {
    void * addr;
    size_t size;

    llama_mmap(const llama_mmap &) = delete;

#ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;

    llama_mmap(struct llama_file * file) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
#ifdef __linux__
        flags |= MAP_POPULATE;
#endif
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        close(fd);
        if (addr == MAP_FAILED) {
            throw format("mmap failed: %s", strerror(errno));
        }

        // Advise the kernel to preload the mapped memory
        if (madvise(addr, file->size, MADV_WILLNEED)) {
            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                    strerror(errno));
        }
    }

    ~llama_mmap() {
        munmap(addr, size);
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    llama_mmap(struct llama_file * file) {
        size = file->size;

        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
        DWORD error = GetLastError();
        CloseHandle(hFile);

        if (hMapping == NULL) {
            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
        }

        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
        error = GetLastError();
        CloseHandle(hMapping);

        if (addr == NULL) {
            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
        }

        // Advise the kernel to preload the mapped memory
        WIN32_MEMORY_RANGE_ENTRY range;
        range.VirtualAddress = addr;
        range.NumberOfBytes = (SIZE_T)size;
        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }

    ~llama_mmap() {
        if (!UnmapViewOfFile(addr)) {
            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    llama_mmap(struct llama_file *) {
        throw std::string("mmap not supported");
    }
#endif
};

// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct llama_mlock {
    void * addr = NULL;
    size_t size = 0;
    bool failed_already = false;

    llama_mlock() {}
    llama_mlock(const llama_mlock &) = delete;

    ~llama_mlock() {
        if (size) {
            raw_unlock(addr, size);
        }
    }

    void init(void * addr) {
        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
        this->addr = addr;
    }

    void grow_to(size_t target_size) {
        LLAMA_ASSERT(addr);
        if (failed_already) {
            return;
        }
        size_t granularity = lock_granularity();
        target_size = (target_size + granularity - 1) & ~(granularity - 1);
        if (target_size > size) {
            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
                size = target_size;
            } else {
                failed_already = true;
            }
        }
    }

#ifdef _POSIX_MEMLOCK_RANGE
    static constexpr bool SUPPORTED = true;

    size_t lock_granularity() {
        return (size_t) sysconf(_SC_PAGESIZE);
    }

#ifdef __APPLE__
#define MLOCK_SUGGESTION \
    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif

    bool raw_lock(const void * addr, size_t size) {
        if (!mlock(addr, size)) {
            return true;
        } else {
            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
                    size, this->size, std::strerror(errno));
            return false;
        }
    }

#undef MLOCK_SUGGESTION

    void raw_unlock(void * addr, size_t size) {
        if (munlock(addr, size)) {
            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
        }
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    size_t lock_granularity() {
        SYSTEM_INFO si;
        GetSystemInfo(&si);
        return (size_t) si.dwPageSize;
    }

    bool raw_lock(void * addr, size_t size) {
        for (int tries = 1; ; tries++) {
            if (VirtualLock(addr, size)) {
                return true;
            }
            if (tries == 2) {
                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
                        size, this->size, llama_format_win_err(GetLastError()).c_str());
                return false;
            }

            // It failed but this was only the first try; increase the working
            // set size and try again.
            SIZE_T min_ws_size, max_ws_size;
            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
                return false;
            }
            // Per MSDN: "The maximum number of pages that a process can lock
            // is equal to the number of pages in its minimum working set minus
            // a small overhead."
            // Hopefully a megabyte is enough overhead:
            size_t increment = size + 1048576;
            // The minimum must be <= the maximum, so we need to increase both:
            min_ws_size += size;
            max_ws_size += size;
            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
                return false;
            }
        }
    }

    void raw_unlock(void * addr, size_t size) {
        if (!VirtualUnlock(addr, size)) {
            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    void raw_lock(const void * addr, size_t size) {
        fprintf(stderr, "warning: mlock not supported on this system\n");
    }

    void raw_unlock(const void * addr, size_t size) {}
#endif
};
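In `grow_to` above, the requested size is rounded up to a multiple of the lock granularity with the usual power-of-two mask trick, which is valid because page sizes are powers of two. A quick check of the arithmetic, assuming a 4 KiB page:

```cpp
// Round-up-to-page arithmetic from llama_mlock::grow_to above,
// e.g. with a 4096-byte page: 5000 -> 8192, and 4096 stays 4096.
#include <cassert>
#include <cstddef>

int main() {
    const size_t granularity = 4096; // assume a 4 KiB page
    size_t target_size = 5000;
    target_size = (target_size + granularity - 1) & ~(granularity - 1);
    assert(target_size == 8192);
    return 0;
}
```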

// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct llama_buffer {
    uint8_t * addr = NULL;
    size_t size = 0;

    void resize(size_t size) {
        delete[] addr;
        addr = new uint8_t[size];
        this->size = size;
    }

    ~llama_buffer() {
        delete[] addr;
    }
};
#endif
@ -1,23 +0,0 @@
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:

Write a text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}’s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{1} responds with short and concise answers.

### Response:

{0}{4} Hello, {1}!
{1}{4} Hello {0}! How may I help you today?
{0}{4} What time is it?
{1}{4} It is {2} o'clock.
{0}{4} What year is it?
{1}{4} We are in {3}.
{0}{4} What is a cat?
{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{0}{4} Name a color.
{1}{4} Blue
{0}{4}
@ -1,21 +0,0 @@
#!/bin/bash

# Usage:
#  speak.sh <voice_id> <text-to-speak>

# espeak
#  Mac OS: brew install espeak
#  Linux: apt-get install espeak
#
#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"

# for Mac
say "$2"

# Eleven Labs
# To use it, install the elevenlabs module from pip (pip install elevenlabs), register to https://beta.elevenlabs.io to get an api key and paste it in /examples/talk-llama/eleven-labs.py
#
#wd=$(dirname $0)
#script=$wd/eleven-labs.py
#python3 $script $1 "$2" >/dev/null 2>&1
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
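`speak.sh` takes a voice id and the text to speak, so any TTS backend can be substituted behind the same two-argument contract. For orientation, `talk-llama` shells out to it roughly as in the sketch below; the real call site sits in the large `talk-llama.cpp` hunk that follows and may assemble the command differently, so treat this as an assumption:

```cpp
// Hedged sketch of how talk-llama might invoke speak.sh. `speak_cmd`
// defaults to "./examples/talk-llama/speak.sh" (params.speak) and
// `voice_id` is the constant 2 seen later in the file.
#include <cstdlib>
#include <string>

void speak(const std::string & speak_cmd, int voice_id, const std::string & text) {
    const std::string cmd = speak_cmd + " " + std::to_string(voice_id) + " \"" + text + "\"";
    system(cmd.c_str()); // fire-and-forget; speak.sh decides which TTS engine runs
}
```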
@ -1,550 +0,0 @@
// Talk with AI
//

#include "common.h"
#include "common-sdl.h"
#include "whisper.h"
#include "llama.h"

#include <cassert>
#include <cstdio>
#include <fstream>
#include <regex>
#include <string>
#include <thread>
#include <vector>
#include <regex>

std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
    std::vector<llama_token> res(text.size() + (int)add_bos);
    int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
    assert(n >= 0);
    res.resize(n);

    return res;
}

// command-line parameters
struct whisper_params {
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t voice_ms = 10000;
    int32_t capture_id = -1;
    int32_t max_tokens = 32;
    int32_t audio_ctx = 0;

    int32_t n_parts_llama = -1;

    float vad_thold = 0.6f;
    float freq_thold = 100.0f;

    bool speed_up = false;
    bool translate = false;
    bool print_special = false;
    bool print_energy = false;
    bool no_timestamps = true;
    bool verbose_prompt = false;

    std::string person = "Georgi";
    std::string language = "en";
    std::string model_wsp = "models/ggml-base.en.bin";
    std::string model_llama = "models/ggml-llama-7B.bin";
    std::string speak = "./examples/talk-llama/speak.sh";
    std::string prompt = "";
    std::string fname_out;
};

void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

        if (arg == "-h" || arg == "--help") {
            whisper_print_usage(argc, argv, params);
            exit(0);
        }
        else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
        else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
        else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
        else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
        else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
        else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
        else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
        else if (arg == "--n-parts-llama") { params.n_parts_llama = std::stoi(argv[++i]); }
        else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
        else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
        else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
        else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
        else if (arg == "--verbose-prompt") { params.verbose_prompt = true; }
        else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
        else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
        else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
        else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
        else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
        else if (arg == "--prompt-file") {
            std::ifstream file(argv[++i]);
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
        }
        else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            exit(0);
        }
    }

    return true;
}

void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
    fprintf(stderr, "\n");
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, " -h, --help [default] show this help message and exit\n");
    fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
    fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
    fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
    fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
    fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
    fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
    fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
    fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
    fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
    fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
    fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
    fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
    fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
    fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
    fprintf(stderr, " --n-parts-llama N [%-7d] num parts in llama model file\n", params.n_parts_llama);
    fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
    fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
    fprintf(stderr, " --verbose-prompt [%-7s] print prompt at start\n", params.verbose_prompt ? "true" : "false");
    fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
    fprintf(stderr, "\n");
}

std::string transcribe(
        whisper_context * ctx,
        const whisper_params & params,
        const std::vector<float> & pcmf32,
        const std::string prompt_text,
        float & prob,
        int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

    prob = 0.0f;
    t_ms = 0;

    std::vector<whisper_token> prompt_tokens;

    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    prompt_tokens.resize(1024);
    prompt_tokens.resize(whisper_tokenize(ctx, prompt_text.c_str(), prompt_tokens.data(), prompt_tokens.size()));

    wparams.print_progress = false;
    wparams.print_special = params.print_special;
    wparams.print_realtime = false;
    wparams.print_timestamps = !params.no_timestamps;
    wparams.translate = params.translate;
    wparams.no_context = true;
    wparams.single_segment = true;
    wparams.max_tokens = params.max_tokens;
    wparams.language = params.language.c_str();
    wparams.n_threads = params.n_threads;

    wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
    wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();

    wparams.audio_ctx = params.audio_ctx;
    wparams.speed_up = params.speed_up;

    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
        return "";
    }

    int prob_n = 0;
    std::string result;

    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);

        result += text;

        const int n_tokens = whisper_full_n_tokens(ctx, i);
        for (int j = 0; j < n_tokens; ++j) {
            const auto token = whisper_full_get_token_data(ctx, i, j);

            prob += token.p;
            ++prob_n;
        }
    }

    if (prob_n > 0) {
        prob /= prob_n;
    }

    const auto t_end = std::chrono::high_resolution_clock::now();
    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();

    return result;
}

const std::string k_prompt_whisper = R"(A conversation with a person called {1}.)";

const std::string k_prompt_llama = R"(Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}’s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{1} responds with short and concise answers.

{0}{4} Hello, {1}!
{1}{4} Hello {0}! How may I help you today?
{0}{4} What time is it?
{1}{4} It is {2} o'clock.
{0}{4} What year is it?
{1}{4} We are in {3}.
{0}{4} What is a cat?
{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{0}{4} Name a color.
{1}{4} Blue
{0}{4})";

int main(int argc, char ** argv) {
    whisper_params params;

    if (whisper_params_parse(argc, argv, params) == false) {
        return 1;
    }

    if (whisper_lang_id(params.language.c_str()) == -1) {
        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
        whisper_print_usage(argc, argv, params);
        exit(0);
    }

    // whisper init

    struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());

    // llama init

    auto lparams = llama_context_default_params();

    // tune these to your liking
    lparams.n_ctx = 2048;
    lparams.seed = 1;
    lparams.f16_kv = true;
    lparams.n_parts = params.n_parts_llama;

    struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);

    // print some info about the processing
    {
        fprintf(stderr, "\n");

        if (!whisper_is_multilingual(ctx_wsp)) {
            if (params.language != "en" || params.translate) {
                params.language = "en";
                params.translate = false;
                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
            }
        }
        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
                __func__,
                params.n_threads,
                params.language.c_str(),
                params.translate ? "translate" : "transcribe",
                params.no_timestamps ? 0 : 1);

        fprintf(stderr, "\n");
    }


    // init audio

    audio_async audio(30*1000);
    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
        return 1;
    }

    audio.resume();

    int n_iter = 0;

    bool is_running = true;
    bool force_speak = false;

    float prob0 = 0.0f;

    const std::string chat_symb = ":";
    const std::string bot_name = "LLaMA";

    std::vector<float> pcmf32_cur;
    std::vector<float> pcmf32_prompt;

    const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", bot_name);

    // construct the initial prompt for LLaMA inference
    std::string prompt_llama = params.prompt.empty() ? k_prompt_llama : params.prompt;

    // need to have leading ' '
    prompt_llama.insert(0, 1, ' ');

    prompt_llama = ::replace(prompt_llama, "{0}", params.person);
    prompt_llama = ::replace(prompt_llama, "{1}", bot_name);

    {
        // get time string
        std::string time_str;
        {
            time_t t = time(0);
            struct tm * now = localtime(&t);
            char buf[128];
            strftime(buf, sizeof(buf), "%H:%M", now);
            time_str = buf;
        }
        prompt_llama = ::replace(prompt_llama, "{2}", time_str);
    }

    {
        // get year string
        std::string year_str;
        {
            time_t t = time(0);
            struct tm * now = localtime(&t);
            char buf[128];
            strftime(buf, sizeof(buf), "%Y", now);
            year_str = buf;
        }
        prompt_llama = ::replace(prompt_llama, "{3}", year_str);
    }

    prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);
|
||||
// evaluate the initial prompt
|
||||
|
||||
auto embd_inp = ::llama_tokenize(ctx_llama, prompt_llama, true);
|
||||
|
||||
printf("\n");
|
||||
printf("%s : initializing - please wait ...\n", __func__);
|
||||
|
||||
if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "%s", prompt_llama.c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
printf("%s : done! start speaking in the microphone\n", __func__);
|
||||
printf("\n");
|
||||
printf("%s%s", params.person.c_str(), chat_symb.c_str());
|
||||
fflush(stdout);
|
||||
|
||||
// clear audio buffer
|
||||
audio.clear();
|
||||
|
||||
// text inference variables
|
||||
const int voice_id = 2;
|
||||
const int n_keep = embd_inp.size();
|
||||
const int n_ctx = llama_n_ctx(ctx_llama);
|
||||
|
||||
int n_past = n_keep;
|
||||
int n_prev = 64; // TODO arg
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
// reverse prompts for detecting when it's time to stop speaking
|
||||
std::vector<std::string> antiprompts = {
|
||||
params.person + chat_symb,
|
||||
};
|
||||
|
||||
// main loop
|
||||
while (is_running) {
|
||||
// handle Ctrl + C
|
||||
is_running = sdl_poll_events();
|
||||
|
||||
if (!is_running) {
|
||||
break;
|
||||
}
|
||||
|
||||
// delay
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
|
||||
int64_t t_ms = 0;
|
||||
|
||||
{
|
||||
audio.get(2000, pcmf32_cur);
|
||||
|
||||
if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
|
||||
//fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
|
||||
|
||||
audio.get(params.voice_ms, pcmf32_cur);
|
||||
|
||||
std::string text_heard;
|
||||
|
||||
if (!force_speak) {
|
||||
text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
|
||||
}
|
||||
|
||||
// remove text between brackets using regex
|
||||
{
|
||||
std::regex re("\\[.*?\\]");
|
||||
text_heard = std::regex_replace(text_heard, re, "");
|
||||
}
|
||||
|
||||
// remove text between brackets using regex
|
||||
{
|
||||
std::regex re("\\(.*?\\)");
|
||||
text_heard = std::regex_replace(text_heard, re, "");
|
||||
}
|
||||
|
||||
// remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
|
||||
text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
|
||||
|
||||
// take first line
|
||||
text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
|
||||
|
||||
// remove leading and trailing whitespace
|
||||
text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
|
||||
text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
|
||||
|
||||
const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);
|
||||
|
||||
if (text_heard.empty() || tokens.empty() || force_speak) {
|
||||
//fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
|
||||
audio.clear();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
force_speak = false;
|
||||
|
||||
text_heard.insert(0, 1, ' ');
|
||||
text_heard += "\n" + bot_name + chat_symb;
|
||||
fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
|
||||
fflush(stdout);
|
||||
|
||||
embd = ::llama_tokenize(ctx_llama, text_heard, false);
|
||||
|
||||
// text inference
|
||||
bool done = false;
|
||||
std::string text_to_speak;
|
||||
while (true) {
|
||||
// predict
|
||||
if (embd.size() > 0) {
|
||||
if (n_past + (int) embd.size() > n_ctx) {
|
||||
n_past = n_keep;
|
||||
|
||||
// insert n_left/2 tokens at the start of embd from last_n_tokens
|
||||
embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
|
||||
|
||||
//printf("\n---\n");
|
||||
//printf("resetting: '");
|
||||
//for (int i = 0; i < (int) embd.size(); i++) {
|
||||
// printf("%s", llama_token_to_str(ctx_llama, embd[i]));
|
||||
//}
|
||||
//printf("'\n");
|
||||
//printf("\n---\n");
|
||||
}
|
||||
|
||||
if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
//printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());
|
||||
|
||||
embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
|
||||
n_past += embd.size();
|
||||
embd.clear();
|
||||
|
||||
if (done) break;
|
||||
|
||||
{
|
||||
// out of user input, sample next token
|
||||
const float top_k = 5;
|
||||
const float top_p = 0.80f;
|
||||
const float temp = 0.30f;
|
||||
const float repeat_penalty = 1.1764f;
|
||||
|
||||
const int repeat_last_n = 256;
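
                        // note (added): the low temperature and small top-k bias sampling
                        // towards the most likely tokens, matching the "short and concise
                        // answers" instruction in the prompt; repeat_last_n is the window
                        // of recent tokens the repeat penalty is applied over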

                        llama_token id = 0;

                        {
                            auto logits = llama_get_logits(ctx_llama);
                            logits[llama_token_eos()] = 0;

                            id = llama_sample_top_p_top_k(ctx_llama,
                                    embd_inp.data() + std::max(0, n_past - repeat_last_n),
                                    repeat_last_n, top_k, top_p, temp, repeat_penalty);
                        }

                        if (id != llama_token_eos()) {
                            // add it to the context
                            embd.push_back(id);

                            text_to_speak += llama_token_to_str(ctx_llama, id);

                            printf("%s", llama_token_to_str(ctx_llama, id));
                        }
                    }

                    {
                        std::string last_output;
                        for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
                            last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
                        }
                        last_output += llama_token_to_str(ctx_llama, embd[0]);

                        for (std::string & antiprompt : antiprompts) {
                            if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
                                done = true;
                                text_to_speak = ::replace(text_to_speak, antiprompt, "");
                                fflush(stdout);
                                break;
                            }
                        }
                    }

                    is_running = sdl_poll_events();

                    if (!is_running) {
                        break;
                    }
                }

                text_to_speak = ::replace(text_to_speak, "\"", "");
                system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());

                audio.clear();

                ++n_iter;
            }
        }
    }

    audio.pause();

    whisper_print_timings(ctx_wsp);
    whisper_free(ctx_wsp);

    llama_print_timings(ctx_llama);
    llama_free(ctx_llama);

    return 0;
}

@ -841,7 +841,6 @@ struct gpt2_context * gpt2_init(const char * path_model) {

    if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
        fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin");
        delete ctx;
        return nullptr;
    }

2
examples/talk/.gitignore
vendored
@ -1 +1 @@
audio.mp3
eleven-labs.py
@ -1,23 +0,0 @@
import sys
import importlib.util

api_key = "" #Write your https://beta.elevenlabs.io api key here
if not api_key:
    print("To use elevenlabs you have to register to https://beta.elevenlabs.io and add your elevenlabs api key to examples/talk/eleven-labs.py")
    sys.exit()

if importlib.util.find_spec("elevenlabs") is None:
    print("elevenlabs library is not installed, you can install it to your environment using 'pip install elevenlabs'")
    sys.exit()

from elevenlabs import ElevenLabs
eleven = ElevenLabs(api_key)

# Get a Voice object, by name or UUID
voice = eleven.voices["Arnold"] #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh

# Generate the TTS
audio = voice.generate(str(sys.argv[2:]))

# Save the TTS to a file
audio.save("audio")

@ -7,13 +7,9 @@
# Mac OS: brew install espeak
# Linux: apt-get install espeak
#
#espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"

# Mac OS "say" command
say "$2"
espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"

# Eleven Labs
# To use it, install the elevenlabs module from pip (pip install elevenlabs), register to https://beta.elevenlabs.io to get an api key and paste it in /examples/talk/eleven-labs.py
#
#wd=$(dirname $0)
#script=$wd/eleven-labs.py
@ -1,18 +1,15 @@
A sample SwiftUI app using [whisper.cpp](https://github.com/ggerganov/whisper.cpp/) to do voice-to-text transcriptions.
See also: [whisper.objc](https://github.com/ggerganov/whisper.cpp/tree/master/examples/whisper.objc).

**Usage**:
To use:

1. Select a model from the [whisper.cpp repository](https://github.com/ggerganov/whisper.cpp/tree/master/models).[^1]
2. Add the model to `whisper.swiftui.demo/Resources/models` **via Xcode**.
2. Add the model to "whisper.swiftui.demo/Resources/models" via Xcode.
3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
4. Add the sample audio file to `whisper.swiftui.demo/Resources/samples` **via Xcode**.
4. Add the model to "whisper.swiftui.demo/Resources/samples" via Xcode.
5. Select the "Release" [^2] build configuration under "Run", then deploy and run on your device.

**Note:** Pay attention to the folder path: `whisper.swiftui.demo/Resources/models` is the appropriate directory to place resources whilst `whisper.swiftui.demo/Models` is related to actual code.

[^1]: I recommend the tiny, base or small models for running on an iOS device.

[^2]: The `Release` build can boost transcription performance. In this project it also adds `-O3 -DNDEBUG` to `Other C Flags`, but adding flags at the app-project level is not ideal in real-world projects (it applies to all C/C++ files); consider splitting the xcodeproj into a workspace in your own project.

(screenshot of the whisper.swiftui demo app)
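
For CI or scripted builds, the app can also be built headlessly with `xcodebuild` (a sketch only - the scheme name and SDK below are assumptions, check the Xcode project for the exact values):

    xcodebuild -project examples/whisper.swiftui/whisper.swiftui.xcodeproj \
               -scheme WhisperCppDemo -configuration Release -sdk iphonesimulator build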

@ -64,10 +64,6 @@ for model in "${models[@]}"; do
        config="$config BLAS"
    fi

    if [[ $system_info == *"COREML = 1"* ]]; then
        config="$config COREML"
    fi

    commit=$(git rev-parse --short HEAD)

    printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"

164
ggml.h
@ -177,12 +177,11 @@ extern "C" {
#include <stddef.h>
#include <stdbool.h>

#define GGML_MAX_DIMS          4
#define GGML_MAX_NODES         4096
#define GGML_MAX_PARAMS        16
#define GGML_MAX_CONTEXTS      64
#define GGML_MAX_OPT           4
#define GGML_DEFAULT_N_THREADS 4
#define GGML_MAX_DIMS     4
#define GGML_MAX_NODES    4096
#define GGML_MAX_PARAMS   16
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT      4

#ifdef __ARM_NEON
// we use the built-in 16-bit float type
@ -199,14 +198,11 @@ struct ggml_object;
struct ggml_context;

enum ggml_type {
    // explicitly numbered values are used in llama.cpp files
    GGML_TYPE_F32  = 0,
    GGML_TYPE_F16  = 1,
    GGML_TYPE_Q4_0 = 2,
    GGML_TYPE_Q4_1 = 3,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
    GGML_TYPE_F16,
    GGML_TYPE_F32,
    GGML_TYPE_COUNT,
};

@ -230,15 +226,12 @@ enum ggml_op {
    GGML_OP_STEP,
    GGML_OP_RELU,
    GGML_OP_GELU,
    GGML_OP_SILU,
    GGML_OP_NORM, // normalize
    GGML_OP_RMS_NORM,

    GGML_OP_MUL_MAT,

    GGML_OP_SCALE,
    GGML_OP_CPY,
    GGML_OP_CONT,
    GGML_OP_RESHAPE,
    GGML_OP_VIEW,
    GGML_OP_PERMUTE,
@ -253,35 +246,19 @@ enum ggml_op {
    GGML_OP_FLASH_ATTN,
    GGML_OP_FLASH_FF,

    GGML_OP_MAP_UNARY,
    GGML_OP_MAP_BINARY,

    GGML_OP_COUNT,
};


// ggml object
struct ggml_object {
    size_t offs;
    size_t size;

    struct ggml_object * next;

    char padding[8];
};

static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);

// n-dimensional tensor
struct ggml_tensor {
    enum ggml_type type;

    int     n_dims;
    int64_t ne[GGML_MAX_DIMS]; // number of elements
    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                               // nb[0] = sizeof(type)
                               // nb[1] = nb[0]   * ne[0] + padding
                               // nb[i] = nb[i-1] * ne[i-1]
    int    ne[GGML_MAX_DIMS]; // number of elements
    size_t nb[GGML_MAX_DIMS]; // stride in bytes:
                              // nb[0] = sizeof(type)
                              // nb[1] = nb[0]   * ne[0] + padding
                              // nb[i] = nb[i-1] * ne[i-1]

    // compute data
    enum ggml_op op;
@ -335,7 +312,6 @@ struct ggml_init_params {
    // memory pool
    size_t mem_size;   // bytes
    void * mem_buffer; // if NULL, memory will be allocated internally
    bool   no_alloc;   // don't allocate memory for the tensor data
};

void ggml_time_init(void); // call this once at the beginning of the program
@ -347,13 +323,10 @@ int64_t ggml_cycles_per_ms(void);
void ggml_print_object (const struct ggml_object * obj);
void ggml_print_objects(const struct ggml_context * ctx);

int64_t ggml_nelements(const struct ggml_tensor * tensor);
size_t  ggml_nbytes   (const struct ggml_tensor * tensor);

int    ggml_blck_size (enum ggml_type type);
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
int    ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_nbytes   (const struct ggml_tensor * tensor);

size_t ggml_type_size   (enum ggml_type type);
size_t ggml_element_size(const struct ggml_tensor * tensor);

struct ggml_context * ggml_init(struct ggml_init_params params);
@ -367,33 +340,33 @@ struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    n_dims,
        const int64_t *ne);
        const int *ne);

struct ggml_tensor * ggml_new_tensor_1d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int64_t ne0);
        int    ne0);

struct ggml_tensor * ggml_new_tensor_2d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int64_t ne0,
        int64_t ne1);
        int    ne0,
        int    ne1);

struct ggml_tensor * ggml_new_tensor_3d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int64_t ne0,
        int64_t ne1,
        int64_t ne2);
        int    ne0,
        int    ne1,
        int    ne2);

struct ggml_tensor * ggml_new_tensor_4d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int64_t ne0,
        int64_t ne1,
        int64_t ne2,
        int64_t ne3);
        int    ne0,
        int    ne1,
        int    ne2,
        int    ne3);

struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
@ -493,20 +466,12 @@ struct ggml_tensor * ggml_gelu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_silu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// normalize along rows
// TODO: eps is hardcoded to 1e-5 for now
struct ggml_tensor * ggml_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_rms_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// A: m rows, n columns
// B: p rows, n columns (i.e. we transpose it internally)
// result is m columns, p rows
@ -531,11 +496,6 @@ struct ggml_tensor * ggml_cpy(
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

// make contiguous
struct ggml_tensor * ggml_cont(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// return view(a), b specifies the new shape
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape(
@ -548,43 +508,33 @@ struct ggml_tensor * ggml_reshape(
struct ggml_tensor * ggml_reshape_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t ne0,
        int64_t ne1);
        int    ne0,
        int    ne1);

// return view(a)
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape_3d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t ne0,
        int64_t ne1,
        int64_t ne2);
        int    ne0,
        int    ne1,
        int    ne2);

// offset in bytes
struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t ne0,
        int    ne0,
        size_t offset);

struct ggml_tensor * ggml_view_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t ne0,
        int64_t ne1,
        int    ne0,
        int    ne1,
        size_t nb1, // row stride in bytes
        size_t offset);

struct ggml_tensor * ggml_view_3d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t ne0,
        int64_t ne1,
        int64_t ne2,
        size_t nb1, // row stride in bytes
        size_t nb2, // slice stride in bytes
        size_t offset);

struct ggml_tensor * ggml_permute(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
@ -655,21 +605,6 @@ struct ggml_tensor * ggml_flash_ff(
        struct ggml_tensor  * c0,
        struct ggml_tensor  * c1);

// Mapping operations
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);

struct ggml_tensor * ggml_map_unary_f32(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        const  ggml_unary_op_f32_t fun);

struct ggml_tensor * ggml_map_binary_f32(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        const  ggml_binary_op_f32_t fun);

//
// automatic differentiation
//
@ -791,13 +726,6 @@ enum ggml_opt_result ggml_opt(
        struct ggml_opt_params params,
        struct ggml_tensor * f);

//
// quantization
//

size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);

//
// system info
//
@ -815,30 +743,6 @@ int ggml_cpu_has_blas(void);
int ggml_cpu_has_sse3(void);
int ggml_cpu_has_vsx(void);


//
// Internal types and functions exposed for tests and benchmarks
//

#ifdef __cplusplus
// restrict not standard in C++
#define GGML_RESTRICT
#else
#define GGML_RESTRICT restrict
#endif
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

typedef struct {
    dequantize_row_q_t dequantize_row_q;
    quantize_row_q_t   quantize_row_q;
    quantize_row_q_t   quantize_row_q_reference;
    vec_dot_q_t        vec_dot_q;
} quantize_fns_t;

quantize_fns_t ggml_internal_get_quantize_fn(size_t i);

#ifdef __cplusplus
}
#endif
@ -39,7 +39,6 @@ import json
import code
import torch
import numpy as np
import base64

#from transformers import GPTJForCausalLM
#from transformers import GPT2TokenizerFast
@ -225,14 +224,18 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f:
#code.interact(local=locals())

multilingual = hparams["n_vocab"] == 51865
tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")

#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
#print(tokenizer)
#print(tokenizer.name_or_path)
#print(len(tokenizer.additional_special_tokens))

# output in the same directory as the model
fname_out = dir_out + "/ggml-model.bin"

with open(tokenizer, "rb") as f:
    contents = f.read()
    tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f:
    tokens = json.load(f)

# use 16-bit or 32-bit floats
use_f16 = True
@ -268,8 +271,9 @@ byte_decoder = {v:k for k, v in byte_encoder.items()}
fout.write(struct.pack("i", len(tokens)))

for key in tokens:
    fout.write(struct.pack("i", len(key)))
    fout.write(key)
    text = bytearray([byte_decoder[c] for c in key])
    fout.write(struct.pack("i", len(text)))
    fout.write(text)

for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
@ -1,334 +0,0 @@
import argparse
import torch
import torch.nn.functional as F
import coremltools as ct

from torch import Tensor
from torch import nn
from typing import Dict
from typing import Optional
from ane_transformers.reference.layer_norm import LayerNormANE as LayerNormANEBase
from coremltools.models.neural_network.quantization_utils import quantize_weights
from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions
from whisper import load_model

# Use for changing dim of input in encoder and decoder embeddings
def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict,
                         missing_keys, unexpected_keys, error_msgs):
    """
    Unsqueeze twice to map nn.Linear weights to nn.Conv2d weights
    """
    for k in state_dict:
        is_attention = all(substr in k for substr in ['attn', '.weight'])
        is_mlp = any([k.endswith(s) for s in ['mlp.0.weight', 'mlp.2.weight']])

        if (is_attention or is_mlp) and len(state_dict[k].shape) == 2:
            state_dict[k] = state_dict[k][:, :, None, None]
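            # note (added): e.g. a (768, 768) nn.Linear weight becomes a
            # (768, 768, 1, 1) 1x1 nn.Conv2d kernel - the sizes are illustrative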


def correct_for_bias_scale_order_inversion(state_dict, prefix, local_metadata,
                                           strict, missing_keys,
                                           unexpected_keys, error_msgs):
    state_dict[prefix + 'bias'] = state_dict[prefix + 'bias'] / state_dict[prefix + 'weight']
    return state_dict
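
# note (added): an assumption based on the ane_transformers reference implementation -
# its LayerNormANE kernel adds the bias before applying the scale, so the checkpoint
# bias is pre-divided by the weight here to keep (x + b/w)*w == x*w + b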

class LayerNormANE(LayerNormANEBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._register_load_state_dict_pre_hook(
            correct_for_bias_scale_order_inversion)

class MultiHeadAttentionANE(MultiHeadAttention):
    def __init__(self, n_state: int, n_head: int):
        super().__init__(n_state, n_head)

        setattr(self, 'query', nn.Conv2d(n_state, n_state, kernel_size=1))
        setattr(self, 'key', nn.Conv2d(n_state, n_state, kernel_size=1, bias=False))
        setattr(self, 'value', nn.Conv2d(n_state, n_state, kernel_size=1))
        setattr(self, 'out', nn.Conv2d(n_state, n_state, kernel_size=1))

    def forward(self,
                x: Tensor,
                xa: Optional[Tensor] = None,
                mask: Optional[Tensor] = None,
                kv_cache: Optional[dict] = None):

        q = self.query(x)

        if kv_cache is None or xa is None or self.key not in kv_cache:
            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
            # otherwise, perform key/value projections for self- or cross-attention as usual.
            k = self.key(x if xa is None else xa)
            v = self.value(x if xa is None else xa)

        else:
            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
            k = kv_cache[self.key]
            v = kv_cache[self.value]

        wv, qk = self.qkv_attention_ane(q, k, v, mask)

        return self.out(wv), qk

    def qkv_attention_ane(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None):

        _, dim, _, seqlen = q.size()

        dim_per_head = dim // self.n_head

        scale = float(dim_per_head)**-0.5

        q = q * scale

        mh_q = q.split(dim_per_head, dim=1)
        mh_k = k.transpose(1,3).split(dim_per_head, dim=3)
        mh_v = v.split(dim_per_head, dim=1)

        mh_qk = [
            torch.einsum('bchq,bkhc->bkhq', [qi, ki])
            for qi, ki in zip(mh_q, mh_k)
        ]  # (batch_size, max_seq_length, 1, max_seq_length) * n_heads

        if mask is not None:
            for head_idx in range(self.n_head):
                mh_qk[head_idx] = mh_qk[head_idx] + mask[:, :seqlen, :, :seqlen]

        attn_weights = [aw.softmax(dim=1) for aw in mh_qk]  # (batch_size, max_seq_length, 1, max_seq_length) * n_heads
        attn = [torch.einsum('bkhq,bchk->bchq', wi, vi) for wi, vi in zip(attn_weights, mh_v)]  # (batch_size, dim_per_head, 1, max_seq_length) * n_heads
        attn = torch.cat(attn, dim=1)  # (batch_size, dim, 1, max_seq_length)

        return attn, torch.cat(mh_qk, dim=1).float().detach()


class ResidualAttentionBlockANE(ResidualAttentionBlock):
    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
        super().__init__(n_state, n_head, cross_attention)

        setattr(self, 'attn', MultiHeadAttentionANE(n_state, n_head))
        setattr(self, 'attn_ln', LayerNormANE(n_state))

        setattr(self, 'cross_attn', MultiHeadAttentionANE(n_state, n_head) if cross_attention else None)
        setattr(self, 'cross_attn_ln', LayerNormANE(n_state) if cross_attention else None)

        n_mlp = n_state * 4
        setattr(self, 'mlp', nn.Sequential(
            nn.Conv2d(n_state, n_mlp, kernel_size=1),
            nn.GELU(),
            nn.Conv2d(n_mlp, n_state, kernel_size=1)
        ))
        setattr(self, 'mlp_ln', LayerNormANE(n_state))


class AudioEncoderANE(AudioEncoder):
    def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__(n_mels, n_ctx, n_state, n_head, n_layer)

        setattr(self, 'blocks', nn.ModuleList(
            [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
        ))
        setattr(self, 'ln_post', LayerNormANE(n_state))

    def forward(self, x: Tensor):
        """
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        """
        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))

        assert x.shape[1:] == self.positional_embedding.shape[::-1], "incorrect audio shape"

        # Add positional embedding and add dummy dim for ANE
        x = (x + self.positional_embedding.transpose(0,1)).to(x.dtype).unsqueeze(2)

        for block in self.blocks:
            x = block(x)

        x = self.ln_post(x)

        # """
        # TODO:
        # I think we need to transpose the result here to make it fit whisper.cpp memory order.
        # However, even doing this, the results are still wrong. Kind of less wrong compared to
        # not transposing, but still wrong.

        # Also, I don't know why the original OpenAI implementation does not need to transpose

        # transpose to (batch_size, n_ctx, n_state)
        # x : torch.Tensor, shape = (batch_size, n_state, 1, n_ctx)

        # """
        # x = x.transpose(1,3)

        return x

class TextDecoderANE(TextDecoder):

    def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer)

        setattr(self, 'blocks', nn.ModuleList(
            [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
        ))
        setattr(self, 'ln', LayerNormANE(n_state))

    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
        """
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
            the encoded audio features to be attended on
        """
        offset = next(iter(kv_cache.values())).shape[3] if kv_cache else 0
        x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]]
        x = x.to(xa.dtype)

        # Reformat for ANE
        mask = self.mask[None, None, :, :].permute(0,3,1,2)
        x = x.transpose(1,2).unsqueeze(2)

        for block in self.blocks:
            x = block(x, xa, mask=mask, kv_cache=kv_cache)

        x = self.ln(x)

        # Reformat back from ANE
        x = x.permute(0,2,3,1).squeeze(0)

        # ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
        if self.token_embedding.weight.shape[0] == 51865:
            # split in 11 chunks - 4715 each
            splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0)
            logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
        else:
            # split in 12 chunks - 4322 each
            assert(self.token_embedding.weight.shape[0] == 51864)
            splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//12, dim=0)
            logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)

        return logits

class WhisperANE(Whisper):
    def __init__(self, dims: ModelDimensions):
        super().__init__(dims)

        setattr(self, 'encoder', AudioEncoderANE(
            self.dims.n_mels,
            self.dims.n_audio_ctx,
            self.dims.n_audio_state,
            self.dims.n_audio_head,
            self.dims.n_audio_layer,
        ))
        setattr(self, 'decoder', TextDecoderANE(
            self.dims.n_vocab,
            self.dims.n_text_ctx,
            self.dims.n_text_state,
            self.dims.n_text_head,
            self.dims.n_text_layer,
        ))

        self._register_load_state_dict_pre_hook(linear_to_conv2d_map)

    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]:
        return self.decoder(tokens, self.encoder(mel))

    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
        cache = {**cache} if cache is not None else {}
        hooks = []

        def save_to_cache(module, _, output):
            if module not in cache or output.shape[3] > self.decoder.positional_embedding.shape[0]:
                cache[module] = output  # save as-is, for the first token or cross attention
            else:
                cache[module] = torch.cat([cache[module], output], dim=3).detach()
            return cache[module]

        def install_hooks(layer: nn.Module):
            if isinstance(layer, MultiHeadAttentionANE):
                hooks.append(layer.key.register_forward_hook(save_to_cache))
                hooks.append(layer.value.register_forward_hook(save_to_cache))

        self.decoder.apply(install_hooks)
        return cache, hooks

def convert_encoder(hparams, model, quantize=False):
    model.eval()

    input_shape = (1, 80, 3000)
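    # note (added): 80 mel bins x 3000 frames = 30 s of audio at 100 frames/s,
    # i.e. exactly one whisper chunk - the encoder always sees a fixed-size input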
    input_data = torch.randn(input_shape)
    traced_model = torch.jit.trace(model, input_data)

    model = ct.convert(
        traced_model,
        convert_to=None if quantize else "mlprogram",  # convert will fail if weights are quantized, not sure why
        inputs=[ct.TensorType(name="logmel_data", shape=input_shape)],
        outputs=[ct.TensorType(name="output")],
        compute_units=ct.ComputeUnit.ALL
    )

    if quantize:
        model = quantize_weights(model, nbits=16)

    return model

def convert_decoder(hparams, model, quantize=False):
    model.eval()

    tokens_shape = (1, 1)
    audio_shape = (1, hparams.n_audio_state, 1, 1500)
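    # note (added): 1500 = 3000 mel frames halved by the encoder's stride-2 conv,
    # i.e. the number of audio positions the decoder cross-attends to per 30 s chunk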

    audio_data = torch.randn(audio_shape)
    token_data = torch.randint(50257, tokens_shape).long()
    traced_model = torch.jit.trace(model, (token_data, audio_data))

    model = ct.convert(
        traced_model,
        convert_to=None if quantize else "mlprogram",  # convert will fail if weights are quantized, not sure why
        inputs=[
            ct.TensorType(name="token_data", shape=tokens_shape, dtype=int),
            ct.TensorType(name="audio_data", shape=audio_shape)
        ]
    )

    if quantize:
        model = quantize_weights(model, nbits=16)

    return model


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large)", required=True)
    parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
    parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
    parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
    args = parser.parse_args()

    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large"]:
        raise ValueError("Invalid model name")

    whisper = load_model(args.model).cpu()
    hparams = whisper.dims
    print(hparams)

    if args.optimize_ane:
        whisperANE = WhisperANE(hparams).eval()
        whisperANE.load_state_dict(whisper.state_dict())

        encoder = whisperANE.encoder
        decoder = whisperANE.decoder
    else:
        encoder = whisper.encoder
        decoder = whisper.decoder

    # Convert encoder
    encoder = convert_encoder(hparams, encoder, quantize=args.quantize)
    encoder.save(f"models/coreml-encoder-{args.model}.mlpackage")

    if args.encoder_only is False:
        # Convert decoder
        decoder = convert_decoder(hparams, decoder, quantize=args.quantize)
        decoder.save(f"models/coreml-decoder-{args.model}.mlpackage")

    print("done converting")
@ -12,7 +12,7 @@ pfx="resolve/main/ggml"

# get the path of this script
function get_script_path() {
    if [ -x "$(command -v realpath)" ]; then
        echo "$(dirname "$(realpath "$0")")"
        echo "$(dirname $(realpath $0))"
    else
        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
        echo "$ret"
@ -1,29 +0,0 @@
#!/bin/bash
#
# This generates:
#   - coreml/whisper-encoder-impl.h and coreml/whisper-encoder-impl.m
#   - coreml/whisper-decoder-impl.h and coreml/whisper-decoder-impl.m
#

wd=$(dirname "$0")
cd "$wd/../"

python3 models/convert-whisper-to-coreml.py --model tiny.en

mv -v models/coreml-encoder-tiny.en.mlpackage models/whisper-encoder-impl.mlpackage
xcrun coremlc generate models/whisper-encoder-impl.mlpackage coreml/
mv coreml/whisper_encoder_impl.h coreml/whisper-encoder-impl.h
mv coreml/whisper_encoder_impl.m coreml/whisper-encoder-impl.m
sed -i '' 's/whisper_encoder_impl\.h/whisper-encoder-impl.h/g' coreml/whisper-encoder-impl.m
sed -i '' 's/whisper_encoder_impl\.m/whisper-encoder-impl.m/g' coreml/whisper-encoder-impl.m
sed -i '' 's/whisper_encoder_impl\.h/whisper-encoder-impl.h/g' coreml/whisper-encoder-impl.h

mv -v models/coreml-decoder-tiny.en.mlpackage models/whisper-decoder-impl.mlpackage
xcrun coremlc generate models/whisper-decoder-impl.mlpackage coreml/
mv coreml/whisper_decoder_impl.h coreml/whisper-decoder-impl.h
mv coreml/whisper_decoder_impl.m coreml/whisper-decoder-impl.m
sed -i '' 's/whisper_decoder_impl\.h/whisper-decoder-impl.h/g' coreml/whisper-decoder-impl.m
sed -i '' 's/whisper_decoder_impl\.m/whisper-decoder-impl.m/g' coreml/whisper-decoder-impl.m
sed -i '' 's/whisper_decoder_impl\.h/whisper-decoder-impl.h/g' coreml/whisper-decoder-impl.h

rm -rfv models/whisper-encoder-impl.mlpackage models/whisper-decoder-impl.mlpackage
@ -1,25 +0,0 @@
#!/bin/bash

# Usage: ./generate-coreml-model.sh <model-name>
if [ $# -eq 0 ]
then
    echo "No model name supplied"
    echo "Usage: ./generate-coreml-model.sh <model-name>"
    exit 1
fi

mname="$1"

wd=$(dirname "$0")
cd "$wd/../"

python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True

xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/
rm -rf models/ggml-${mname}-encoder.mlmodelc
mv -v models/coreml-encoder-${mname}.mlmodelc models/ggml-${mname}-encoder.mlmodelc

# TODO: decoder (sometime in the future maybe)
#xcrun coremlc compile models/whisper-decoder-${mname}.mlpackage models/
#rm -rf models/ggml-${mname}-decoder.mlmodelc
#mv -v models/coreml_decoder_${mname}.mlmodelc models/ggml-${mname}-decoder.mlmodelc

469
whisper.cpp
@ -221,14 +221,14 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
    { "su",  { 98,  "sundanese",     } },
};

static const size_t MB = 1ull*1024*1024;
static const size_t MB = 1024*1024;
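// note (added): the 1ull factor in the updated constant above forces 64-bit
// arithmetic, so multi-gigabyte size expressions built from MB (as used for the
// larger models) cannot overflow 32-bit int arithmetic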

static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
    { MODEL_TINY,   14ull*MB },
    { MODEL_BASE,   18ull*MB },
    { MODEL_SMALL,  28ull*MB },
    { MODEL_MEDIUM, 36ull*MB },
    { MODEL_LARGE,  44ull*MB },
    { MODEL_TINY,   12ull*MB },
    { MODEL_BASE,   15ull*MB },
    { MODEL_SMALL,  23ull*MB },
    { MODEL_MEDIUM, 31ull*MB },
    { MODEL_LARGE,  38ull*MB },
};

static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
@ -297,7 +297,6 @@ static const std::map<e_model, size_t> MEM_REQ_DECODE = {

struct whisper_mel {
    int n_len;
    int n_len_org;
    int n_mel;

    std::vector<float> data;
@ -590,7 +589,6 @@ struct whisper_state {

    int lang_id = 0; // english by default

    std::string path_model; // populated by whisper_init_from_file()
#ifdef WHISPER_USE_COREML
    whisper_coreml_context * ctx_coreml;
#endif
@ -663,11 +661,9 @@ static bool kv_cache_init(
        int n_ctx) {
    cache.buf.resize(mem_bytes);

    struct ggml_init_params params = {
        /*.mem_size   =*/ cache.buf.size(),
        /*.mem_buffer =*/ cache.buf.data(),
        /*.no_alloc   =*/ false,
    };
    struct ggml_init_params params;
    params.mem_size   = cache.buf.size();
    params.mem_buffer = cache.buf.data();

    cache.ctx = ggml_init(params);

@ -699,11 +695,9 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {

    WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));

    struct ggml_init_params params = {
        /*.mem_size   =*/ cache.buf.size(),
        /*.mem_buffer =*/ cache.buf.data(),
        /*.no_alloc   =*/ false,
    };
    struct ggml_init_params params;
    params.mem_size   = cache.buf.size();
    params.mem_buffer = cache.buf.data();

    cache.ctx = ggml_init(params);

@ -1041,11 +1035,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {

    // create the ggml context
    {
        struct ggml_init_params params = {
            /*.mem_size   =*/ wctx.model.buf->size(),
            /*.mem_buffer =*/ wctx.model.buf->data(),
            /*.no_alloc   =*/ false,
        };
        struct ggml_init_params params;
        params.mem_size   = wctx.model.buf->size();
        params.mem_buffer = wctx.model.buf->data();

        model.ctx = ggml_init(params);
        if (!model.ctx) {
@ -1294,7 +1286,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {

            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]);
                        __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]);
                return false;
            }

@ -1359,11 +1351,9 @@ static bool whisper_encode_internal(
    const int n_mels = hparams.n_mels;
    assert(mel_inp.n_mel == n_mels);

    struct ggml_init_params params = {
        /*.mem_size   =*/ wstate.buf_compute.size(),
        /*.mem_buffer =*/ wstate.buf_compute.data(),
        /*.no_alloc   =*/ false,
    };
    struct ggml_init_params params;
    params.mem_size   = wstate.buf_compute.size();
    params.mem_buffer = wstate.buf_compute.data();

    struct ggml_context * ctx0 = ggml_init(params);

@ -1519,7 +1509,8 @@ static bool whisper_encode_internal(
                            Vcur,
                            n_state/n_head, n_head, n_ctx),
                        1, 2, 0, 3),
                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)
                    );

            struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
#else
@ -1750,12 +1741,10 @@ static bool whisper_encode_internal(

        wstate.use_buf(ctx0, -1);

        Vcross = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));

        struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
        struct ggml_tensor * v = ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
                (   n_ctx)*ggml_element_size(wstate.kv_cross.v),
                (il*n_ctx)*ggml_element_size(wstate.kv_cross.v)*n_state);
        //struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
        //struct ggml_tensor * v = ggml_view_1d(ctx0, wstate.kv_cross.v, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.v)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
        struct ggml_tensor* k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
        struct ggml_tensor* v = ggml_view_1d(ctx0, wstate.kv_cross.v, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.v)*n_state)*(il*n_ctx));

        ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcross, k));
        ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
@ -1769,10 +1758,10 @@ static bool whisper_encode_internal(

    //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
    //        ggml_used_mem(ctx0)/1024.0/1024.0,
    //        wstate.get_buf_max_mem(0)/1024.0/1024.0,
    //        wstate.get_buf_max_mem(1)/1024.0/1024.0,
    //        wstate.get_buf_max_mem(2)/1024.0/1024.0,
    //        wstate.get_buf_max_mem(3)/1024.0/1024.0);
    //        wctx.get_buf_max_mem(0)/1024.0/1024.0,
    //        wctx.get_buf_max_mem(1)/1024.0/1024.0,
    //        wctx.get_buf_max_mem(2)/1024.0/1024.0,
    //        wctx.get_buf_max_mem(3)/1024.0/1024.0);

    ggml_free(ctx0);

@ -1823,11 +1812,9 @@ static bool whisper_decode_internal(

    //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);

    struct ggml_init_params params = {
        /*.mem_size   =*/ wstate.buf_compute.size(),
        /*.mem_buffer =*/ wstate.buf_compute.data(),
        /*.no_alloc   =*/ false,
    };
    struct ggml_init_params params;
    params.mem_size   = wstate.buf_compute.size();
    params.mem_buffer = wstate.buf_compute.data();

    struct ggml_context * ctx0 = ggml_init(params);

@ -1871,6 +1858,8 @@ static bool whisper_decode_internal(

        // self-attention
        {
            wstate.use_buf(ctx0, 1);

            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
                    layer.attn_q_w,
                    cur);
@ -1890,24 +1879,20 @@ static bool whisper_decode_internal(

            Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                    layer.attn_v_w,
                    cur);

            Vcur = ggml_add(ctx0,
                    ggml_repeat(ctx0,
                        layer.attn_v_b,
                        Vcur),
                    Vcur);

            // store key and value to memory
            {
                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                        layer.attn_v_w,
                        cur);

                Vcur = ggml_add(ctx0,
                        ggml_repeat(ctx0,
                            layer.attn_v_b,
                            Vcur),
                        Vcur);

                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, N));

                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_state,
                        (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + n_past*ggml_element_size(kv_self.v));
                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_state, (ggml_element_size(kv_self.v)*n_state)*(il*n_ctx + n_past));

                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
@ -1936,6 +1921,8 @@ static bool whisper_decode_internal(
            // K * Q
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            wstate.use_buf(ctx0, 0);

            //struct ggml_tensor * KQ_scaled =
            //    ggml_scale(ctx0,
            //            KQ,
@ -1944,16 +1931,22 @@ static bool whisper_decode_internal(

            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);

            wstate.use_buf(ctx0, 1);

            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

            struct ggml_tensor * V =
                ggml_view_3d(ctx0, kv_self.v,
                        n_past + N, n_state/n_head, n_head,
                        n_ctx*ggml_element_size(kv_self.v),
                        n_ctx*ggml_element_size(kv_self.v)*n_state/n_head,
                        il*n_ctx*ggml_element_size(kv_self.v)*n_state);
            wstate.use_buf(ctx0, 0);

            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
            struct ggml_tensor * V_trans =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
                            n_state/n_head, n_head, n_past + N),
                        1, 2, 0, 3);

            wstate.use_buf(ctx0, 1);

            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);

            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

@ -1988,6 +1981,8 @@ static bool whisper_decode_internal(

            cur = ggml_norm(ctx0, inpCA); // note: we use inpCA here

            wstate.use_buf(ctx0, 1);

            // cur = ln_0_w*cur + ln_0_b
            cur = ggml_add(ctx0,
                    ggml_mul(ctx0,
@ -1998,6 +1993,8 @@ static bool whisper_decode_internal(

        // cross-attention
        {
            wstate.use_buf(ctx0, 0);

            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
                    layer.cross_attn_q_w,
                    cur);
@ -2016,25 +2013,17 @@ static bool whisper_decode_internal(
                        ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*ggml_element_size(wstate.kv_cross.k)*n_state),
                        n_state/n_head, n_head, M);

            //struct ggml_tensor * Vcross =
            //    ggml_reshape_3d(ctx0,
            //            ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
            //            n_state/n_head, n_head, M);
            struct ggml_tensor * Vcross =
                ggml_reshape_3d(ctx0,
                        ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
                        n_state/n_head, n_head, M);

            //struct ggml_tensor * V_trans =
            //    ggml_cpy(ctx0,
            //            ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
            //            ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));

            struct ggml_tensor * V =
                ggml_view_3d(ctx0, wstate.kv_cross.v,
                        M, n_state/n_head, n_head,
                        M*ggml_element_size(wstate.kv_cross.v),
                        M*ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
                        il*M*ggml_element_size(wstate.kv_cross.v)*n_state);
            struct ggml_tensor * V_trans = ggml_permute(ctx0, Vcross, 1, 2, 0, 3);

            // ------

            wstate.use_buf(ctx0, 1);

            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        ggml_cpy(ctx0,
@ -2044,6 +2033,8 @@ static bool whisper_decode_internal(

            struct ggml_tensor * K = ggml_permute(ctx0, Kcross, 0, 2, 1, 3);

            wstate.use_buf(ctx0, 0);

            // K * Q
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

@ -2056,9 +2047,15 @@ static bool whisper_decode_internal(
            // no masking for cross-attention
            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

            wstate.use_buf(ctx0, 1);

            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);

            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
            wstate.use_buf(ctx0, 0);

            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);

            wstate.use_buf(ctx0, 1);

            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

@ -2190,10 +2187,10 @@ static bool whisper_decode_internal(
    if (N > 1) {
        //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
        //        ggml_used_mem(ctx0)/1024.0/1024.0,
        //        wstate.get_buf_max_mem(0)/1024.0/1024.0,
        //        wstate.get_buf_max_mem(1)/1024.0/1024.0,
        //        wstate.get_buf_max_mem(2)/1024.0/1024.0,
        //        wstate.get_buf_max_mem(3)/1024.0/1024.0);
        //        wctx.get_buf_max_mem(0)/1024.0/1024.0,
        //        wctx.get_buf_max_mem(1)/1024.0/1024.0,
        //        wctx.get_buf_max_mem(2)/1024.0/1024.0,
        //        wctx.get_buf_max_mem(3)/1024.0/1024.0);
    }

    ggml_free(ctx0);
@ -2301,72 +2298,6 @@ static void fft(const std::vector<float> & in, std::vector<float> & out) {
    }
}

static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float> & hann, const float * samples,
                                              int n_samples, int fft_size, int fft_step, int n_threads,
                                              const whisper_filters & filters, bool speed_up, whisper_mel & mel) {
    std::vector<float> fft_in(fft_size, 0.0);
    std::vector<float> fft_out(2 * fft_size);
    int n_fft = 1 + (speed_up ? fft_size / 4 : fft_size / 2);
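    // note (added): for a real-valued input of length fft_size, only fft_size/2 + 1
    // spectrum bins are unique (Hermitian symmetry); the speed_up path halves them again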
|
||||
|
||||
for (int i = ith; i < mel.n_len; i += n_threads) {
|
||||
const int offset = i * fft_step;
|
||||
|
||||
// apply Hanning window
|
||||
for (int j = 0; j < fft_size; j++) {
|
||||
if (offset + j < n_samples) {
|
||||
fft_in[j] = hann[j] * samples[offset + j];
|
||||
} else {
|
||||
fft_in[j] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
// FFT -> mag^2
|
||||
fft(fft_in, fft_out);
|
||||
|
||||
for (int j = 0; j < fft_size; j++) {
|
||||
fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
|
||||
}
|
||||
for (int j = 1; j < fft_size / 2; j++) {
|
||||
fft_out[j] += fft_out[fft_size - j];
|
||||
}
|
||||
|
||||
if (speed_up) {
|
||||
// scale down in the frequency domain results in a speed up in the time domain
|
||||
for (int j = 0; j < n_fft; j++) {
|
||||
fft_out[j] = 0.5 * (fft_out[2 * j] + fft_out[2 * j + 1]);
|
||||
}
|
||||
}
|
||||
|
||||
// mel spectrogram
|
||||
for (int j = 0; j < mel.n_mel; j++) {
|
||||
double sum = 0.0;
|
||||
|
||||
// unroll loop (suggested by GH user @lunixbochs)
|
||||
int k = 0;
|
||||
for (k = 0; k < n_fft - 3; k += 4) {
|
||||
sum +=
|
||||
fft_out[k + 0] * filters.data[j*n_fft + k + 0] +
|
||||
fft_out[k + 1] * filters.data[j*n_fft + k + 1] +
|
||||
fft_out[k + 2] * filters.data[j*n_fft + k + 2] +
|
||||
fft_out[k + 3] * filters.data[j*n_fft + k + 3];
|
||||
}
|
||||
|
||||
// handle n_fft remainder
|
||||
for (; k < n_fft; k++) {
|
||||
sum += fft_out[k] * filters.data[j * n_fft + k];
|
||||
}
|
||||
|
||||
if (sum < 1e-10) {
|
||||
sum = 1e-10;
|
||||
}
|
||||
|
||||
sum = log10(sum);
|
||||
|
||||
mel.data[j * mel.n_len + i] = sum;
|
||||
}
|
||||
}
|
||||
}

// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124
static bool log_mel_spectrogram(
        whisper_state & wstate,
@ -2389,48 +2320,85 @@ static bool log_mel_spectrogram(
        hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
    }

    mel.n_mel = n_mel;
    mel.n_len = n_samples/fft_step;
    mel.n_len_org = mel.n_len;

    std::vector<float> samples_padded;

    // pad audio with at least one extra chunk of zeros
    {
        const int pad = (100*WHISPER_CHUNK_SIZE)/2;

        if (mel.n_len % pad != 0) {
            mel.n_len = (mel.n_len/pad + 1)*pad;
        }
        mel.n_len += pad;

        samples_padded.resize(mel.n_len*fft_step);
        memcpy(samples_padded.data(), samples, n_samples*sizeof(float));
        memset(samples_padded.data() + n_samples, 0, (mel.n_len*fft_step - n_samples)*sizeof(float));

        samples = samples_padded.data();
    }
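
A worked example of the padding arithmetic above, assuming WHISPER_CHUNK_SIZE = 30, so pad = (100*30)/2 = 1500 mel frames (15 s of 10 ms hops); the input frame count here is hypothetical:

```cpp
#include <cstdio>

int main() {
    const int pad = (100*30)/2; // 1500
    int n_len = 2345;           // hypothetical unpadded frame count

    if (n_len % pad != 0) {
        n_len = (n_len/pad + 1)*pad; // round up to a multiple of pad -> 3000
    }
    n_len += pad;                    // at least one extra chunk of zeros -> 4500

    printf("padded n_len = %d\n", n_len); // prints 4500
    return 0;
}
```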

    mel.n_mel = n_mel;
    mel.n_len = (n_samples)/fft_step;
    mel.data.resize(mel.n_mel*mel.n_len);

    const int n_fft = 1 + (speed_up ? fft_size/4 : fft_size/2);

    //printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
    //printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);

    {
        std::vector<std::thread> workers(n_threads - 1);
        for (int iw = 0; iw < n_threads - 1; ++iw) {
            workers[iw] = std::thread(
                    log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), samples,
                    n_samples, fft_size, fft_step, n_threads,
                    std::cref(filters), speed_up, std::ref(mel));
        }
    std::vector<std::thread> workers(n_threads);
    for (int iw = 0; iw < n_threads; ++iw) {
        workers[iw] = std::thread([&](int ith) {
            std::vector<float> fft_in;
            fft_in.resize(fft_size);
            for (int i = 0; i < fft_size; i++) {
                fft_in[i] = 0.0;
            }

        // main thread
        log_mel_spectrogram_worker_thread(0, hann, samples, n_samples, fft_size, fft_step, n_threads, filters, speed_up, mel);
            std::vector<float> fft_out;
            fft_out.resize(2*fft_size);

        for (int iw = 0; iw < n_threads - 1; ++iw) {
            workers[iw].join();
        }
            for (int i = ith; i < mel.n_len; i += n_threads) {
                const int offset = i*fft_step;

                // apply Hanning window
                for (int j = 0; j < fft_size; j++) {
                    if (offset + j < n_samples) {
                        fft_in[j] = hann[j]*samples[offset + j];
                    } else {
                        fft_in[j] = 0.0;
                    }
                }

                // FFT -> mag^2
                fft(fft_in, fft_out);

                for (int j = 0; j < fft_size; j++) {
                    fft_out[j] = (fft_out[2*j + 0]*fft_out[2*j + 0] + fft_out[2*j + 1]*fft_out[2*j + 1]);
                }
                for (int j = 1; j < fft_size/2; j++) {
                    //if (i == 0) {
                    //    printf("%d: %f %f\n", j, fft_out[j], fft_out[fft_size - j]);
                    //}
                    fft_out[j] += fft_out[fft_size - j];
                }
                if (i == 0) {
                    //for (int j = 0; j < fft_size; j++) {
                    //    printf("%d: %e\n", j, fft_out[j]);
                    //}
                }

                if (speed_up) {
                    // scaling down in the frequency domain results in a speed-up in the time domain
                    for (int j = 0; j < n_fft; j++) {
                        fft_out[j] = 0.5*(fft_out[2*j] + fft_out[2*j + 1]);
                    }
                }

                // mel spectrogram
                for (int j = 0; j < mel.n_mel; j++) {
                    double sum = 0.0;

                    for (int k = 0; k < n_fft; k++) {
                        sum += fft_out[k]*filters.data[j*n_fft + k];
                    }
                    if (sum < 1e-10) {
                        sum = 1e-10;
                    }

                    sum = log10(sum);

                    mel.data[j*mel.n_len + i] = sum;
                }
            }
        }, iw);
    }

    for (int iw = 0; iw < n_threads; ++iw) {
        workers[iw].join();
    }
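
Both variants in this hunk stripe the mel rows across threads: worker ith handles rows ith, ith + n_threads, ..., so the index sets are disjoint and no locking is needed. A minimal sketch of that pattern with toy data (illustrative only):

```cpp
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const int n_threads = 4;
    const int n_rows    = 10;
    std::vector<int> owner(n_rows);

    std::vector<std::thread> workers(n_threads);
    for (int iw = 0; iw < n_threads; ++iw) {
        workers[iw] = std::thread([&](int ith) {
            // each worker touches a disjoint set of rows, so no locks
            for (int i = ith; i < n_rows; i += n_threads) {
                owner[i] = ith;
            }
        }, iw);
    }
    for (auto & w : workers) {
        w.join();
    }

    for (int i = 0; i < n_rows; i++) {
        printf("row %d -> thread %d\n", i, owner[i]);
    }
    return 0;
}
```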

    // clamping and normalization
@ -2454,8 +2422,6 @@ static bool log_mel_spectrogram(

    wstate.t_mel_us += ggml_time_us() - t_start_us;

    //printf("mel.n_len() = %d, divided by 1500: %f, n_samples / fft_step: %d\n", mel.n_len, mel.n_len / 1500.0, n_samples / fft_step);

    return true;
}

@ -2497,20 +2463,25 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
    int n = word.size();
    while (i < n) {
        int j = n;
        bool found = false;
        while (j > i) {
            auto sub = word.substr(i, j-i);
            auto it = vocab.token_to_id.find(sub);
            auto it = vocab.token_to_id.find(word.substr(i, j-i));
            if (it != vocab.token_to_id.end()) {
                tokens.push_back(it->second);
                i = j;
                found = true;
                break;
            }
            --j;
        }
        if (!found) {
            fprintf(stderr, "unknown token \n");
        if (i == n) {
            break;
        }
        if (j == i) {
            auto sub = word.substr(i, 1);
            if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
                tokens.push_back(vocab.token_to_id.at(sub));
            } else {
                fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
            }
            ++i;
        }
    }
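
Both sides of this hunk implement greedy longest-match tokenization: try the longest substring starting at i that is in the vocabulary, emit its id, and advance. A self-contained sketch with a toy vocabulary (names and data are illustrative, not whisper_vocab):

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    const std::map<std::string, int> token_to_id = {
        {"he", 1}, {"hell", 2}, {"hello", 3}, {"o", 4},
    };

    const std::string word = "helloo";
    std::vector<int> tokens;

    int i = 0;
    const int n = (int) word.size();
    while (i < n) {
        bool found = false;
        // longest match first: shrink the candidate until it is in the vocab
        for (int j = n; j > i; --j) {
            const auto it = token_to_id.find(word.substr(i, j - i));
            if (it != token_to_id.end()) {
                tokens.push_back(it->second);
                i = j;
                found = true;
                break;
            }
        }
        if (!found) {
            // skip one byte instead of looping forever, like the fallback above
            fprintf(stderr, "unknown byte '%c'\n", word[i]);
            ++i;
        }
    }

    for (const int t : tokens) {
        printf("%d ", t); // prints: 3 4
    }
    printf("\n");
    return 0;
}
```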
@ -2524,14 +2495,14 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
//

#ifdef WHISPER_USE_COREML
// replace .bin with -encoder.mlmodelc
static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
// replace .bin with .mlmodelc
static std::string whisper_get_coreml_path(std::string path_bin) {
    auto pos = path_bin.rfind('.');
    if (pos != std::string::npos) {
        path_bin = path_bin.substr(0, pos);
    }

    path_bin += "-encoder.mlmodelc";
    path_bin += ".mlmodelc";

    return path_bin;
}
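
The rename in this hunk also changes the produced path: the encoder-only Core ML model sits next to the ggml weights with an "-encoder.mlmodelc" suffix. A standalone copy of that variant, for illustration only (not the library function itself):

```cpp
#include <cstdio>
#include <string>

// "models/ggml-base.en.bin" -> "models/ggml-base.en-encoder.mlmodelc"
static std::string coreml_path_encoder(std::string path_bin) {
    const auto pos = path_bin.rfind('.');
    if (pos != std::string::npos) {
        path_bin = path_bin.substr(0, pos); // drop the ".bin" extension
    }
    return path_bin + "-encoder.mlmodelc";
}

int main() {
    printf("%s\n", coreml_path_encoder("models/ggml-base.en.bin").c_str());
    return 0;
}
```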
@ -2544,7 +2515,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {

    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) {
        fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
        delete state;
        return nullptr;
    }

@ -2555,7 +2525,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {

    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->wtype, ctx->model.hparams.n_audio_ctx)) {
        fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
        delete state;
        return nullptr;
    }

@ -2565,18 +2534,18 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
    }

#ifdef WHISPER_USE_COREML
    const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
    const auto path_coreml = whisper_get_coreml_path(ctx->path_model);

    fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
    fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);

    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
    if (!state->ctx_coreml) {
        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
        return nullptr;
    }
    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
    if (!state->ctx_coreml) {
        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
        return nullptr;
    }

    fprintf(stderr, "%s: Core ML model loaded\n", __func__);
    fprintf(stderr, "%s: Core ML model loaded\n", __func__);
#endif

    state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
@ -2746,11 +2715,6 @@ void whisper_free_state(struct whisper_state * state)
            kv_cache_free(state->decoders[i].kv_self);
        }

#ifdef WHISPER_USE_COREML
        whisper_coreml_free(state->ctx_coreml);
        state->ctx_coreml = nullptr;
#endif

        delete state;
    }
}
@ -2766,6 +2730,10 @@ void whisper_free(struct whisper_context * ctx) {

        whisper_free_state(ctx->state);

#ifdef WHISPER_USE_COREML
        whisper_coreml_free(ctx->state->ctx_coreml);
        ctx->state->ctx_coreml = nullptr;
#endif
        delete ctx;
    }
}
@ -2809,9 +2777,8 @@ int whisper_set_mel_with_state(
        return -1;
    }

    state->mel.n_len = n_len;
    state->mel.n_len_org = n_len;
    state->mel.n_mel = n_mel;
    state->mel.n_len = n_len;
    state->mel.n_mel = n_mel;

    state->mel.data.resize(n_len*n_mel);
    memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float));
@ -2937,8 +2904,8 @@ int whisper_lang_auto_detect_with_state(
        return -1;
    }

    if (seek >= state->mel.n_len_org) {
        fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
    if (seek >= state->mel.n_len) {
        fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len*10);
        return -2;
    }

@ -3073,11 +3040,11 @@ const char *whisper_model_type_readable(struct whisper_context * ctx) {
}

int whisper_n_len_from_state(struct whisper_state * state) {
    return state->mel.n_len_org;
    return state->mel.n_len;
}

int whisper_n_len(struct whisper_context * ctx) {
    return ctx->state->mel.n_len_org;
    return ctx->state->mel.n_len;
}

int whisper_n_vocab(struct whisper_context * ctx) {
@ -3173,14 +3140,6 @@ void whisper_reset_timings(struct whisper_context * ctx) {
    }
}

static int whisper_has_coreml(void) {
#ifdef WHISPER_USE_COREML
    return 1;
#else
    return 0;
#endif
}

const char * whisper_print_system_info(void) {
    static std::string s;

@ -3197,7 +3156,6 @@ const char * whisper_print_system_info(void) {
    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
    s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";

    return s.c_str();
}
@ -3231,7 +3189,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
        /*.speed_up =*/ false,
        /*.audio_ctx =*/ 0,

        /*.initial_prompt =*/ nullptr,
        /*.prompt_tokens =*/ nullptr,
        /*.prompt_n_tokens =*/ 0,

@ -3244,7 +3201,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
        /*.max_initial_ts =*/ 1.0f,
        /*.length_penalty =*/ -1.0f,

        /*.temperature_inc =*/ 0.4f,
        /*.temperature_inc =*/ 0.0f, // TODO: temporary disabled until improve performance
        /*.entropy_thold =*/ 2.4f,
        /*.logprob_thold =*/ -1.0f,
        /*.no_speech_thold =*/ 0.6f,
@ -3262,9 +3219,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
        /*.new_segment_callback =*/ nullptr,
        /*.new_segment_callback_user_data =*/ nullptr,

        /*.progress_callback =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,

        /*.encoder_begin_callback =*/ nullptr,
        /*.encoder_begin_callback_user_data =*/ nullptr,

@ -3276,13 +3230,13 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
        case WHISPER_SAMPLING_GREEDY:
            {
                result.greedy = {
                    /*.best_of =*/ 2, // TODO: increase to 5 when we speed-up batch decoding
                    /*.best_of =*/ 1,
                };
            } break;
        case WHISPER_SAMPLING_BEAM_SEARCH:
            {
                result.beam_search = {
                    /*.beam_size =*/ 2, // TODO: increase to 5 when we speed-up batch decoding
                    /*.beam_size =*/ 5,

                    /*.patience =*/ -1.0f,
                };
@ -3907,26 +3861,13 @@ int whisper_full_with_state(
        prompt_past.clear();
    }

    // prepare prompt
    {
        std::vector<whisper_token> prompt_tokens;

        // initial prompt
        if (!params.prompt_tokens && params.initial_prompt) {
            prompt_tokens.resize(1024);
            prompt_tokens.resize(whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size()));
            params.prompt_tokens = prompt_tokens.data();
            params.prompt_n_tokens = prompt_tokens.size();
        }

        // prepend the prompt tokens to the prompt_past
        if (params.prompt_tokens && params.prompt_n_tokens > 0) {
            // parse tokens from the pointer
            for (int i = 0; i < params.prompt_n_tokens; i++) {
                prompt_past.push_back(params.prompt_tokens[i]);
            }
            std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
    // prepend the prompt tokens to the prompt_past
    if (params.prompt_tokens && params.prompt_n_tokens > 0) {
        // parse tokens from the pointer
        for (int i = 0; i < params.prompt_n_tokens; i++) {
            prompt_past.push_back(params.prompt_tokens[i]);
        }
        std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
    }
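
In both variants the prompt tokens are appended to the back of prompt_past and then moved to the front with std::rotate. A toy sketch of that trick (illustrative data):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> prompt_past = {10, 11, 12}; // existing text context
    const std::vector<int> prompt = {1, 2};      // new prompt tokens

    for (const int t : prompt) {
        prompt_past.push_back(t);
    }
    // rotate the last prompt.size() elements to the front: 1 2 10 11 12
    std::rotate(prompt_past.begin(),
                prompt_past.end() - (int) prompt.size(),
                prompt_past.end());

    for (const int t : prompt_past) {
        printf("%d ", t);
    }
    printf("\n");
    return 0;
}
```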

    // overwrite audio_ctx, max allowed is hparams.n_audio_ctx
@ -3985,10 +3926,6 @@ int whisper_full_with_state(
                fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress_prev);
            }
        }
        if (params.progress_callback) {
            params.progress_callback(
                ctx, ctx->state, progress_prev, params.progress_callback_user_data);
        }

        // if only 1 second left, then stop
        if (seek + 100 >= seek_end) {
@ -4378,11 +4315,7 @@ int whisper_full_with_state(
        }

        // was the decoding successful for the current temperature?
        // do fallback only if:
        // - we are not at the last temperature
        // - we are not at the end of the audio (3 sec)
        if (it != (int) temperatures.size() - 1 &&
            seek_end - seek > 10*WHISPER_CHUNK_SIZE) {
        {
            bool success = true;

            const auto & decoder = state->decoders[best_decoder_id];
@ -4581,9 +4514,6 @@ int whisper_full_parallel(
        params_cur.new_segment_callback = nullptr;
        params_cur.new_segment_callback_user_data = nullptr;

        params_cur.progress_callback = nullptr;
        params_cur.progress_callback_user_data = nullptr;

        workers[i] = std::thread(whisper_full_with_state, ctx, states[i], std::move(params_cur), samples + start_samples, n_samples_cur);
    }

@ -4844,7 +4774,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
    struct ggml_init_params gparams = {
        /*.mem_size =*/ buf.size(),
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc =*/ false,
    };

    struct ggml_context * ctx0 = ggml_init(gparams);
12
whisper.h
12
whisper.h
@ -226,7 +226,7 @@ extern "C" {
    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
    // Returns the top language id or negative on failure
    // If not null, fills the lang_probs array with the probabilities of all languages
    // The array must be whisper_lang_max_id() + 1 in size
    // The array must be whispe_lang_max_id() + 1 in size
    // ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
    WHISPER_API int whisper_lang_auto_detect(
            struct whisper_context * ctx,
@ -297,7 +297,7 @@ extern "C" {

    // Available sampling strategies
    enum whisper_sampling_strategy {
        WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreedyDecoder
        WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreefyDecoder
        WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
    };

@ -306,9 +306,6 @@ extern "C" {
    // Use the whisper_full_...() functions to obtain the text segments
    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);

    // Progress callback
    typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);

    // Encoder begin callback
    // If not NULL, called before the encoder starts
    // If it returns false, the computation is aborted
@ -359,7 +356,6 @@ extern "C" {

        // tokens to provide to the whisper decoder as initial prompt
        // these are prepended to any existing text context from a previous call
        const char * initial_prompt;
        const whisper_token * prompt_tokens;
        int prompt_n_tokens;

@ -395,10 +391,6 @@ extern "C" {
        whisper_new_segment_callback new_segment_callback;
        void * new_segment_callback_user_data;

        // called on each progress update
        whisper_progress_callback progress_callback;
        void * progress_callback_user_data;

        // called each time before the encoder starts
        whisper_encoder_begin_callback encoder_begin_callback;
        void * encoder_begin_callback_user_data;